diff --git a/abi/snapshot.json b/abi/snapshot.json index fd1b81280..87ae45000 100644 --- a/abi/snapshot.json +++ b/abi/snapshot.json @@ -1,5 +1,5 @@ { - "abi_version": 15, + "abi_version": 16, "channel_buffers": { "data_offset": 72, "data_size": 65536, @@ -113,7 +113,7 @@ }, "host_adapter": { "manifest": { - "abi_version": 15, + "abi_version": 16, "channel_data_offset": 72, "channel_data_size": 65536, "channel_header_size": 72, @@ -1309,6 +1309,11 @@ "name": "kernel_set_cwd", "signature": "(i32,i32,i32) -> (i32)" }, + { + "kind": "func", + "name": "kernel_set_fd_pipe", + "signature": "(i32,i32) -> (i32)" + }, { "kind": "func", "name": "kernel_set_fork_exec", @@ -1349,6 +1354,11 @@ "name": "kernel_set_stdin_pipe", "signature": "(i32) -> (i32)" }, + { + "kind": "func", + "name": "kernel_set_stdio_pipe", + "signature": "(i32,i32) -> (i32)" + }, { "kind": "func", "name": "kernel_set_tid_address", diff --git a/crates/fork-instrument/src/call_graph.rs b/crates/fork-instrument/src/call_graph.rs index 980bf65ca..593de285f 100644 --- a/crates/fork-instrument/src/call_graph.rs +++ b/crates/fork-instrument/src/call_graph.rs @@ -348,6 +348,14 @@ const MAX_INDIRECT_DEPTH: u8 = 2; pub fn reaching_closure(module: &Module, seed: FunctionId) -> HashSet { let profiles = profile_functions(module); let table_targets = table_targets(module, &profiles); + let has_dynamic_linker_imports = module.imports.iter().any(|import| { + import.module == "env" + && matches!(import.kind, ImportKind::Function(_)) + && matches!( + import.name.as_str(), + "__wasm_dlopen" | "__wasm_dlsym" | "__wasm_dlclose" | "__wasm_dlerror" + ) + }); // Reverse direct-call graph: `callee -> set of callers`. 
let mut reverse_direct: HashMap> = HashMap::new(); @@ -411,6 +419,22 @@ pub fn reaching_closure(module: &Module, seed: FunctionId) -> HashSet = profiles + .iter() + .filter_map(|(caller, profile)| (!profile.indirect.is_empty()).then_some(*caller)) + .collect(); + for caller in dynamic_indirect_roots { + enqueue( + caller, + 1, + &mut best_indirect_depth, + &mut result, + &mut worklist, + ); + } + } + while let Some((g, indirect_depth)) = worklist.pop_front() { // (2) Direct-reverse: who calls g directly? if let Some(callers) = reverse_direct.get(&g) { diff --git a/crates/fork-instrument/tests/call_graph.rs b/crates/fork-instrument/tests/call_graph.rs index 4062105e4..c3159bb6e 100644 --- a/crates/fork-instrument/tests/call_graph.rs +++ b/crates/fork-instrument/tests/call_graph.rs @@ -388,6 +388,43 @@ fn passive_element_with_table_init_is_followed() { ); } +#[test] +fn dynamic_linker_indirect_call_is_conservative_fork_boundary() { + // A dlopen/dlsym-capable main module can have side-module functions + // inserted into its indirect function table by the host after static + // analysis. If such a side-module function calls fork(), the main-module + // frame above the call_indirect must be serializable even though the + // side-module target is not present in any static element segment. 
+ let wat = r#" + (module + (import "kernel" "kernel_fork" (func $fork (result i32))) + (import "env" "__wasm_dlsym" (func $dlsym (param i32 i32 i32) (result i32))) + (type $side_fn_ty (func (result i32))) + (table $t 1 funcref) + (func $dispatch_side_callback (export "dispatch_side_callback") (result i32) + i32.const 0 + call_indirect $t (type $side_fn_ty)) + (func $parent_frame (export "parent_frame") (result i32) + call $dispatch_side_callback) + (func $ordinary (export "ordinary") (result i32) + i32.const 7)) + "#; + let found = discover(wat); + assert!( + found.iter().any(|n| n == "dispatch_side_callback"), + "dynamic-linking call_indirect sites must be instrumented as potential \ + side-module fork boundaries; got {found:?}" + ); + assert!( + found.iter().any(|n| n == "parent_frame"), + "direct callers above a dynamic side-module dispatch must also be saved; got {found:?}" + ); + assert!( + !found.iter().any(|n| n == "ordinary"), + "unrelated dynamic-linking functions without call_indirect should stay out; got {found:?}" + ); +} + #[test] fn indirect_closure_allows_two_hops_but_does_not_cascade_forever() { // Models trampoline-shaped runtimes without allowing unbounded diff --git a/crates/kernel/src/fork.rs b/crates/kernel/src/fork.rs index 087c47e4b..e8786ed31 100644 --- a/crates/kernel/src/fork.rs +++ b/crates/kernel/src/fork.rs @@ -45,6 +45,8 @@ const MAX_ENV_VARS: u32 = 65536; const MAX_ARGV: u32 = 65536; const MAX_PATH_LEN: usize = 1048576; // 1 MiB const MAX_STRING_LEN: usize = 1048576; // 1 MiB +const INITIAL_EXEC_STATE_BUFFER_LEN: usize = 64 * 1024; +const MAX_EXEC_STATE_BUFFER_LEN: usize = 4 * 1024 * 1024; // ── Writer helper ─────────────────────────────────────────────────────────── @@ -1176,6 +1178,7 @@ pub fn deserialize_fork_state(buf: &[u8], child_pid: u32) -> Result Result) -> Result { + let mut sockets = SocketTable::new(); + if r.remaining() < 8 { + return Ok(sockets); + } + + use crate::socket::{SocketDomain, SocketInfo, SocketState, 
SocketType}; + let _total_slots = r.read_u32()? as usize; + let sock_count = r.read_u32()? as usize; + for _ in 0..sock_count { + let idx = r.read_u32()? as usize; + let domain = match r.read_u32()? { + 0 => SocketDomain::Unix, + 1 => SocketDomain::Inet, + 2 => SocketDomain::Inet6, + 3 => SocketDomain::Netlink, + _ => return Err(Errno::EINVAL), + }; + let sock_type = match r.read_u32()? { + 0 => SocketType::Stream, + 1 => SocketType::Dgram, + _ => return Err(Errno::EINVAL), + }; + let protocol = r.read_u32()?; + let state = match r.read_u32()? { + 0 => SocketState::Unbound, + 1 => SocketState::Bound, + 2 => SocketState::Listening, + 3 => SocketState::Connected, + 4 => SocketState::Closed, + _ => return Err(Errno::EINVAL), + }; + let peer_idx_raw = r.read_u32()?; + let peer_idx = if peer_idx_raw == 0xFFFFFFFF { + None + } else { + Some(peer_idx_raw as usize) + }; + let recv_raw = r.read_u32()?; + let recv_buf_idx = if recv_raw == 0xFFFFFFFF { + None + } else { + Some(recv_raw as usize) + }; + let send_raw = r.read_u32()?; + let send_buf_idx = if send_raw == 0xFFFFFFFF { + None + } else { + Some(send_raw as usize) + }; + let shut_rd = r.read_u32()? != 0; + let shut_wr = r.read_u32()? != 0; + let host_handle_raw = r.read_u32()?; + let host_net_handle = if host_handle_raw == 0xFFFFFFFF { + None + } else { + Some(host_handle_raw as i32) + }; + + let opt_count = r.read_u32()? as usize; + let mut options = Vec::new(); + for _ in 0..opt_count { + let level = r.read_u32()?; + let optname = r.read_u32()?; + let value = r.read_u32()?; + options.push((level, optname, value)); + } + + let mut bind_addr = [0u8; 4]; + bind_addr.copy_from_slice(r.read_bytes(4)?); + let bind_port = r.read_u32()? as u16; + let mut peer_addr = [0u8; 4]; + peer_addr.copy_from_slice(r.read_bytes(4)?); + let peer_port = r.read_u32()? as u16; + + let backlog_count = r.read_u32()? 
as usize; + for _ in 0..backlog_count { + let _ = r.read_u32()?; + } + + let mut sock = SocketInfo::new(domain, sock_type, protocol); + sock.state = state; + sock.peer_idx = peer_idx; + sock.recv_buf_idx = recv_buf_idx; + sock.send_buf_idx = send_buf_idx; + sock.shut_rd = shut_rd; + sock.shut_wr = shut_wr; + sock.host_net_handle = host_net_handle; + sock.options = options; + sock.bind_addr = bind_addr; + sock.bind_port = bind_port; + sock.peer_addr = peer_addr; + sock.peer_port = peer_port; + sock.global_pipes = r.read_u32()? != 0; + + let shared_backlog_raw = r.read_u32()?; + sock.shared_backlog_idx = if shared_backlog_raw == 0xFFFFFFFF { + None + } else { + Some(shared_backlog_raw as usize) + }; + + if r.remaining() >= 4 { + let bind_path_len = r.read_u32()?; + if bind_path_len != 0xFFFFFFFF { + sock.bind_path = Some(r.read_bytes(bind_path_len as usize)?.to_vec()); + } + } + if r.remaining() >= 4 { + let accept_wake_raw = r.read_u32()?; + sock.accept_wake_idx = if accept_wake_raw == 0xFFFFFFFF { + None + } else { + Some(accept_wake_raw) + }; + } + sockets.insert_at(idx, sock); + } + + Ok(sockets) +} + // ── Exec Serialize ────────────────────────────────────────────────────────── /// Serialize the process state into a binary buffer for exec. @@ -1378,6 +1504,26 @@ pub fn serialize_exec_state(proc: &Process, buf: &mut [u8]) -> Result Result, Errno> { + let mut len = INITIAL_EXEC_STATE_BUFFER_LEN; + + loop { + let mut buf = Vec::new(); + buf.resize(len, 0u8); + + match serialize_exec_state(proc, &mut buf) { + Ok(written) => { + buf.truncate(written); + return Ok(buf); + } + Err(Errno::ENOMEM) if len < MAX_EXEC_STATE_BUFFER_LEN => { + len = len.saturating_mul(2).min(MAX_EXEC_STATE_BUFFER_LEN); + } + Err(err) => return Err(err), + } + } +} + // ── Exec Deserialize ──────────────────────────────────────────────────────── /// Deserialize process state from an exec buffer. 
@@ -1583,6 +1729,7 @@ pub fn deserialize_exec_state(buf: &[u8], pid: u32) -> Result { is_session_leader, state: ProcessState::Running, exit_status: 0, + exit_signal: 0, fd_table, ofd_table, lock_table: LockTable::new(), @@ -1806,6 +1953,31 @@ mod tests { assert_eq!(restored.signals.pending, 0); } + #[test] + fn test_exec_state_grows_for_large_environment() { + let mut proc = Process::new(1); + proc.environ.clear(); + for i in 0..1200 { + let mut var = b"KDE_LONG_ENV_".to_vec(); + var.extend_from_slice(i.to_string().as_bytes()); + var.push(b'='); + var.extend(core::iter::repeat(b'x').take(80)); + proc.environ.push(var); + } + + let mut old_limit_buf = vec![0u8; 64 * 1024]; + assert_eq!( + serialize_exec_state(&proc, &mut old_limit_buf), + Err(Errno::ENOMEM), + ); + + let serialized = serialize_exec_state_with_growing_buffer(&proc).unwrap(); + assert!(serialized.len() > 64 * 1024); + + let restored = deserialize_exec_state(&serialized, 1).unwrap(); + assert_eq!(restored.environ, proc.environ); + } + #[test] fn test_exec_state_filters_cloexec_fds() { use wasm_posix_shared::fd_flags::FD_CLOEXEC; diff --git a/crates/kernel/src/lib.rs b/crates/kernel/src/lib.rs index e8f0271bb..5c076b6fb 100644 --- a/crates/kernel/src/lib.rs +++ b/crates/kernel/src/lib.rs @@ -78,6 +78,29 @@ pub fn current_time_secs() -> i64 { } } +// --------------------------------------------------------------------------- +// Kernel mode flag +// --------------------------------------------------------------------------- + +use core::sync::atomic::{AtomicU32, Ordering}; + +/// Kernel operating mode. +/// +/// - Mode 0 (default): Traditional per-process kernel. Blocking syscalls spin +/// or delegate to the host. +/// - Mode 1: Centralized kernel. Blocking syscalls return EAGAIN immediately +/// so the host JS event loop can handle waiting asynchronously. 
+static KERNEL_MODE: AtomicU32 = AtomicU32::new(0); + +#[inline] +pub fn is_centralized_mode() -> bool { + KERNEL_MODE.load(Ordering::Relaxed) != 0 +} + +pub fn set_kernel_mode(mode: u32) { + KERNEL_MODE.store(mode, Ordering::Relaxed); +} + #[cfg(any(target_arch = "wasm32", target_arch = "wasm64"))] mod wasm { use core::alloc::{GlobalAlloc, Layout}; diff --git a/crates/kernel/src/memory.rs b/crates/kernel/src/memory.rs index d63cb47ac..81535d610 100644 --- a/crates/kernel/src/memory.rs +++ b/crates/kernel/src/memory.rs @@ -133,9 +133,27 @@ impl MemoryManager { }); hint } else { - // Find first gap in [mmap_base, max_addr) that fits aligned_len. - // Mappings are kept sorted by address. - match self.find_gap(aligned_len) { + // Without MAP_FIXED, POSIX treats a non-null address as a hint: + // the implementation may choose another address, but if the + // hinted page-aligned range is free and inside the normal mmap + // arena, using it preserves standard allocator expectations. In + // particular, allocators commonly try to extend a mapping by + // issuing mmap(old_end, delta, ...) without MAP_FIXED so the call + // fails safely instead of clobbering an occupied range. + let hinted_addr = if hint != 0 + && hint & 0xFFFF == 0 + && hint >= self.mmap_base.max(self.program_break) + && !self.overlaps_brk_heap(hint, aligned_len) + && self.can_grow_at(hint, aligned_len) + { + Some(hint) + } else { + None + }; + + // Otherwise find the first gap in [mmap_base, max_addr) that fits + // aligned_len. Mappings are kept sorted by address. 
+ match hinted_addr.or_else(|| self.find_gap(aligned_len)) { Some(a) => a, None => return wasm_posix_shared::mmap::MAP_FAILED, } @@ -267,7 +285,11 @@ impl MemoryManager { if len == 0 { return false; } - let unmap_end = addr.saturating_add(len); + let aligned_len = match len.checked_add(0xFFFF) { + Some(v) => v & !0xFFFF, + None => return false, + }; + let unmap_end = addr.saturating_add(aligned_len); let mut found = false; let mut new_mappings: Vec = Vec::new(); @@ -650,6 +672,81 @@ mod tests { assert_eq!(addr2, addr + 0x10000); } + #[test] + fn test_mmap_gap_scan_merges_guest_and_reserved_regions() { + let mut mm = MemoryManager::new(); + let rw = PROT_READ | PROT_WRITE; + let anon = MAP_PRIVATE | MAP_ANONYMOUS; + let base = MemoryManager::MMAP_BASE; + + // Interleave a guest mapping and a host-reserved control range. The + // automatic mmap scan must consider both address-ordered streams and + // return the first real gap, not a range that is free in only one list. + assert_eq!( + mm.mmap_anonymous(base, 0x10000, rw, anon | MAP_FIXED), + base + ); + assert_eq!( + mm.reserve_host_region_at(base + 0x10000, 0x10000), + base + 0x10000 + ); + + let addr = mm.mmap_anonymous(0, 0x10000, rw, anon); + assert_eq!(addr, base + 0x20000); + } + + #[test] + fn test_mmap_non_fixed_honors_free_address_hint() { + let mut mm = MemoryManager::new(); + let rw = PROT_READ | PROT_WRITE; + let anon = MAP_PRIVATE | MAP_ANONYMOUS; + let base = MemoryManager::MMAP_BASE; + + assert_eq!(mm.mmap_anonymous(base, 0x10000, rw, anon | MAP_FIXED), base); + assert_eq!( + mm.mmap_anonymous(base + 0x20000, 0x10000, rw, anon | MAP_FIXED), + base + 0x20000 + ); + + // There is an earlier first-fit gap at base+0x10000, but a non-fixed + // hint at base+0x30000 is free. Prefer the usable hint instead of + // treating non-fixed hints as if they were NULL. 
+ assert_eq!( + mm.mmap_anonymous(base + 0x30000, 0x10000, rw, anon), + base + 0x30000 + ); + + // An occupied hint is only a hint; fall back to the ordinary first-fit + // address without replacing the existing mapping. + assert_eq!( + mm.mmap_anonymous(base + 0x20000, 0x10000, rw, anon), + base + 0x10000 + ); + assert!(mm.is_mapped(base + 0x20000)); + } + + #[test] + fn test_munmap_rounds_length_up_to_page() { + let mut mm = MemoryManager::new(); + let rw = PROT_READ | PROT_WRITE; + let anon = MAP_PRIVATE | MAP_ANONYMOUS; + + // SQLite sysfault.test allocates this size and later munmaps a + // different, still page-rounded length. Both must cover the same + // kernel mapping, otherwise a tiny tail fragment prevents reuse. + let requested_len = 0x1ea102e; + let munmap_len = 0x1ea2000; + + let addr = mm.mmap_anonymous(0, requested_len, rw, anon); + assert_ne!(addr, MAP_FAILED); + assert!(mm.munmap(addr, munmap_len)); + assert!(!mm.is_mapped(addr)); + assert!(!mm.is_mapped(addr + munmap_len)); + + let reused = mm.mmap_anonymous(0, requested_len, rw, anon); + assert_eq!(reused, addr); + } + #[test] fn test_brk() { let mut mm = MemoryManager::new(); diff --git a/crates/kernel/src/process.rs b/crates/kernel/src/process.rs index 1c7ecd0ab..d51934b75 100644 --- a/crates/kernel/src/process.rs +++ b/crates/kernel/src/process.rs @@ -511,7 +511,11 @@ pub struct Process { /// POSIX uses this flag (not `sid == pid`) to gate setpgid EPERM checks. pub is_session_leader: bool, pub state: ProcessState, + /// Low 8-bit status supplied to _exit()/exit_group() for normal exits. + /// POSIX wait status encoding keeps normal exit codes 0..255 distinct + /// from signal termination; `exit_signal != 0` records the latter. 
pub exit_status: i32, + pub exit_signal: u32, pub fd_table: FdTable, pub ofd_table: OfdTable, pub lock_table: LockTable, @@ -627,6 +631,7 @@ impl Process { is_session_leader: false, state: ProcessState::Running, exit_status: 0, + exit_signal: 0, fd_table, ofd_table, lock_table: LockTable::new(), diff --git a/crates/kernel/src/process_table.rs b/crates/kernel/src/process_table.rs index 337d1d8d5..8cbc37831 100644 --- a/crates/kernel/src/process_table.rs +++ b/crates/kernel/src/process_table.rs @@ -475,6 +475,7 @@ impl ProcessTable { limbo.is_session_leader = proc.is_session_leader; limbo.state = ProcessState::Limbo; limbo.exit_status = proc.exit_status; + limbo.exit_signal = proc.exit_signal; limbo.cwd = proc.cwd.clone(); limbo.environ = proc.environ.clone(); limbo.argv = proc.argv.clone(); @@ -858,7 +859,8 @@ impl ProcessTable { pub fn mark_process_signaled(&mut self, pid: u32, signum: u32) -> Result<(), Errno> { let proc = self.processes.get_mut(&pid).ok_or(Errno::ESRCH)?; proc.state = ProcessState::Exited; - proc.exit_status = 128 + signum as i32; + proc.exit_status = 0; + proc.exit_signal = signum & 0x7f; Ok(()) } @@ -888,7 +890,7 @@ impl ProcessTable { if child.state == ProcessState::Exited { return Ok(Some(( child_pid, - Self::wait_status_from_exit_status(child.exit_status), + Self::wait_status_from_process(child), ))); } } @@ -929,11 +931,11 @@ impl ProcessTable { child.pgid == target_pgid } - fn wait_status_from_exit_status(exit_status: i32) -> i32 { - if exit_status >= 128 { - (exit_status - 128) & 0x7f + fn wait_status_from_process(proc: &Process) -> i32 { + if proc.exit_signal != 0 { + (proc.exit_signal as i32) & 0x7f } else { - (exit_status & 0xff) << 8 + (proc.exit_status & 0xff) << 8 } } } @@ -947,7 +949,7 @@ mod wait_tests { let mut table = ProcessTable::new(); let first_pid = table.allocate_spawn_pid(); - table.processes.insert(first_pid, Process::new(first_pid)); + table.processes.insert(first_pid, *Process::new_boxed(first_pid)); 
table.processes.remove(&first_pid); let second_pid = table.allocate_spawn_pid(); @@ -1089,6 +1091,23 @@ mod tests { ); } + #[test] + fn poll_waitable_child_preserves_high_normal_exit_status() { + let mut table = ProcessTable::new(); + table.create_process(10).unwrap(); + table.create_process(11).unwrap(); + let child = table.processes.get_mut(&11).unwrap(); + child.ppid = 10; + child.state = ProcessState::Exited; + child.exit_status = 255; + child.exit_signal = 0; + + assert_eq!( + table.poll_waitable_child(10, -1).unwrap(), + Some((11, 255 << 8)) + ); + } + #[test] fn poll_waitable_child_encodes_signal_status() { let mut table = ProcessTable::new(); diff --git a/crates/kernel/src/socket.rs b/crates/kernel/src/socket.rs index 674cb6495..4b3690dec 100644 --- a/crates/kernel/src/socket.rs +++ b/crates/kernel/src/socket.rs @@ -544,6 +544,10 @@ impl SocketTable { /// A pending TCP connection waiting in a shared accept queue. pub struct PendingConnection { pub peer_addr: [u8; 4], + /// IPv6 peer address when this pending connection originated from an + /// AF_INET6 client. `None` on an AF_INET6 listener means the peer was an + /// IPv4 client and must be exposed as an IPv4-mapped IPv6 address. + pub peer_addr6: Option<[u8; 16]>, pub peer_port: u16, /// Recv pipe index (in the global pipe table). Host writes incoming /// TCP data here; the accepting process reads from it. diff --git a/crates/kernel/src/syscalls.rs b/crates/kernel/src/syscalls.rs index 2a476f8dc..28a48f0bb 100644 --- a/crates/kernel/src/syscalls.rs +++ b/crates/kernel/src/syscalls.rs @@ -5452,7 +5452,8 @@ pub fn sys_exit(proc: &mut Process, host: &mut dyn HostIO, status: i32) { fallback_lock_table(proc).remove_all_for_pid(pid); proc.state = ProcessState::Exited; - proc.exit_status = status; + proc.exit_status = status & 0xff; + proc.exit_signal = 0; } /// Get the current time from the specified clock. 
@@ -6215,11 +6216,21 @@ fn is_loopback_addr(addr: [u8; 4]) -> bool { addr[0] == 127 } -fn is_loopback_addr6(addr: [u8; 16]) -> bool { +pub(crate) fn is_loopback_addr6(addr: [u8; 16]) -> bool { addr == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] } -fn is_unspecified_addr6(addr: [u8; 16]) -> bool { +pub(crate) fn loopback_addr6() -> [u8; 16] { + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] +} + +pub(crate) fn ipv4_mapped_addr6(addr: [u8; 4]) -> [u8; 16] { + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, addr[0], addr[1], addr[2], addr[3], + ] +} + +pub(crate) fn is_unspecified_addr6(addr: [u8; 16]) -> bool { addr == [0; 16] } @@ -8016,10 +8027,26 @@ pub fn sys_accept(proc: &mut Process, _host: &mut dyn HostIO, fd: i32) -> Result accepted.state = SocketState::Connected; accepted.recv_buf_idx = Some(pc.recv_pipe_idx); accepted.send_buf_idx = Some(pc.send_pipe_idx); - accepted.bind_addr = bind_addr; - accepted.bind_port = bind_port; - accepted.peer_addr = pc.peer_addr; - accepted.peer_port = pc.peer_port; + match domain { + SocketDomain::Inet => { + accepted.bind_addr = bind_addr; + accepted.bind_port = bind_port; + accepted.peer_addr = pc.peer_addr; + accepted.peer_port = pc.peer_port; + } + SocketDomain::Inet6 => { + accepted.bind_addr6 = bind_addr6; + accepted.bind_port = bind_port; + accepted.peer_addr6 = pc + .peer_addr6 + .unwrap_or_else(|| ipv4_mapped_addr6(pc.peer_addr)); + accepted.peer_port = pc.peer_port; + } + SocketDomain::Unix => { + accepted.bind_path = bind_path; + } + SocketDomain::Netlink => return Err(Errno::EOPNOTSUPP), + } accepted.global_pipes = true; let accepted_sock_idx = proc.sockets.alloc(accepted); let host_handle = -((accepted_sock_idx as i64) + 1); @@ -8164,8 +8191,13 @@ pub fn sys_connect( if sock.sock_type != SocketType::Stream { return Err(Errno::EOPNOTSUPP); } - let (ip6, port) = parse_sockaddr_in6(addr)?; - if !(is_loopback_addr6(ip6) || is_unspecified_addr6(ip6)) { + let (raw_ip6, port) = parse_sockaddr_in6(addr)?; + let ip6 = 
if is_unspecified_addr6(raw_ip6) { + loopback_addr6() + } else { + raw_ip6 + }; + if !is_loopback_addr6(ip6) { return Err(Errno::EADDRNOTAVAIL); } @@ -8188,61 +8220,100 @@ pub fn sys_connect( } } } - let listener_idx = listener_idx.ok_or(Errno::ECONNREFUSED)?; - - let (pipe_a_idx, pipe_b_idx) = - proc.alloc_pipe_pair(PipeBuffer::new(65536), PipeBuffer::new(65536)); + if let Some(listener_idx) = listener_idx { + let (pipe_a_idx, pipe_b_idx) = + proc.alloc_pipe_pair(PipeBuffer::new(65536), PipeBuffer::new(65536)); - let client_sock = proc.sockets.get(sock_idx).ok_or(Errno::EBADF)?; - let client_addr6 = if is_unspecified_addr6(client_sock.bind_addr6) { - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] - } else { - client_sock.bind_addr6 - }; - let mut client_port = client_sock.bind_port; - if client_port == 0 { - client_port = proc.next_ephemeral_port; - proc.next_ephemeral_port = proc.next_ephemeral_port.wrapping_add(1); - if proc.next_ephemeral_port == 0 { - proc.next_ephemeral_port = 49152; + let client_sock = proc.sockets.get(sock_idx).ok_or(Errno::EBADF)?; + let client_addr6 = if is_unspecified_addr6(client_sock.bind_addr6) { + loopback_addr6() + } else { + client_sock.bind_addr6 + }; + let mut client_port = client_sock.bind_port; + if client_port == 0 { + client_port = proc.next_ephemeral_port; + proc.next_ephemeral_port = proc.next_ephemeral_port.wrapping_add(1); + if proc.next_ephemeral_port == 0 { + proc.next_ephemeral_port = 49152; + } } - } - let listener = proc.sockets.get(listener_idx).ok_or(Errno::EBADF)?; - let mut accepted_sock = SocketInfo::new(SocketDomain::Inet6, SocketType::Stream, 0); - accepted_sock.state = SocketState::Connected; - accepted_sock.recv_buf_idx = Some(pipe_a_idx); - accepted_sock.send_buf_idx = Some(pipe_b_idx); - accepted_sock.bind_addr6 = listener.bind_addr6; - accepted_sock.bind_port = listener.bind_port; - accepted_sock.peer_addr6 = client_addr6; - accepted_sock.peer_port = client_port; - let accepted_idx = 
proc.sockets.alloc(accepted_sock); + let listener = proc.sockets.get(listener_idx).ok_or(Errno::EBADF)?; + let mut accepted_sock = SocketInfo::new(SocketDomain::Inet6, SocketType::Stream, 0); + accepted_sock.state = SocketState::Connected; + accepted_sock.recv_buf_idx = Some(pipe_a_idx); + accepted_sock.send_buf_idx = Some(pipe_b_idx); + accepted_sock.bind_addr6 = listener.bind_addr6; + accepted_sock.bind_port = listener.bind_port; + accepted_sock.peer_addr6 = client_addr6; + accepted_sock.peer_port = client_port; + let accepted_idx = proc.sockets.alloc(accepted_sock); + + let listener = proc.sockets.get_mut(listener_idx).ok_or(Errno::EBADF)?; + listener.listen_backlog.push(accepted_idx); + let accept_wake_idx = listener.accept_wake_idx; - let listener = proc.sockets.get_mut(listener_idx).ok_or(Errno::EBADF)?; - listener.listen_backlog.push(accepted_idx); - let accept_wake_idx = listener.accept_wake_idx; + let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; + client.send_buf_idx = Some(pipe_a_idx); + client.recv_buf_idx = Some(pipe_b_idx); + client.state = SocketState::Connected; + client.peer_addr6 = ip6; + client.peer_port = port; + client.peer_idx = Some(accepted_idx); + if client.bind_port == 0 { + client.bind_port = client_port; + client.bind_addr6 = loopback_addr6(); + } - let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; - client.send_buf_idx = Some(pipe_a_idx); - client.recv_buf_idx = Some(pipe_b_idx); - client.state = SocketState::Connected; - client.peer_addr6 = ip6; - client.peer_port = port; - client.peer_idx = Some(accepted_idx); - if client.bind_port == 0 { - client.bind_port = client_port; - client.bind_addr6 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]; - } + let accepted = proc.sockets.get_mut(accepted_idx).ok_or(Errno::EBADF)?; + accepted.peer_idx = Some(sock_idx); - let accepted = proc.sockets.get_mut(accepted_idx).ok_or(Errno::EBADF)?; - accepted.peer_idx = Some(sock_idx); + if let Some(idx) = accept_wake_idx 
{ + crate::wakeup::push_accept(idx); + } - if let Some(idx) = accept_wake_idx { - crate::wakeup::push_accept(idx); + return Ok(()); } - return Ok(()); + let net_handle = sock_idx as i32; + if sock.state != SocketState::Connecting { + host.host_net_connect(net_handle, &[127, 0, 0, 1], port)?; + let client_sock = proc.sockets.get(sock_idx).ok_or(Errno::EBADF)?; + let mut client_port = client_sock.bind_port; + if client_port == 0 { + client_port = proc.next_ephemeral_port; + proc.next_ephemeral_port = proc.next_ephemeral_port.wrapping_add(1); + if proc.next_ephemeral_port == 0 { + proc.next_ephemeral_port = 49152; + } + } + let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; + client.state = SocketState::Connecting; + client.host_net_handle = Some(net_handle); + if client.bind_port == 0 { + client.bind_port = client_port; + client.bind_addr6 = loopback_addr6(); + } + } + return match host.host_net_connect_status(net_handle) { + Ok(()) => { + let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; + client.state = SocketState::Connected; + client.host_net_handle = Some(net_handle); + client.peer_addr6 = ip6; + client.peer_port = port; + client.connect_error = 0; + Ok(()) + } + Err(Errno::EAGAIN) => Err(Errno::EAGAIN), + Err(e) => { + let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; + client.state = SocketState::Closed; + client.connect_error = e as u32; + Err(e) + } + }; } // Parse sockaddr_in: family(2) + port(2 big-endian) + addr(4) @@ -8477,33 +8548,55 @@ pub fn sys_connect( let pipe_a_idx = pipe_table.alloc(PipeBuffer::new(65536)); let pipe_b_idx = pipe_table.alloc(PipeBuffer::new(65536)); - // Create accepted socket (server side) - let mut accepted_sock = SocketInfo::new(SocketDomain::Unix, SocketType::Stream, 0); - accepted_sock.state = SocketState::Connected; - accepted_sock.recv_buf_idx = Some(pipe_a_idx); // reads client's writes - accepted_sock.send_buf_idx = Some(pipe_b_idx); // writes to client's reads - 
accepted_sock.global_pipes = true; - let accepted_idx = proc.sockets.alloc(accepted_sock); - - // Push to listener's backlog let listener = proc .sockets - .get_mut(listener_sock_idx) + .get(listener_sock_idx) .ok_or(Errno::EBADF)?; - listener.listen_backlog.push(accepted_idx); let accept_wake_idx = listener.accept_wake_idx; - // Set up client socket - let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; - client.send_buf_idx = Some(pipe_a_idx); // writes to pipe_a (server's reads) - client.recv_buf_idx = Some(pipe_b_idx); // reads from pipe_b (server's writes) - client.state = SocketState::Connected; - client.peer_idx = Some(accepted_idx); - client.global_pipes = true; + if let Some(shared_idx) = listener.shared_backlog_idx { + let pc = crate::socket::PendingConnection { + peer_addr: [0, 0, 0, 0], + peer_addr6: None, + peer_port: 0, + recv_pipe_idx: pipe_a_idx, // server reads client's writes + send_pipe_idx: pipe_b_idx, // server writes to client's reads + }; + if !unsafe { crate::socket::shared_listener_backlog_table().push(shared_idx, pc) } + { + return Err(Errno::ECONNREFUSED); + } + let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; + client.send_buf_idx = Some(pipe_a_idx); + client.recv_buf_idx = Some(pipe_b_idx); + client.state = SocketState::Connected; + client.global_pipes = true; + } else { + // Legacy fallback for listeners without a shared queue. 
+ let mut accepted_sock = + SocketInfo::new(SocketDomain::Unix, SocketType::Stream, 0); + accepted_sock.state = SocketState::Connected; + accepted_sock.recv_buf_idx = Some(pipe_a_idx); + accepted_sock.send_buf_idx = Some(pipe_b_idx); + accepted_sock.global_pipes = true; + let accepted_idx = proc.sockets.alloc(accepted_sock); + + let listener = proc + .sockets + .get_mut(listener_sock_idx) + .ok_or(Errno::EBADF)?; + listener.listen_backlog.push(accepted_idx); + + let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; + client.send_buf_idx = Some(pipe_a_idx); + client.recv_buf_idx = Some(pipe_b_idx); + client.state = SocketState::Connected; + client.peer_idx = Some(accepted_idx); + client.global_pipes = true; - // Set peer_idx on accepted socket - let accepted = proc.sockets.get_mut(accepted_idx).ok_or(Errno::EBADF)?; - accepted.peer_idx = Some(sock_idx); + let accepted = proc.sockets.get_mut(accepted_idx).ok_or(Errno::EBADF)?; + accepted.peer_idx = Some(sock_idx); + } if let Some(idx) = accept_wake_idx { crate::wakeup::push_accept(idx); @@ -19084,6 +19177,123 @@ mod tests { ); } + #[test] + fn test_inet6_accept_reports_ipv4_mapped_and_native_ipv6_peers() { + let mut proc = Process::new(1); + let mut host = MockHostIO::new(); + use crate::pipe::PipeBuffer; + use wasm_posix_shared::socket::*; + + let fd = sys_socket(&mut proc, &mut host, AF_INET6, SOCK_STREAM, 0).unwrap(); + let mut addr = [0u8; 28]; + addr[0] = 10; // AF_INET6 + addr[2] = 0x19; + addr[3] = 0x46; // port 6470 + // Bind to :: so the listener is dual-stack. IPv4 peers accepted on an + // AF_INET6 socket are observed through getpeername(2) as + // IPv4-mapped IPv6 addresses; native IPv6 peers remain native. 
+ sys_bind(&mut proc, &mut host, fd, &addr).unwrap(); + sys_listen(&mut proc, &mut host, fd, 5).unwrap(); + + let listener_entry = proc.fd_table.get(fd).unwrap(); + let listener_ofd = proc.ofd_table.get(listener_entry.ofd_ref.0).unwrap(); + let listener_idx = (-(listener_ofd.host_handle + 1)) as usize; + let shared_idx = proc + .sockets + .get(listener_idx) + .unwrap() + .shared_backlog_idx + .unwrap(); + + let pipe_table = unsafe { crate::pipe::global_pipe_table() }; + let (recv4, send4) = + pipe_table.alloc_pair(PipeBuffer::new(1024), PipeBuffer::new(1024)); + unsafe { crate::socket::shared_listener_backlog_table() }.push( + shared_idx, + crate::socket::PendingConnection { + peer_addr: [127, 0, 0, 1], + peer_addr6: None, + peer_port: 50000, + recv_pipe_idx: recv4, + send_pipe_idx: send4, + }, + ); + let accepted4_fd = sys_accept(&mut proc, &mut host, fd).unwrap(); + let accepted4_entry = proc.fd_table.get(accepted4_fd).unwrap(); + let accepted4_ofd = proc.ofd_table.get(accepted4_entry.ofd_ref.0).unwrap(); + let accepted4_idx = (-(accepted4_ofd.host_handle + 1)) as usize; + assert_eq!( + proc.sockets.get(accepted4_idx).unwrap().peer_addr6, + ipv4_mapped_addr6([127, 0, 0, 1]), + ); + + let (recv6, send6) = + pipe_table.alloc_pair(PipeBuffer::new(1024), PipeBuffer::new(1024)); + unsafe { crate::socket::shared_listener_backlog_table() }.push( + shared_idx, + crate::socket::PendingConnection { + peer_addr: [0, 0, 0, 0], + peer_addr6: Some(loopback_addr6()), + peer_port: 50001, + recv_pipe_idx: recv6, + send_pipe_idx: send6, + }, + ); + let accepted6_fd = sys_accept(&mut proc, &mut host, fd).unwrap(); + let accepted6_entry = proc.fd_table.get(accepted6_fd).unwrap(); + let accepted6_ofd = proc.ofd_table.get(accepted6_entry.ofd_ref.0).unwrap(); + let accepted6_idx = (-(accepted6_ofd.host_handle + 1)) as usize; + assert_eq!(proc.sockets.get(accepted6_idx).unwrap().peer_addr6, loopback_addr6()); + } + + #[test] + fn test_inet6_loopback_listen_registers_host_transport() { + 
let mut proc = Process::new(1); + let mut host = MockHostIO::new(); + use wasm_posix_shared::socket::*; + + let fd = sys_socket(&mut proc, &mut host, AF_INET6, SOCK_STREAM, 0).unwrap(); + let mut addr = [0u8; 28]; + addr[0] = 10; // AF_INET6 + addr[2] = 0x19; + addr[3] = 0x44; // port 6468 + addr[23] = 1; // ::1 + + sys_bind(&mut proc, &mut host, fd, &addr).unwrap(); + sys_listen(&mut proc, &mut host, fd, 5).unwrap(); + + assert_eq!(host.net_listen_calls, vec![(fd, 6468, [127, 0, 0, 1])]); + } + + #[test] + fn test_inet6_loopback_cross_process_connect_uses_host_transport() { + let mut proc = Process::new(1); + let mut host = MockHostIO::new(); + host.net_connect_result = Ok(()); + host.net_connect_status_result = Ok(()); + use wasm_posix_shared::socket::*; + + let fd = sys_socket(&mut proc, &mut host, AF_INET6, SOCK_STREAM, 0).unwrap(); + let mut addr = [0u8; 28]; + addr[0] = 10; // AF_INET6 + addr[2] = 0x19; + addr[3] = 0x45; // port 6469 + addr[23] = 1; // ::1 + + sys_connect(&mut proc, &mut host, fd, &addr).unwrap(); + + assert_eq!(host.net_connect_calls, vec![(0, vec![127, 0, 0, 1], 6469)]); + let entry = proc.fd_table.get(fd).unwrap(); + let ofd = proc.ofd_table.get(entry.ofd_ref.0).unwrap(); + let sock_idx = (-(ofd.host_handle + 1)) as usize; + let sock = proc.sockets.get(sock_idx).unwrap(); + assert_eq!(sock.state, crate::socket::SocketState::Connected); + assert_eq!(sock.peer_addr6, loopback_addr6()); + assert_eq!(sock.peer_port, 6469); + assert_eq!(sock.bind_addr6, loopback_addr6()); + assert_ne!(sock.bind_port, 0); + } + // ── Threading tests ────────────────────────────────────────────── #[test] diff --git a/crates/kernel/src/wasm_api.rs b/crates/kernel/src/wasm_api.rs index ff7c0d18a..79d210743 100644 --- a/crates/kernel/src/wasm_api.rs +++ b/crates/kernel/src/wasm_api.rs @@ -1188,6 +1188,19 @@ fn ensure_memory_covers(_end_addr: usize) { // No-op on non-Wasm targets (tests) } +fn terminate_process_by_signal( + proc: &mut crate::process::Process, + host: 
&mut WasmHostIO, + signum: u32, +) { + proc.sigsuspend_saved_mask = None; + for t in proc.threads.iter_mut() { + t.signals.sigsuspend_saved_mask = None; + } + crate::syscalls::sys_exit(proc, host, 0); + proc.exit_signal = signum & 0x7f; +} + // 3c. Signal delivery at syscall boundaries // --------------------------------------------------------------------------- @@ -1195,7 +1208,6 @@ fn ensure_memory_covers(_end_addr: usize) { fn deliver_pending_signals(proc: &mut Process, host: &mut WasmHostIO) { use crate::signal::{DefaultAction, SignalHandler, default_action}; let tid = crate::process_table::current_tid(); - let _ = host; loop { // Caught signals are delivered by the glue code via // kernel_dequeue_signal; default and ignored signals are consumed here. @@ -1214,8 +1226,7 @@ fn deliver_pending_signals(proc: &mut Process, host: &mut WasmHostIO) { let _ = dequeue_signal_for(proc, tid, signum); match default_action(signum) { DefaultAction::Terminate | DefaultAction::CoreDump => { - proc.state = crate::process::ProcessState::Exited; - proc.exit_status = 128 + signum as i32; + terminate_process_by_signal(proc, host, signum); } _ => {} } @@ -1460,11 +1471,37 @@ pub extern "C" fn kernel_set_process_argv(pid: u32, data_ptr: *const u8, data_le /// Returns 0 on success, -ESRCH if pid not found. #[unsafe(no_mangle)] pub extern "C" fn kernel_set_stdin_pipe(pid: u32) -> i32 { + kernel_set_stdio_pipe(pid, 0) +} + +/// Mark one of a process's standard descriptors as a host-backed pipe. +/// +/// Hosts use this when they connect stdin/stdout/stderr to capture pipes +/// rather than a terminal. The descriptor's host handle is set to its own +/// fd number (0, 1, or 2), so reads and writes still delegate to the host +/// stdio callbacks, but POSIX-visible metadata changes from character device +/// to FIFO: isatty() fails with ENOTTY and fstat() reports S_IFIFO. +/// Returns 0 on success, -EINVAL for non-stdio fds, -ESRCH if pid not found, -EBADF if the fd is not open in that process.
+#[unsafe(no_mangle)] +pub extern "C" fn kernel_set_stdio_pipe(pid: u32, fd: i32) -> i32 { + if !(0..=2).contains(&fd) { + return -(Errno::EINVAL as i32); + } let table = unsafe { &mut *PROCESS_TABLE.0.get() }; if let Some(proc) = table.get_mut(pid) { - // Change OFD 0 (stdin) from CharDevice to Pipe - if let Some(ofd) = proc.ofd_table.get_mut(0) { - ofd.file_type = crate::ofd::FileType::Pipe; + if let Ok(entry) = proc.fd_table.get(fd) { + let ofd_idx = entry.ofd_ref.0; + if let Some(ofd) = proc.ofd_table.get_mut(ofd_idx) { + ofd.host_handle = fd as i64; + ofd.path = match fd { + 0 => b"/dev/stdin".to_vec(), + 1 => b"/dev/stdout".to_vec(), + _ => b"/dev/stderr".to_vec(), + }; + ofd.file_type = crate::ofd::FileType::Pipe; + } + } else { + return -(Errno::EBADF as i32); } 0 } else { @@ -1472,6 +1509,12 @@ pub extern "C" fn kernel_set_stdin_pipe(pid: u32) -> i32 { } } +/// Backwards-compatible alias for older host code. +#[unsafe(no_mangle)] +pub extern "C" fn kernel_set_fd_pipe(pid: u32, fd: i32) -> i32 { + kernel_set_stdio_pipe(pid, fd) +} + fn finish_removed_process(pid: u32, result: crate::process_table::RemoveProcessResult) { use core::sync::atomic::Ordering; @@ -1643,13 +1686,21 @@ pub extern "C" fn kernel_clear_fork_child(pid: u32) -> i32 { } } -/// Get process exit status. -/// Returns exit_status if process is exited, -1 if still alive, -ESRCH if not found. +/// Get process exit status (centralized mode). +/// Returns the shell-style status used by host-side kill scans: normal exit +/// code for regular exits, 128+signal for signal termination, -1 if still +/// alive, or -ESRCH if not found. 
#[unsafe(no_mangle)] pub extern "C" fn kernel_get_process_exit_status(pid: u32) -> i32 { let table = unsafe { &*PROCESS_TABLE.0.get() }; match table.get(pid) { - Some(proc) if proc.state == crate::process::ProcessState::Exited => proc.exit_status, + Some(proc) if proc.state == crate::process::ProcessState::Exited => { + if proc.exit_signal != 0 { + 128 + proc.exit_signal as i32 + } else { + proc.exit_status + } + } Some(_) => -1, None => -(Errno::ESRCH as i32), } @@ -2116,13 +2167,8 @@ pub extern "C" fn kernel_dequeue_signal(pid: u32, out_ptr: *mut u8) -> i32 { let _ = dequeue_signal_for(proc, tid, signum); match default_action(signum) { DefaultAction::Terminate | DefaultAction::CoreDump => { - // Process is dying; clear sigsuspend state - proc.sigsuspend_saved_mask = None; - for t in proc.threads.iter_mut() { - t.signals.sigsuspend_saved_mask = None; - } - proc.state = crate::process::ProcessState::Exited; - proc.exit_status = 128 + signum as i32; + let mut host = WasmHostIO; + terminate_process_by_signal(proc, &mut host, signum); return 0; } _ => continue, @@ -2272,16 +2318,15 @@ pub extern "C" fn kernel_exec_setup(pid: u32) -> i32 { None => return -(Errno::ESRCH as i32), }; - // Serialize as exec state (signal handler reset, etc.) + // Serialize as exec state (signal handler reset, etc.). // CLOEXEC fds were already closed above, so serialization just preserves what's left. - let mut buf = alloc::vec![0u8; 64 * 1024]; - let written = match crate::fork::serialize_exec_state(proc, &mut buf) { - Ok(n) => n, + let buf = match crate::fork::serialize_exec_state_with_growing_buffer(proc) { + Ok(buf) => buf, Err(e) => return -(e as i32), }; // Deserialize back to replace the process with exec-sanitized version - match crate::fork::deserialize_exec_state(&buf[..written], pid) { + match crate::fork::deserialize_exec_state(&buf, pid) { Ok(new_proc) => { table.get_mut(pid).map(|p| { *p = new_proc; @@ -6930,7 +6975,8 @@ pub extern "C" fn kernel_exit(status: i32) -> ! 
{ if unsafe { host_is_thread_worker() } != 0 { // Thread exit: don't destroy shared process state (FDs, pipes, etc.). // Just set exit status and return — the glue will trap via unreachable. - proc.exit_status = status; + proc.exit_status = status & 0xff; + proc.exit_signal = 0; // Drop GKL guard before trapping } else { let mut host = WasmHostIO; @@ -7134,6 +7180,28 @@ pub extern "C" fn kernel_connect(fd: i32, addr_ptr: *const u8, addr_len: u32) -> let (_gkl, proc) = unsafe { get_process() }; let mut host = WasmHostIO; let addr = unsafe { slice::from_raw_parts(addr_ptr, addr_len as usize) }; + if crate::is_centralized_mode() && addr_len >= 28 { + let family = u16::from_le_bytes([addr[0], addr[1]]); + if family == 10 { + let mut ip6 = [0u8; 16]; + ip6.copy_from_slice(&addr[8..24]); + if crate::syscalls::is_loopback_addr6(ip6) + || crate::syscalls::is_unspecified_addr6(ip6) + { + match cross_process_loopback_connect6(proc, fd, addr) { + Ok(()) => { + deliver_pending_signals(proc, &mut host); + return 0; + } + Err(Errno::ECONNREFUSED) => {} + Err(e) => { + deliver_pending_signals(proc, &mut host); + return -(e as i32); + } + } + } + } + } let result = match syscalls::sys_connect(proc, &mut host, fd, addr) { Ok(()) => 0, Err(Errno::ECONNREFUSED) if addr_len >= 3 => { @@ -7214,7 +7282,10 @@ fn cross_process_loopback_connect(proc: &mut Process, fd: i32, addr: &[u8]) -> R if s.state == SocketState::Listening && s.bind_port == port && s.sock_type == SocketType::Stream - && (s.bind_addr == [0, 0, 0, 0] || s.bind_addr == [127, 0, 0, 1]) + && ((s.domain == crate::socket::SocketDomain::Inet + && (s.bind_addr == [0, 0, 0, 0] || s.bind_addr == [127, 0, 0, 1])) + || (s.domain == crate::socket::SocketDomain::Inet6 + && crate::syscalls::is_unspecified_addr6(s.bind_addr6))) { listener_pid = Some(pid); listener_sock_idx = Some(idx); @@ -7283,6 +7354,7 @@ fn cross_process_loopback_connect(proc: &mut Process, fd: i32, addr: &[u8]) -> R let pc = crate::socket::PendingConnection { 
peer_addr: client_addr, + peer_addr6: None, peer_port: client_port, recv_pipe_idx: pipe_a_idx, // server reads client's writes send_pipe_idx: pipe_b_idx, // server writes to client's reads @@ -7298,7 +7370,136 @@ fn cross_process_loopback_connect(proc: &mut Process, fd: i32, addr: &[u8]) -> R Ok(()) } -/// Cross-process AF_UNIX connect. +/// Cross-process loopback TCP connect for AF_INET6 (centralized mode only). +/// +/// Searches all processes for a matching AF_INET6 listener and queues a +/// pending connection carrying the real IPv6 peer address. This avoids routing +/// guest ::1 connections through the host IPv4 bridge, where they are +/// indistinguishable from IPv4 clients and get reported to acceptors as the +/// wrong peer address. +fn cross_process_loopback_connect6(proc: &mut Process, fd: i32, addr: &[u8]) -> Result<(), Errno> { + use crate::pipe::PipeBuffer; + use crate::socket::{SocketDomain, SocketState, SocketType}; + + if addr.len() < 28 { + return Err(Errno::EINVAL); + } + let port = u16::from_be_bytes([addr[2], addr[3]]); + let mut ip6 = [0u8; 16]; + ip6.copy_from_slice(&addr[8..24]); + let dst_ip6 = if crate::syscalls::is_unspecified_addr6(ip6) { + crate::syscalls::loopback_addr6() + } else { + ip6 + }; + + let entry = proc.fd_table.get(fd)?; + let ofd = proc.ofd_table.get(entry.ofd_ref.0).ok_or(Errno::EBADF)?; + let sock_idx = (-(ofd.host_handle + 1)) as usize; + + { + let sock = proc.sockets.get(sock_idx).ok_or(Errno::EBADF)?; + if sock.sock_type == SocketType::Dgram { + return Err(Errno::ECONNREFUSED); + } + if sock.sock_type != SocketType::Stream || sock.domain != SocketDomain::Inet6 { + return Err(Errno::EOPNOTSUPP); + } + } + + let table = unsafe { &mut *PROCESS_TABLE.0.get() }; + let my_pid = proc.pid; + let mut listener_pid: Option = None; + let mut listener_sock_idx: Option = None; + + for (&pid, target_proc) in table.processes.iter().rev() { + if pid == my_pid { + continue; + } + for idx in 0..target_proc.sockets.len() { + if let 
Some(s) = target_proc.sockets.get(idx) { + if s.domain == SocketDomain::Inet6 + && s.state == SocketState::Listening + && s.bind_port == port + && s.sock_type == SocketType::Stream + && (crate::syscalls::is_unspecified_addr6(s.bind_addr6) + || s.bind_addr6 == dst_ip6) + { + listener_pid = Some(pid); + listener_sock_idx = Some(idx); + break; + } + } + } + if listener_pid.is_some() { + break; + } + } + + let listener_pid = listener_pid.ok_or(Errno::ECONNREFUSED)?; + let listener_sock_idx = listener_sock_idx.ok_or(Errno::ECONNREFUSED)?; + + let pipe_table = unsafe { crate::pipe::global_pipe_table() }; + let pipe_a_idx = pipe_table.alloc(PipeBuffer::new(65536)); + let pipe_b_idx = pipe_table.alloc(PipeBuffer::new(65536)); + + let proc = table.get_mut(my_pid).ok_or(Errno::ESRCH)?; + let client_sock = proc.sockets.get(sock_idx).ok_or(Errno::EBADF)?; + let client_addr6 = if crate::syscalls::is_unspecified_addr6(client_sock.bind_addr6) { + crate::syscalls::loopback_addr6() + } else { + client_sock.bind_addr6 + }; + let mut client_port = client_sock.bind_port; + if client_port == 0 { + client_port = proc.next_ephemeral_port; + proc.next_ephemeral_port = proc.next_ephemeral_port.wrapping_add(1); + if proc.next_ephemeral_port == 0 { + proc.next_ephemeral_port = 49152; + } + } + + let client = proc.sockets.get_mut(sock_idx).ok_or(Errno::EBADF)?; + client.send_buf_idx = Some(pipe_a_idx); + client.recv_buf_idx = Some(pipe_b_idx); + client.state = SocketState::Connected; + client.peer_addr6 = dst_ip6; + client.peer_port = port; + client.global_pipes = true; + if client.bind_port == 0 { + client.bind_port = client_port; + client.bind_addr6 = client_addr6; + } + + let listener_proc = table.get_mut(listener_pid).ok_or(Errno::ESRCH)?; + let listener_sock = listener_proc + .sockets + .get(listener_sock_idx) + .ok_or(Errno::ECONNREFUSED)?; + let shared_idx = listener_sock + .shared_backlog_idx + .ok_or(Errno::ECONNREFUSED)?; + let accept_wake_idx = listener_sock.accept_wake_idx; + + 
let pc = crate::socket::PendingConnection { + peer_addr: [0, 0, 0, 0], + peer_addr6: Some(client_addr6), + peer_port: client_port, + recv_pipe_idx: pipe_a_idx, + send_pipe_idx: pipe_b_idx, + }; + if !unsafe { crate::socket::shared_listener_backlog_table().push(shared_idx, pc) } { + return Err(Errno::ECONNREFUSED); + } + + if let Some(idx) = accept_wake_idx { + crate::wakeup::push_accept(idx); + } + + Ok(()) +} + +/// Cross-process AF_UNIX connect (centralized mode only). /// /// Looks up the target path in the global UnixSocketRegistry, then creates /// global pipe pairs to connect the client (current process) to the listener @@ -7358,21 +7559,41 @@ fn cross_process_unix_connect(proc: &mut Process, fd: i32, addr: &[u8]) -> Resul return Err(Errno::ECONNREFUSED); } - // Create accepted socket in the listener's process - let mut accepted_sock = SocketInfo::new(SocketDomain::Unix, SocketType::Stream, 0); - accepted_sock.state = SocketState::Connected; - accepted_sock.recv_buf_idx = Some(pipe_a_idx); - accepted_sock.send_buf_idx = Some(pipe_b_idx); - accepted_sock.global_pipes = true; - let accepted_idx = listener_proc.sockets.alloc(accepted_sock); - - // Push to listener's backlog + // Queue the connection on the listener's shared accept queue when + // available. POSIX listener fds inherited across fork/spawn share the same + // underlying socket queue, so the accepted socket must be materialized in + // whichever process actually calls accept(). 
let listener = listener_proc .sockets - .get_mut(listener_sock_idx) + .get(listener_sock_idx) .ok_or(Errno::EBADF)?; - listener.listen_backlog.push(accepted_idx); let accept_wake_idx = listener.accept_wake_idx; + if let Some(shared_idx) = listener.shared_backlog_idx { + let pc = crate::socket::PendingConnection { + peer_addr: [0, 0, 0, 0], + peer_addr6: None, + peer_port: 0, + recv_pipe_idx: pipe_a_idx, + send_pipe_idx: pipe_b_idx, + }; + if !unsafe { crate::socket::shared_listener_backlog_table().push(shared_idx, pc) } { + return Err(Errno::ECONNREFUSED); + } + } else { + // Legacy fallback for listeners without a shared queue. + let mut accepted_sock = SocketInfo::new(SocketDomain::Unix, SocketType::Stream, 0); + accepted_sock.state = SocketState::Connected; + accepted_sock.recv_buf_idx = Some(pipe_a_idx); + accepted_sock.send_buf_idx = Some(pipe_b_idx); + accepted_sock.global_pipes = true; + let accepted_idx = listener_proc.sockets.alloc(accepted_sock); + + let listener = listener_proc + .sockets + .get_mut(listener_sock_idx) + .ok_or(Errno::EBADF)?; + listener.listen_backlog.push(accepted_idx); + } // Set up client socket (in current process) let client_proc = table.get_mut(my_pid).ok_or(Errno::ESRCH)?; @@ -9871,6 +10092,7 @@ pub extern "C" fn kernel_inject_connection( peer_addr_c as u8, peer_addr_d as u8, ], + peer_addr6: None, peer_port: peer_port as u16, recv_pipe_idx, send_pipe_idx, diff --git a/crates/shared/src/lib.rs b/crates/shared/src/lib.rs index 950098fd4..2b0f12907 100644 --- a/crates/shared/src/lib.rs +++ b/crates/shared/src/lib.rs @@ -24,7 +24,9 @@ pub mod host_abi; /// with a wasm-declared reserved thread-slot count. /// 15: remove the obsolete `kernel_set_mode` export; the kernel is always /// the shared point of contact for all programs. -pub const ABI_VERSION: u32 = 15; +/// 16: remove the obsolete `kernel_mark_process_exited` export and split +/// `kernel_preadv`/`kernel_pwritev` offsets into explicit lo/hi i32 args. 
+pub const ABI_VERSION: u32 = 16; /// Syscall numbers for the POSIX kernel interface. #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/examples/mmap_shared_anonymous_fork.c b/examples/mmap_shared_anonymous_fork.c new file mode 100644 index 000000000..b876abfa2 --- /dev/null +++ b/examples/mmap_shared_anonymous_fork.c @@ -0,0 +1,84 @@ +#include +#include +#include +#include +#include + +static int wait_ok(pid_t pid) { + int status = 0; + if (waitpid(pid, &status, 0) < 0) { + perror("waitpid"); + return 0; + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fprintf(stderr, "child failed: status=%d\n", status); + return 0; + } + return 1; +} + +int main(void) { + const long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + perror("sysconf"); + return 1; + } + + char *shared = mmap( + NULL, + (size_t)page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + -1, + 0 + ); + if (shared == MAP_FAILED) { + perror("mmap"); + return 1; + } + + shared[0] = 'A'; + pid_t pid = fork(); + if (pid < 0) { + perror("fork"); + return 1; + } + if (pid == 0) { + if (shared[0] != 'A') { + fprintf(stderr, "child did not see parent write: %c\n", shared[0]); + _exit(2); + } + shared[0] = 'B'; + _exit(0); + } + if (!wait_ok(pid)) return 1; + if (shared[0] != 'B') { + fprintf(stderr, "parent did not see child write: %c\n", shared[0]); + return 1; + } + printf("inherited anonymous mapping coherent\n"); + + shared[1] = 'C'; + pid = fork(); + if (pid < 0) { + perror("fork second"); + return 1; + } + if (pid == 0) { + shared[1] = 'D'; + _exit(0); + } + if (!wait_ok(pid)) return 1; + if (shared[1] != 'D') { + fprintf(stderr, "parent did not see second child write: %c\n", shared[1]); + return 1; + } + printf("reused anonymous backing coherent\n"); + + if (munmap(shared, (size_t)page_size) < 0) { + perror("munmap"); + return 1; + } + printf("PASS\n"); + return 0; +} diff --git a/examples/mmap_shared_anonymous_fork.wasm 
b/examples/mmap_shared_anonymous_fork.wasm new file mode 100644 index 000000000..1f4b9a739 Binary files /dev/null and b/examples/mmap_shared_anonymous_fork.wasm differ diff --git a/examples/run-example.ts b/examples/run-example.ts index 4ef23dcbb..25cf5784b 100644 --- a/examples/run-example.ts +++ b/examples/run-example.ts @@ -24,6 +24,8 @@ const repoRoot = resolve(dirname(new URL(import.meta.url).pathname), ".."); // need the path must handle null explicitly. const coreutilsWasm = tryResolveBinary("programs/coreutils.wasm"); const dashWasm = tryResolveBinary("programs/dash.wasm"); +const shWasm = tryResolveBinary("programs/sh.wasm"); +const shellWasm = dashWasm ?? shWasm; const grepWasm = tryResolveBinary("programs/grep.wasm"); const sedWasm = tryResolveBinary("programs/sed.wasm"); const gitWasm = tryResolveBinary("programs/git/git.wasm"); @@ -77,8 +79,8 @@ const builtinPrograms: Record = { "echo": echoWasm, "/bin/echo": echoWasm, "/usr/bin/echo": echoWasm, - "sh": dashWasm, - "/bin/sh": dashWasm, + "sh": shellWasm, + "/bin/sh": shellWasm, "dash": dashWasm, "/bin/dash": dashWasm, "grep": grepWasm, diff --git a/host/src/browser-kernel-host.ts b/host/src/browser-kernel-host.ts index f15566ee4..083b5065f 100644 --- a/host/src/browser-kernel-host.ts +++ b/host/src/browser-kernel-host.ts @@ -115,6 +115,8 @@ export interface BrowserKernelBootOptions { pty?: boolean; /** Initial stdin bytes (with implicit EOF). */ stdin?: Uint8Array; + /** Stdio fds (0, 1, 2) that should be host-backed pipes, not terminals. 
*/ + pipeStdio?: number[]; } export class BrowserKernel { @@ -421,6 +423,7 @@ export class BrowserKernel { gid: options.gid, pty: options.pty, stdin: options.stdin, + pipeStdio: options.pipeStdio, maxPages: this.maxPages, }) as number; @@ -465,6 +468,7 @@ export class BrowserKernel { env?: string[]; cwd?: string; stdin?: Uint8Array; + pipeStdio?: number[]; pty?: boolean; uid?: number; gid?: number; @@ -497,6 +501,7 @@ export class BrowserKernel { ptyCols: options?.ptyCols, ptyRows: options?.ptyRows, stdin: options?.stdin, + pipeStdio: options?.pipeStdio, maxPages: this.maxPages, }, [bytesToSend]); @@ -538,6 +543,7 @@ export class BrowserKernel { gid?: number; pty?: boolean; stdin?: Uint8Array; + pipeStdio?: number[]; ptyCols?: number; ptyRows?: number; }, @@ -556,6 +562,7 @@ export class BrowserKernel { ptyCols: options?.ptyCols, ptyRows: options?.ptyRows, stdin: options?.stdin, + pipeStdio: options?.pipeStdio, maxPages: this.maxPages, }) as number; diff --git a/host/src/browser-kernel-protocol.ts b/host/src/browser-kernel-protocol.ts index b64d202cb..ce9521b5f 100644 --- a/host/src/browser-kernel-protocol.ts +++ b/host/src/browser-kernel-protocol.ts @@ -89,6 +89,8 @@ export interface SpawnMessage { ptyCols?: number; ptyRows?: number; stdin?: Uint8Array; + /** Stdio fds (0, 1, 2) that should be host-backed pipes, not terminals. */ + pipeStdio?: number[]; maxPages?: number; } diff --git a/host/src/browser-kernel-worker-entry.ts b/host/src/browser-kernel-worker-entry.ts index fa3b2fc86..f2414e1b9 100644 --- a/host/src/browser-kernel-worker-entry.ts +++ b/host/src/browser-kernel-worker-entry.ts @@ -903,9 +903,15 @@ async function handleSpawn(msg: Extract) if (msg.ptyCols != null && msg.ptyRows != null) { kernelWorker.ptySetWinsize(ptyIdx, msg.ptyRows, msg.ptyCols); } - } else if (msg.stdin) { - const stdinData = msg.stdin instanceof Uint8Array ? 
msg.stdin : new Uint8Array(msg.stdin); - kernelWorker.setStdinData(pid, stdinData); + } else { + if (msg.pipeStdio) { + kernelWorker.setStdioPipes(pid, msg.pipeStdio); + } + if (msg.stdin) { + const stdinData = + msg.stdin instanceof Uint8Array ? msg.stdin : new Uint8Array(msg.stdin); + kernelWorker.setStdinData(pid, stdinData); + } } const initData: CentralizedWorkerInitMessage = { @@ -1066,6 +1072,7 @@ async function handleFork( maxAddr: childLayout.maxAddr, mmapBase: childLayout.mmapBase, }); + kernelWorker.inheritProcessSharedMappings(parentPid, childPid); const forkBufAddr = threadFork ? threadFork.forkBufAddr diff --git a/host/src/dylink.ts b/host/src/dylink.ts index f159d2b8e..ef0b613f9 100644 --- a/host/src/dylink.ts +++ b/host/src/dylink.ts @@ -6,6 +6,8 @@ * https://github.com/WebAssembly/tool-conventions/blob/main/DynamicLinking.md */ +import { FORK_SAVE_BUFFER_SIZE } from "./process-memory"; + // dylink.0 sub-section types const WASM_DYLINK_MEM_INFO = 1; const WASM_DYLINK_NEEDED = 2; @@ -16,6 +18,14 @@ const WASM_DYLINK_IMPORT_INFO = 4; const WASM_DYLINK_FLAG_TLS = 0x01; const WASM_DYLINK_FLAG_WEAK = 0x02; +const WPK_FORK_EXPORTS = [ + "wpk_fork_unwind_begin", + "wpk_fork_unwind_end", + "wpk_fork_rewind_begin", + "wpk_fork_rewind_end", + "wpk_fork_state", +] as const; + export interface DylinkMetadata { /** Bytes of linear memory this module needs */ memorySize: number; @@ -170,6 +180,21 @@ export interface LoadedSharedLibrary { metadata: DylinkMetadata; /** Path/name of the library */ name: string; + /** Fork save buffer for side modules that can call fork via env.fork. */ + forkBufAddr?: number; +} + +export interface SideModuleForkState { + name: string; + instance: WebAssembly.Instance; + forkBufAddr: number; +} + +export interface SideModuleForkSupport { + /** Mark the side module whose stack is currently unwinding for fork(). 
*/ + setActiveFork: (state: SideModuleForkState) => void; + /** Clear an active side-module fork after rewind reaches env.fork again. */ + clearActiveFork: (state: SideModuleForkState) => void; } /** @@ -193,6 +218,8 @@ export interface DylinkReplayOptions { * the memcpy'd data section encode (memoryBase + offset); using any * other base corrupts pointers. */ memoryBase: number; + /** Side-module fork save buffer copied from the parent, if any. */ + forkBufAddr?: number; } /** @@ -215,6 +242,8 @@ export interface LoadSharedLibraryOptions { got: Map; /** Already-loaded libraries for dedup and dependency resolution */ loadedLibraries: Map; + /** Multi-module fork support for side modules loaded into this process. */ + sideModuleFork?: SideModuleForkSupport; /** Callback to locate and read a library file by name (async version) */ resolveLibrary?: (name: string) => Promise; /** Callback to locate and read a library file by name (sync version) */ @@ -345,6 +374,72 @@ function instantiateSharedLibrary( ? 
new (WebAssembly as any).Tag({ parameters: ["i32"] }) : undefined; + const module = new WebAssembly.Module(wasmBytes as unknown as BufferSource); + const moduleImports = WebAssembly.Module.imports(module); + const moduleExports = WebAssembly.Module.exports(module); + const importsFork = moduleImports.some((imp) => + imp.module === "env" && imp.name === "fork" && imp.kind === "function" + ); + const hasCompleteForkInstrumentation = + WPK_FORK_EXPORTS.every((name) => + moduleExports.some((exp) => exp.kind === "function" && exp.name === name), + ); + const hasAnyForkInstrumentation = + WPK_FORK_EXPORTS.some((name) => + moduleExports.some((exp) => exp.kind === "function" && exp.name === name), + ); + if (hasAnyForkInstrumentation && !hasCompleteForkInstrumentation) { + const missing = WPK_FORK_EXPORTS.filter((name) => + !moduleExports.some((exp) => exp.kind === "function" && exp.name === name), + ); + throw new Error(`${name}: incomplete wasm-fork-instrument exports; missing ${missing.join(", ")}`); + } + + const sideForkBufAddr = importsFork && hasCompleteForkInstrumentation && options.sideModuleFork + ? (replay?.forkBufAddr + ?? options.allocateMemory?.(FORK_SAVE_BUFFER_SIZE, 16) + ?? 
0) + : 0; + let instance: WebAssembly.Instance | null = null; + let sideForkState: SideModuleForkState | null = null; + + const sideModuleForkImport = (): number => { + if (!options.sideModuleFork || sideForkBufAddr === 0 || !instance) { + throw new Error( + `${name}: env.fork reached without complete side-module fork support; ` + + "rebuild the side module with wasm-fork-instrument --entry env.fork", + ); + } + + const mainFork = options.globalSymbols.get("fork"); + if (typeof mainFork !== "function") { + throw new Error(`${name}: env.fork could not resolve main module fork`); + } + + const state = (instance.exports.wpk_fork_state as () => number)(); + if (state === 2) { + (instance.exports.wpk_fork_rewind_end as () => void)(); + const result = Number((mainFork as () => number)()); + if (sideForkState) { + options.sideModuleFork.clearActiveFork(sideForkState); + sideForkState = null; + } + return result; + } + + (instance.exports.wpk_fork_unwind_begin as (addr: number) => void)(sideForkBufAddr); + sideForkState = { name, instance, forkBufAddr: sideForkBufAddr }; + options.sideModuleFork.setActiveFork(sideForkState); + return Number((mainFork as () => number)()); + }; + + const uninstrumentedSideModuleForkImport = (): number => { + throw new Error( + `${name}: env.fork reached from an uninstrumented side module. ` + + "Rebuild the side module with wasm-fork-instrument --entry env.fork.", + ); + }; + // Construct imports const imports: WebAssembly.Imports = { env: new Proxy({} as Record, { @@ -356,6 +451,13 @@ function instantiateSharedLibrary( case "__table_base": return tableBaseGlobal; case "__stack_pointer": return options.stackPointer; case "__c_longjmp": return longjmpTag; + case "fork": + if (importsFork && options.sideModuleFork) { + return hasCompleteForkInstrumentation + ? 
sideModuleForkImport + : uninstrumentedSideModuleForkImport; + } + break; } const sym = options.globalSymbols.get(prop); if (sym !== undefined) return sym; @@ -364,6 +466,7 @@ function instantiateSharedLibrary( has(_target, prop: string) { if (["memory", "__indirect_function_table", "__memory_base", "__table_base", "__stack_pointer", "__c_longjmp"].includes(prop)) return true; + if (prop === "fork" && importsFork && options.sideModuleFork) return true; return options.globalSymbols.has(prop); }, }), @@ -379,9 +482,8 @@ function instantiateSharedLibrary( }), }; - // Compile and instantiate synchronously - const module = new WebAssembly.Module(wasmBytes as unknown as BufferSource); - const instance = new WebAssembly.Instance(module, imports); + // Instantiate synchronously + instance = new WebAssembly.Instance(module, imports); // Relocate exports: data address globals need memoryBase added const relocatedExports: Record = {}; @@ -448,6 +550,7 @@ function instantiateSharedLibrary( exports: relocatedExports, metadata, name, + forkBufAddr: sideForkBufAddr || undefined, }; options.loadedLibraries.set(name, loaded); diff --git a/host/src/generated/abi.ts b/host/src/generated/abi.ts index 8537e5976..515696b9c 100644 --- a/host/src/generated/abi.ts +++ b/host/src/generated/abi.ts @@ -1,7 +1,7 @@ /* GENERATED by `cargo xtask dump-abi`. Do not edit by hand. */ /* Regenerated by scripts/check-abi-version.sh; drift is a CI failure. 
*/ -export const ABI_VERSION = 15 as const; +export const ABI_VERSION = 16 as const; export const ABI_CUSTOM_SECTION = "wasm-posix-abi" as const; export const ABI_KERNEL_EXPORT = "__abi_version" as const; diff --git a/host/src/kernel-worker.ts b/host/src/kernel-worker.ts index d22c521b7..92f447155 100644 --- a/host/src/kernel-worker.ts +++ b/host/src/kernel-worker.ts @@ -97,8 +97,12 @@ const FORK_BUF_SIZE = FORK_SAVE_BUFFER_SIZE; /** Errno values */ const EAGAIN = 11; +const EFAULT = 14; +const EEXIST = 17; +const ENAMETOOLONG = 36; const ETIMEDOUT = 110; const EINTR_ERRNO = 4; +const SHM_RDONLY = 0o10000; /** Syscall numbers for sleep/delay */ const SYS_NANOSLEEP = ABI_SYSCALLS.Nanosleep; @@ -177,12 +181,17 @@ const SYS_MREMAP = ABI_SYSCALLS.Mremap; const SYS_MSYNC = ABI_SYSCALLS.Msync; const SYS_WRITE = ABI_SYSCALLS.Write; const SYS_READ = ABI_SYSCALLS.Read; +const SYS_FSTAT = ABI_SYSCALLS.Fstat; const SYS_PREAD = ABI_SYSCALLS.Pread; const SYS_PWRITE = ABI_SYSCALLS.Pwrite; +const SYS_SENDFILE = ABI_SYSCALLS.Sendfile; const SYS_SEND = ABI_SYSCALLS.Send; const SYS_RECV = ABI_SYSCALLS.Recv; const SYS_SENDTO = ABI_SYSCALLS.Sendto; const SYS_RECVFROM = ABI_SYSCALLS.Recvfrom; +const SYS_FSYNC = ABI_SYSCALLS.Fsync; +const SYS_FDATASYNC = ABI_SYSCALLS.Fdatasync; +const SYS_FTRUNCATE = ABI_SYSCALLS.Ftruncate; const SYS_SENDMSG = ABI_SYSCALLS.Sendmsg; const SYS_RECVMSG = ABI_SYSCALLS.Recvmsg; const SYS_ACCEPT = ABI_SYSCALLS.Accept; @@ -194,6 +203,10 @@ const MSG_DONTWAIT = 0x0040; /** mmap flags */ const MAP_SHARED = 0x01; const MAP_ANONYMOUS = 0x20; +const PROT_WRITE = 0x02; +const O_RDONLY = 0; +const O_RDWR = 2; +const FILE_PAGE_SIZE = 4096; /** Syscall numbers for scatter/gather I/O */ const SYS_WRITEV = ABI_SYSCALLS.Writev; @@ -214,6 +227,13 @@ const SYS_MQ_TIMEDSEND = ABI_SYSCALLS.MqTimedsend; const SYS_MQ_TIMEDRECEIVE = ABI_SYSCALLS.MqTimedreceive; const SYS_CLOSE = ABI_SYSCALLS.Close; +const SYS_DUP = ABI_SYSCALLS.Dup; +const SYS_DUP2 = ABI_SYSCALLS.Dup2; +const 
SYS_DUP3 = ABI_SYSCALLS.Dup3; + +const F_DUPFD = 0; +const F_DUPFD_CLOEXEC = 1030; +const F_DUPFD_CLOFORK = 1028; /** IPC constants (must match musl) */ const IPC_64 = 0x100; @@ -233,6 +253,8 @@ const EAGAIN_RETRY_MS = 1; /** Profiling: enabled via WASM_POSIX_PROFILE env var. Zero-cost when disabled. */ const PROFILING = typeof process !== 'undefined' && !!process.env?.WASM_POSIX_PROFILE; +const THREAD_TRACE = typeof process !== "undefined" && !!process.env?.KERNEL_THREAD_TRACE; +const EXIT_TRACE = typeof process !== "undefined" && !!process.env?.KERNEL_EXIT_TRACE; /** Read-like syscalls that may block on pipe/socket data */ const READ_LIKE_SYSCALLS = new Set([ @@ -454,6 +476,43 @@ interface ProcessRegistration { * max_addr as channels are added; dynamic pthread control slots must not. */ explicitMaxAddr: boolean; + /** True when the guest glue traps instead of returning on CH_ERROR. */ + channelErrorTraps?: boolean; +} + +interface SharedMmapFdStat { + key: string; + size: number; +} + +interface SharedMmapBacking { + key: string; + path: string; + handle: number; + anonymous: boolean; + writable: boolean; + pages: Map; + dirtyPages: Set; + refCount: number; + version: number; +} + +interface SharedMmapMapping { + fd: number; + fileOffset: number; + len: number; + writable: boolean; + backingKey: string; + snapshot: Uint8Array; + version: number; +} + +interface SysvShmMapping { + segId: number; + size: number; + readOnly: boolean; + snapshot: Uint8Array; + version: number; } interface RegisterProcessOptions { @@ -734,6 +793,7 @@ export class CentralizedKernelWorker { /** Pending pselect6/select retries — keyed by channelOffset for per-thread tracking */ private pendingSelectRetries = new Map void; }>>(); - /** Per-process MAP_SHARED file-backed mappings: pid → Map */ - private sharedMappings = new Map>(); + /** Per-process MAP_SHARED mappings: pid → Map */ + private sharedMappings = new Map>(); + /** Host page-cache entries backing MAP_SHARED mappings. 
*/ + private sharedMmapBackings = new Map(); + /** Cached process-fd to shared-mmap backing resolution. Negative entries avoid per-read fstat/path probes. */ + private sharedMmapFdCache = new Map(); + /** Monotonic id for anonymous MAP_SHARED backings. */ + private nextAnonymousMmapBackingId = 1; /** Host-side mirror of epoll interest lists: "pid:epfd" → interests. * Maintained by intercepting epoll_ctl results. Used by handleEpollPwait * to convert epoll_pwait to poll without calling kernel_handle_channel * (which crashes in Chrome for epoll_pwait due to a suspected V8 bug). */ private epollInterests = new Map>(); private lockTable: SharedLockTable | null = null; - /** Per-process shared memory mappings: pid → Map */ - private shmMappings = new Map>(); + /** Per-process SysV shared memory mappings. */ + private shmMappings = new Map>(); + /** Monotonic version per SysV segment, bumped when an attachment publishes writes. */ + private shmSegmentVersions = new Map(); /** PTY index → pid mapping (for draining output after syscalls) */ private ptyIndexByPid = new Map(); @@ -1185,6 +1249,27 @@ export class CentralizedKernelWorker { } } + /** + * Mark selected stdio descriptors as host-backed pipes rather than + * terminal character devices. Reads/writes still use host handles 0/1/2, + * but POSIX-visible metadata changes so isatty() fails with ENOTTY and + * fstat() reports FIFO semantics. + */ + setStdioPipes(pid: number, fds: number[]): void { + if (!this.kernelInstance) return; + const kernelSetStdioPipe = + (this.kernelInstance.exports.kernel_set_stdio_pipe ?? + this.kernelInstance.exports.kernel_set_fd_pipe) as + | ((pid: number, fd: number) => number) + | undefined; + if (!kernelSetStdioPipe) return; + for (const fd of fds) { + if (fd >= 0 && fd <= 2) { + kernelSetStdioPipe(pid, fd); + } + } + } + /** * Set stdout/stderr capture callbacks on the underlying kernel instance. * Must be called after construction but works at any time. 
@@ -1491,6 +1576,8 @@ export class CentralizedKernelWorker { // Clean up network listeners/endpoints for this process this.cleanupUdpBindings(pid); this.cleanupTcpListeners(pid); + this.releaseAllSharedMappingsForProcess(pid); + this.releaseAllSysvShmMappingsForProcess(pid); // Clean up pending poll retries this.cleanupPendingPollRetries(pid); @@ -1604,6 +1691,8 @@ export class CentralizedKernelWorker { // Clean up network listeners/endpoints for this process this.cleanupUdpBindings(pid); this.cleanupTcpListeners(pid); + this.releaseAllSharedMappingsForProcess(pid); + this.releaseAllSysvShmMappingsForProcess(pid); // Clear the killed-but-not-yet-reaped guard for this pid; if the // pid is later reused for a fresh fork+register, the new process // gets its own reaping decision. @@ -1837,6 +1926,96 @@ export class CentralizedKernelWorker { return new TextDecoder("utf-8", { fatal: false }).decode(copy); } + private writeKernelScratchString(value: string, offset: number): { ptr: number; len: number } { + const encoded = new TextEncoder().encode(value); + const ptr = this.scratchOffset + offset; + const kernelMem = this.getKernelMem(); + if (offset + encoded.length + 1 > CH_DATA_SIZE) { + throw new Error(`kernel scratch string too large (${encoded.length} bytes)`); + } + kernelMem.set(encoded, ptr); + kernelMem[ptr + encoded.length] = 0; + return { ptr, len: encoded.length }; + } + + private unsetProcessEnv(pid: number, name: string): void { + const setCurrentPid = this.kernelInstance!.exports.kernel_set_current_pid as + ((pid: number) => void) | undefined; + const unsetEnv = this.kernelInstance!.exports.kernel_unsetenv as + ((namePtr: KernelPointer, nameLen: number) => number) | undefined; + if (!setCurrentPid || !unsetEnv) return; + + const nameBuf = this.writeKernelScratchString(name, 0); + setCurrentPid(pid); + unsetEnv(this.toKernelPtr(nameBuf.ptr), nameBuf.len); + } + + private setProcessEnv(pid: number, entry: string): void { + const eq = entry.indexOf("="); + if 
(eq <= 0) return; + + const setCurrentPid = this.kernelInstance!.exports.kernel_set_current_pid as + ((pid: number) => void) | undefined; + const setEnv = this.kernelInstance!.exports.kernel_setenv as + ((namePtr: KernelPointer, nameLen: number, valuePtr: KernelPointer, valueLen: number, overwrite: number) => number) | undefined; + if (!setCurrentPid || !setEnv) return; + + const name = entry.slice(0, eq); + const value = entry.slice(eq + 1); + const nameBuf = this.writeKernelScratchString(name, 0); + const valueBuf = this.writeKernelScratchString(value, nameBuf.len + 1); + setCurrentPid(pid); + setEnv(this.toKernelPtr(nameBuf.ptr), nameBuf.len, this.toKernelPtr(valueBuf.ptr), valueBuf.len, 1); + } + + private replaceProcessEnvironment(pid: number, env: string[]): void { + for (const entry of this.snapshotProcessEnv(pid)) { + const eq = entry.indexOf("="); + if (eq > 0) this.unsetProcessEnv(pid, entry.slice(0, eq)); + } + for (const entry of env) { + this.setProcessEnv(pid, entry); + } + } + + private snapshotCurrentProcessStrings( + pid: number, + countExport: "kernel_get_argc" | "kernel_environ_count", + readExport: "kernel_argv_read" | "kernel_environ_get", + ): string[] { + const setCurrentPid = this.kernelInstance!.exports.kernel_set_current_pid as + ((pid: number) => void) | undefined; + const countFn = this.kernelInstance!.exports[countExport] as + (() => number) | undefined; + const readFn = this.kernelInstance!.exports[readExport] as + ((index: number, bufPtr: KernelPointer, bufLen: number) => number) | undefined; + if (!setCurrentPid || !countFn || !readFn) return []; + + setCurrentPid(pid); + const count = countFn(); + const out: string[] = []; + const maxLen = Math.min(CH_DATA_SIZE, 65536); + const ptr = this.scratchOffset + CH_DATA; + const kernelMem = this.getKernelMem(); + const decoder = new TextDecoder(); + for (let i = 0; i < count; i++) { + const len = readFn(i, this.toKernelPtr(ptr), maxLen); + if (len <= 0 || len > maxLen) continue; + const 
copy = new Uint8Array(len); + copy.set(kernelMem.subarray(ptr, ptr + len)); + out.push(decoder.decode(copy)); + } + return out; + } + + snapshotProcessArgv(pid: number): string[] { + return this.snapshotCurrentProcessStrings(pid, "kernel_get_argc", "kernel_argv_read"); + } + + snapshotProcessEnv(pid: number): string[] { + return this.snapshotCurrentProcessStrings(pid, "kernel_environ_count", "kernel_environ_get"); + } + /** Format a syscall for logging, decoding path/string args from process memory */ private formatSyscallEntry(channel: ChannelInfo, syscallNr: number, args: number[]): string { const name = SYSCALL_NAMES[syscallNr] ?? `syscall_${syscallNr}`; @@ -1992,6 +2171,10 @@ export class CentralizedKernelWorker { logEntry = this.formatSyscallEntry(channel, syscallNr, origArgs); } + this.synchronizeSharedMappingsForSyscallBoundary(channel, syscallNr); + this.synchronizeSysvShmMappingsForSyscallBoundary(channel); + this.flushSharedMappingsBeforeFileSyscall(channel, syscallNr, origArgs); + // --- Intercept fork/exec/clone/exit before calling kernel --- // These syscalls need special async handling that can't go through // the blocking host_fork/host_exec imports. 
@@ -2405,26 +2588,20 @@ export class CentralizedKernelWorker { console.error(`[BRK ALERT] pid=${channel.pid} brk returned 0x${(retVal >>> 0).toString(16)} — IN THREAD REGION!`); } - // --- File-backed mmap: populate mapped region with file data --- + // --- mmap backing: populate file mappings and track MAP_SHARED mappings --- if (syscallNr === SYS_MMAP && retVal > 0 && (retVal >>> 0) !== 0xffffffff) { const mmapFd = origArgs[4]; const mmapFlags = origArgs[3] >>> 0; - if (mmapFd >= 0 && (mmapFlags & MAP_ANONYMOUS) === 0) { - this.populateMmapFromFile(channel, retVal >>> 0, origArgs); - // Track MAP_SHARED file-backed mappings for msync writeback - if (mmapFlags & MAP_SHARED) { - const pageOffset = origArgs[5] >>> 0; - let pidMap = this.sharedMappings.get(channel.pid); - if (!pidMap) { - pidMap = new Map(); - this.sharedMappings.set(channel.pid, pidMap); + if (mmapFlags & MAP_SHARED) { + if (mmapFlags & MAP_ANONYMOUS) { + this.mapSharedAnonymousMmap(channel, retVal >>> 0, origArgs); + } else if (mmapFd >= 0) { + if (!this.mapSharedMmapFromFile(channel, retVal >>> 0, origArgs)) { + this.populateMmapFromFile(channel, retVal >>> 0, origArgs); } - pidMap.set(retVal >>> 0, { - fd: mmapFd, - fileOffset: pageOffset * 4096, - len: origArgs[1] >>> 0, - }); } + } else if (mmapFd >= 0 && (mmapFlags & MAP_ANONYMOUS) === 0) { + this.populateMmapFromFile(channel, retVal >>> 0, origArgs); } // DRI bo mmap prime: the kernel's sys_mmap on /dev/dri/{render,card} // already called `host_gbm_bo_bind` to record metadata, but the @@ -2438,7 +2615,6 @@ export class CentralizedKernelWorker { this.kernel.bos.primeBindFromSab(channel.pid, boId, channel.memory); } } - // --- msync: flush MAP_SHARED regions back to file --- if (syscallNr === SYS_MSYNC && retVal === 0) { this.flushSharedMappings(channel, origArgs); @@ -2535,10 +2711,10 @@ export class CentralizedKernelWorker { * info to the process channel. 
The glue code (channel_syscall.c) reads * this after the syscall returns and invokes the handler. */ - private dequeueSignalForDelivery(channel: ChannelInfo): void { + private dequeueSignalForDelivery(channel: ChannelInfo): number { const dequeueSignal = this.kernelInstance!.exports .kernel_dequeue_signal as ((pid: number, outPtr: KernelPointer) => number) | undefined; - if (!dequeueSignal) return; + if (!dequeueSignal) return 0; // Use the signal area in kernel scratch as the output buffer const sigOutOffset = this.scratchOffset + CH_SIG_BASE; @@ -2553,10 +2729,12 @@ export class CentralizedKernelWorker { kernelMem.subarray(sigOutOffset, sigOutOffset + 44), channel.channelOffset + CH_SIG_BASE, ); + return sigResult; } else { // Clear entire signal delivery area in process channel (48 bytes) const sigStart = channel.channelOffset + CH_SIG_BASE; new Uint8Array(channel.memory.buffer, sigStart, 48).fill(0); + return 0; } } @@ -2646,6 +2824,9 @@ export class CentralizedKernelWorker { } } + this.synchronizeSharedMappingsForSyscallBoundary(channel, syscallNr); + this.synchronizeSysvShmMappingsForSyscallBoundary(channel); + // Clear handling flag (channel is done — poller can pick it up for next syscall) channel.handling = false; @@ -2795,6 +2976,9 @@ export class CentralizedKernelWorker { * Used for thread exit where we need to unblock the worker. 
*/ private completeChannelRaw(channel: ChannelInfo, retVal: number, errVal: number): void { + this.syncSharedMappingsFromProcess(channel, true); + this.refreshSharedMappingsToProcess(channel); + // Clear handling flag (channel is done — poller can pick it up for next syscall) channel.handling = false; @@ -3210,15 +3394,13 @@ export class CentralizedKernelWorker { for (const [, entry] of selectEntries) { if (!this.processes.has(entry.channel.pid)) continue; - // Cancel both setTimeout and setImmediate handles (one will be a no-op) - clearTimeout(entry.timer); - clearImmediate(entry.timer); + this.clearSelectRetryTimer(entry); // Re-dispatch to the right handler — SYS_SELECT and SYS_PSELECT6 have // different time-struct shapes (timeval vs timespec). if (entry.syscallNr === SYS_SELECT) { - this.handleSelect(entry.channel, entry.origArgs); + this.handleSelect(entry.channel, entry.origArgs, entry.deadline); } else { - this.handlePselect6(entry.channel, entry.origArgs); + this.handlePselect6(entry.channel, entry.origArgs, entry.deadline); } } @@ -3251,6 +3433,15 @@ export class CentralizedKernelWorker { } } + private clearSelectRetryTimer(entry: { timer: any; timerKind?: "timeout" | "immediate" | "none" }): void { + if (entry.timerKind === "none" || entry.timer == null) return; + if (entry.timerKind === "immediate") { + clearImmediate(entry.timer); + return; + } + clearTimeout(entry.timer); + } + /** * Remove a process's entries from pendingPipeReaders. * Called during process cleanup. @@ -3405,8 +3596,7 @@ export class CentralizedKernelWorker { // 3) Select/pselect retry timer. 
const selEntry = this.pendingSelectRetries.get(target.channelOffset); if (selEntry && selEntry.channel === target) { - clearTimeout(selEntry.timer); - clearImmediate(selEntry.timer); + this.clearSelectRetryTimer(selEntry); this.pendingSelectRetries.delete(target.channelOffset); this.completeChannelRaw(target, -EINTR_ERRNO, EINTR_ERRNO); this.relistenChannel(target); @@ -3517,6 +3707,12 @@ export class CentralizedKernelWorker { ): void { if (!this.processes.has(channel.pid)) return; + // EAGAIN-driven host waits park the process without normal syscall + // completion. Publish MAP_SHARED writes before parking so other processes + // observe standard shared-memory visibility while this thread blocks. + this.syncSharedMappingsFromProcess(channel, true); + this.refreshSharedMappingsToProcess(channel); + // Futex wait: use Atomics.waitAsync on the target address in process memory if (syscallNr === SYS_FUTEX) { const futexOp = origArgs[1] & 0x7f; // mask out FUTEX_PRIVATE_FLAG @@ -3907,6 +4103,13 @@ export class CentralizedKernelWorker { } if (delayMs > 0) { + // A host-delayed sleep parks the process without going through normal + // completeChannel(). Treat that park as a syscall boundary for + // MAP_SHARED: writes made before nanosleep/usleep must be visible to + // peer processes while this thread sleeps. + this.syncSharedMappingsFromProcess(channel, true); + this.refreshSharedMappingsToProcess(channel); + const timer = setTimeout(() => { this.pendingSleeps.delete(channel.pid); if (this.processes.has(channel.pid)) { @@ -4041,7 +4244,7 @@ export class CentralizedKernelWorker { * own code is `select(0, NULL, NULL, NULL, &tv)` (mysys/my_sleep.c) — the * pure-sleep case, fast-path'd to a setTimeout. 
*/ - private handleSelect(channel: ChannelInfo, origArgs: number[]): void { + private handleSelect(channel: ChannelInfo, origArgs: number[], existingDeadline?: number): void { const FD_SET_SIZE = 128; const nfds = origArgs[0]; const readPtr = origArgs[1]; @@ -4076,20 +4279,26 @@ export class CentralizedKernelWorker { this.completeChannel(channel, SYS_SELECT, origArgs, undefined, 0, 0); return; } - const finite = timeoutMs > 0; + const deadline = existingDeadline ?? (timeoutMs > 0 ? Date.now() + timeoutMs : -1); + if (deadline > 0 && Date.now() >= deadline) { + this.completeChannel(channel, SYS_SELECT, origArgs, undefined, 0, 0); + return; + } + const finite = deadline > 0; const timer = finite ? setTimeout(() => { this.pendingSelectRetries.delete(channel.channelOffset); if (this.processes.has(channel.pid)) { this.completeChannel(channel, SYS_SELECT, origArgs, undefined, 0, 0); } - }, timeoutMs) + }, Math.max(deadline - Date.now(), 1)) : (null as any); this.pendingSelectRetries.set(channel.channelOffset, { timer, + timerKind: finite ? "timeout" : "none", channel, origArgs, - deadline: finite ? Date.now() + timeoutMs : -1, + deadline, needsSignalSafeWake: false, syscallNr: SYS_SELECT, }); @@ -4169,7 +4378,11 @@ export class CentralizedKernelWorker { this.completeChannel(channel, SYS_SELECT, origArgs, undefined, 0, 0); return; } - const deadline = timeoutMs > 0 ? Date.now() + timeoutMs : -1; + const deadline = existingDeadline ?? (timeoutMs > 0 ? 
Date.now() + timeoutMs : -1); + if (deadline > 0 && Date.now() >= deadline) { + this.completeChannel(channel, SYS_SELECT, origArgs, undefined, 0, 0); + return; + } const retryFn = () => { this.pendingSelectRetries.delete(channel.channelOffset); if (!this.processes.has(channel.pid)) return; @@ -4177,13 +4390,13 @@ export class CentralizedKernelWorker { this.completeChannel(channel, SYS_SELECT, origArgs, undefined, 0, 0); return; } - this.handleSelect(channel, origArgs); + this.handleSelect(channel, origArgs, deadline); }; - const finite = timeoutMs > 0; + const finite = deadline > 0; const remainingMs = finite ? Math.max(deadline - Date.now(), 1) : 50; const timer = setTimeout(retryFn, Math.min(remainingMs, 50)); this.pendingSelectRetries.set(channel.channelOffset, { - timer, channel, origArgs, deadline, needsSignalSafeWake: false, + timer, timerKind: "timeout", channel, origArgs, deadline, needsSignalSafeWake: false, syscallNr: SYS_SELECT, }); return; @@ -4192,7 +4405,7 @@ export class CentralizedKernelWorker { this.completeChannel(channel, SYS_SELECT, origArgs, undefined, retVal, errVal); } - private handlePselect6(channel: ChannelInfo, origArgs: number[]): void { + private handlePselect6(channel: ChannelInfo, origArgs: number[], existingDeadline?: number): void { const FD_SET_SIZE = 128; const processMem = new Uint8Array(channel.memory.buffer); const kernelMem = this.getKernelMem(); @@ -4316,7 +4529,11 @@ export class CentralizedKernelWorker { return; } - const deadline = timeoutMs > 0 ? Date.now() + timeoutMs : -1; + const deadline = existingDeadline ?? (timeoutMs > 0 ? Date.now() + timeoutMs : -1); + if (deadline > 0 && Date.now() >= deadline) { + this.completeChannel(channel, SYS_PSELECT6, origArgs, undefined, 0, 0); + return; + } // pselect6 with a non-null sigmask pointer has the same late-signal // race as ppoll. See scheduleWakeBlockedRetriesDeferred. 
const needsSignalSafeWake = maskDataPtr !== 0; @@ -4326,27 +4543,30 @@ export class CentralizedKernelWorker { // With infinite timeout: block until signal (wakeAllBlockedRetries). if (nfds === 0) { if (timeoutMs > 0) { + const remainingMs = Math.max(deadline - Date.now(), 1); const timer = setTimeout(() => { this.pendingSelectRetries.delete(channel.channelOffset); if (this.processes.has(channel.pid)) { this.completeChannel(channel, SYS_PSELECT6, origArgs, undefined, 0, 0); } - }, timeoutMs); + }, remainingMs); this.pendingSelectRetries.set(channel.channelOffset, { - timer, channel, origArgs, deadline, needsSignalSafeWake, syscallNr: SYS_PSELECT6, + timer, timerKind: "timeout", channel, origArgs, deadline, needsSignalSafeWake, syscallNr: SYS_PSELECT6, }); } else { // Infinite timeout with nfds=0: wait for signal delivery. // No timer — wakeAllBlockedRetries will trigger the retry. this.pendingSelectRetries.set(channel.channelOffset, { - timer: null as any, channel, origArgs, deadline: -1, + timer: null as any, timerKind: "none", channel, origArgs, deadline: -1, needsSignalSafeWake, syscallNr: SYS_PSELECT6, }); } return; } - // For finite timeout with actual fds, track the deadline + // For finite timeout with actual fds, track the deadline. State changes + // wake this early through wakeAllBlockedRetries; otherwise the timer is + // the timeout/fallback retry. const retryFn = () => { this.pendingSelectRetries.delete(channel.channelOffset); if (!this.processes.has(channel.pid)) return; @@ -4354,11 +4574,14 @@ export class CentralizedKernelWorker { this.completeChannel(channel, SYS_PSELECT6, origArgs, undefined, 0, 0); return; } - this.handlePselect6(channel, origArgs); + this.handlePselect6(channel, origArgs, deadline); }; - const timer = setImmediate(retryFn); + const retryMs = deadline > 0 + ? 
Math.max(1, Math.min(deadline - Date.now(), 50)) + : 50; + const timer = setTimeout(retryFn, retryMs); this.pendingSelectRetries.set(channel.channelOffset, { - timer, channel, origArgs, deadline, needsSignalSafeWake, syscallNr: SYS_PSELECT6, + timer, timerKind: "timeout", channel, origArgs, deadline, needsSignalSafeWake, syscallNr: SYS_PSELECT6, }); return; } @@ -4601,8 +4824,25 @@ export class CentralizedKernelWorker { const retVal = Number(kernelView.getBigInt64(CH_RETURN, true)); const errVal = kernelView.getUint32(CH_ERRNO, true); - // Handle signal delivery - this.dequeueSignalForDelivery(channel); + // Handle signal delivery. This host-side epoll emulation calls the + // kernel's poll helper with timeout=0 and then decides whether to + // block/retry in TypeScript. POSIX still requires a caught signal to + // interrupt epoll_wait/epoll_pwait with EINTR so user code can run the + // handler before re-entering the wait. Without completing the channel + // here, a process with a queued handler signal can stay parked in the + // host retry loop indefinitely. 
+ const deliveredSignal = this.dequeueSignalForDelivery(channel); + const getExitStatus = this.kernelInstance!.exports + .kernel_get_process_exit_status as ((pid: number) => number) | undefined; + if (getExitStatus && getExitStatus(channel.pid) >= 128) { + this.handleProcessTerminated(channel); + return; + } + if (deliveredSignal > 0) { + this.completeChannelRaw(channel, -EINTR_ERRNO, EINTR_ERRNO); + this.relistenChannel(channel); + return; + } // If poll returned error (not EAGAIN), propagate it if (retVal < 0 && errVal !== EAGAIN) { @@ -5634,19 +5874,31 @@ export class CentralizedKernelWorker { } const parentPid = channel.pid; - // Skip pids that are already registered (e.g., pid 3 is nginx master) - while (this.processes.has(this.nextChildPid)) { - this.nextChildPid++; - } - const childPid = this.nextChildPid++; - - // Clone the Process in the kernel's ProcessTable + // Clone the Process in the kernel's ProcessTable. The JS host tracks live + // workers, but the kernel is the source of truth for zombies/limbo process + // records that still occupy a pid until POSIX wait semantics release them. + // If the host-side monotonic counter lands on such a pid, retry with the + // next candidate instead of surfacing EEXIST to fork() callers. 
const kernelForkProcess = this.kernelInstance!.exports.kernel_fork_process as (parentPid: number, childPid: number) => number; - const forkResult = kernelForkProcess(parentPid, childPid); - if (forkResult < 0) { + let childPid = 0; + let forkResult = 0; + for (let attempts = 0; attempts < 4096; attempts++) { + while (this.processes.has(this.nextChildPid)) { + this.nextChildPid++; + } + childPid = this.nextChildPid++; + forkResult = kernelForkProcess(parentPid, childPid); + if (forkResult === 0) break; + if (((-forkResult) >>> 0) !== EEXIST) break; + } + if (forkResult < 0 || childPid === 0) { // Fork failed in kernel (e.g., ESRCH, ENOMEM) - this.completeChannel(channel, SYS_FORK, _origArgs, undefined, -1, (-forkResult) >>> 0); + const errno = ((-forkResult) >>> 0) || EEXIST; + console.error( + `[kernel] kernel_fork_process failed parent=${parentPid} child=${childPid} errno=${errno}`, + ); + this.completeChannel(channel, SYS_FORK, _origArgs, undefined, -1, errno); return; } @@ -5699,6 +5951,13 @@ export class CentralizedKernelWorker { } // Call the async fork handler to spawn child Worker + // A process may be the only current observer of a large shared-memory + // backing before fork (OPcache's arena is the common case). Normal syscall + // boundaries can skip single-observer publishes, but fork creates another + // observer and SysV segments may later be observed by new shmat callers. + this.syncSharedMappingsFromProcess(channel, true, { force: true }); + this.syncSysvShmMappingsFromProcess(channel, { force: true }); + this.callbacks.onFork(parentPid, childPid, channel.memory, threadFork).then((childChannelOffsets) => { if (!this.processes.has(parentPid)) return; @@ -5913,7 +6172,7 @@ export class CentralizedKernelWorker { /** * Read a null-terminated string from process memory at the given pointer. 
*/ - private readCStringFromProcess(mem: Uint8Array, ptr: number, maxLen = 4096): string { + private readCStringFromProcess(mem: Uint8Array, ptr: number, maxLen = 1024 * 1024): string { if (ptr === 0) return ""; let len = 0; while (ptr + len < mem.length && mem[ptr + len] !== 0 && len < maxLen) { @@ -6220,13 +6479,19 @@ export class CentralizedKernelWorker { // (it loops on SYS_EXIT). if (tid > 0) { const ctidKey = `${channel.pid}:${tid}`; - const ctidPtr = this.threadCtidPtrs.get(ctidKey); - if (ctidPtr && ctidPtr !== 0) { - this.threadCtidPtrs.delete(ctidKey); + const ctidPtr = this.threadCtidPtrs.get(ctidKey) ?? 0; + this.threadCtidPtrs.delete(ctidKey); + if (ctidPtr !== 0) { const procView = new DataView(channel.memory.buffer); + const before = procView.getInt32(ctidPtr, true); procView.setInt32(ctidPtr, 0, true); const i32View = new Int32Array(channel.memory.buffer); - Atomics.notify(i32View, ctidPtr >>> 2, 1); + const woken = Atomics.notify(i32View, ctidPtr >>> 2, 1); + if (THREAD_TRACE) { + console.error(`[thread] exit pid=${channel.pid} tid=${tid} clear ctid=0x${ctidPtr.toString(16)} before=${before} woken=${woken}`); + } + } else if (THREAD_TRACE) { + console.error(`[thread] exit pid=${channel.pid} tid=${tid} missing ctid`); } } @@ -6248,19 +6513,31 @@ export class CentralizedKernelWorker { // Run the kernel's exit path so it closes all FDs (including pipe // write ends). kernel_exit calls sys_exit then traps — catch the trap. 
{ - const kernelView = new DataView(this.kernelMemory!.buffer, this.scratchOffset); - kernelView.setUint32(CH_SYSCALL, syscallNr, true); - kernelView.setBigInt64(CH_ARGS, BigInt(exitStatus), true); - const handleChannel = this.kernelInstance!.exports.kernel_handle_channel as - (offset: KernelPointer, pid: number) => number; - this.currentHandlePid = channel.pid; - this.bindKernelTidForChannel(channel); - try { - handleChannel(this.toKernelPtr(this.scratchOffset), channel.pid); - } catch { - // Expected: kernel_exit traps with unreachable after closing FDs - } finally { - this.currentHandlePid = 0; + if (EXIT_TRACE) console.error(`[exit] pid=${channel.pid} status=${exitStatus} mark start`); + const markExited = this.kernelInstance!.exports.kernel_mark_process_exited as + ((pid: number, status: number) => number) | undefined; + if (markExited) { + const rc = markExited(channel.pid, exitStatus); + if (rc < 0) { + console.error(`[handleExit] kernel_mark_process_exited failed for pid=${channel.pid}: errno=${-rc}`); + } + if (EXIT_TRACE) console.error(`[exit] pid=${channel.pid} mark done rc=${rc}`); + } else { + const kernelView = new DataView(this.kernelMemory!.buffer, this.scratchOffset); + kernelView.setUint32(CH_SYSCALL, syscallNr, true); + kernelView.setBigInt64(CH_ARGS, BigInt(exitStatus), true); + const handleChannel = this.kernelInstance!.exports.kernel_handle_channel as + (offset: KernelPointer, pid: number) => number; + this.currentHandlePid = channel.pid; + this.bindKernelTidForChannel(channel); + try { + handleChannel(this.toKernelPtr(this.scratchOffset), channel.pid); + } catch { + // Compatibility with older kernels where kernel_exit traps after cleanup. + } finally { + this.currentHandlePid = 0; + } + if (EXIT_TRACE) console.error(`[exit] pid=${channel.pid} legacy mark done`); } } @@ -6280,6 +6557,10 @@ export class CentralizedKernelWorker { return; } this.hostReaped.add(exitingPid); + // Publish process-owned shared-memory writes before waking waiters. 
A + // parent returning from waitpid must see a child's MAP_SHARED updates. + this.releaseAllSharedMappingsForProcess(exitingPid); + this.releaseAllSysvShmMappingsForProcess(exitingPid); this.notifyParentOfExitedProcess(exitingPid); // Complete the channel so the worker unblocks from Atomics.wait(). @@ -6314,7 +6595,8 @@ export class CentralizedKernelWorker { this.notifyParentOfExitedProcess(exitingPid); // Clean up per-process state - this.sharedMappings.delete(exitingPid); + this.releaseAllSharedMappingsForProcess(exitingPid); + this.releaseAllSysvShmMappingsForProcess(exitingPid); // Do NOT complete the channel — the worker is blocked on Atomics.wait // and waking it would cause the C code to continue executing. @@ -6885,14 +7167,13 @@ export class CentralizedKernelWorker { // 3. Pending select/pselect6 retries for (const [key, selectEntry] of this.pendingSelectRetries) { if (selectEntry.channel.pid !== targetPid) continue; - clearTimeout(selectEntry.timer); - clearImmediate(selectEntry.timer); + this.clearSelectRetryTimer(selectEntry); this.pendingSelectRetries.delete(key); if (!this.processes.has(targetPid)) continue; if (selectEntry.syscallNr === SYS_SELECT) { - this.handleSelect(selectEntry.channel, selectEntry.origArgs); + this.handleSelect(selectEntry.channel, selectEntry.origArgs, selectEntry.deadline); } else { - this.handlePselect6(selectEntry.channel, selectEntry.origArgs); + this.handlePselect6(selectEntry.channel, selectEntry.origArgs, selectEntry.deadline); } } } @@ -7028,6 +7309,467 @@ export class CentralizedKernelWorker { } } + private mapSharedMmapFromFile( + channel: ChannelInfo, + mmapAddr: number, + origArgs: number[], + ): boolean { + const fd = origArgs[4]; + const mapLen = origArgs[1] >>> 0; + const pageOffset = origArgs[5] >>> 0; + const fileOffset = pageOffset * FILE_PAGE_SIZE; + const writable = (origArgs[2] & PROT_WRITE) !== 0; + if (mapLen === 0) return true; + + const path = this.getFdPathForSharedMapping(channel, fd); + const stat = 
this.getFdStatForSharedMapping(channel, fd); + if (!path || !stat) return false; + + const key = stat.key || `path:${path}`; + const backing = this.getOrCreateSharedMmapBacking(key, path, writable); + if (!backing) return false; + + try { + this.ensureBackingRangeLoaded(backing, fileOffset, mapLen); + } catch { + return false; + } + + const processMem = new Uint8Array(channel.memory.buffer); + if (mmapAddr + mapLen > processMem.length) return false; + + const initial = this.readBackingRange(backing, fileOffset, mapLen); + processMem.set(initial, mmapAddr); + + let pidMap = this.sharedMappings.get(channel.pid); + if (!pidMap) { + pidMap = new Map(); + this.sharedMappings.set(channel.pid, pidMap); + } + backing.refCount++; + pidMap.set(mmapAddr, { + fd, + fileOffset, + len: mapLen, + writable, + backingKey: key, + snapshot: initial.slice(), + version: backing.version, + }); + return true; + } + + private mapSharedAnonymousMmap( + channel: ChannelInfo, + mmapAddr: number, + origArgs: number[], + ): boolean { + const mapLen = origArgs[1] >>> 0; + const writable = (origArgs[2] & PROT_WRITE) !== 0; + if (mapLen === 0) return true; + + const processMem = new Uint8Array(channel.memory.buffer); + if (mmapAddr + mapLen > processMem.length) return false; + + const key = `anon:${channel.pid}:${mmapAddr}:${this.nextAnonymousMmapBackingId++}`; + const backing: SharedMmapBacking = { + key, + path: "", + handle: -1, + anonymous: true, + writable, + pages: new Map(), + dirtyPages: new Set(), + refCount: 0, + version: 0, + }; + this.sharedMmapBackings.set(key, backing); + + const initial = processMem.slice(mmapAddr, mmapAddr + mapLen); + this.copyRangeToBacking(backing, 0, initial, false); + + let pidMap = this.sharedMappings.get(channel.pid); + if (!pidMap) { + pidMap = new Map(); + this.sharedMappings.set(channel.pid, pidMap); + } + backing.refCount++; + pidMap.set(mmapAddr, { + fd: -1, + fileOffset: 0, + len: mapLen, + writable, + backingKey: key, + snapshot: initial, + version: 
backing.version, + }); + return true; + } + + private getFdStatForSharedMapping(channel: ChannelInfo, fd: number): SharedMmapFdStat | null { + const handleChannel = this.kernelInstance!.exports.kernel_handle_channel as + (offset: KernelPointer, pid: number) => number; + const kernelView = new DataView(this.kernelMemory!.buffer, this.scratchOffset); + const statPtr = this.scratchOffset + CH_DATA; + + kernelView.setUint32(CH_SYSCALL, SYS_FSTAT, true); + kernelView.setBigInt64(CH_ARGS + 0 * CH_ARG_SIZE, BigInt(fd), true); + kernelView.setBigInt64(CH_ARGS + 1 * CH_ARG_SIZE, BigInt(statPtr), true); + for (let i = 2; i < CH_ARGS_COUNT; i++) { + kernelView.setBigInt64(CH_ARGS + i * CH_ARG_SIZE, BigInt(0), true); + } + + this.currentHandlePid = channel.pid; + this.bindKernelTidForChannel(channel); + try { + handleChannel(this.toKernelPtr(this.scratchOffset), channel.pid); + } catch { + return null; + } finally { + this.currentHandlePid = 0; + } + + const retVal = Number(kernelView.getBigInt64(CH_RETURN, true)); + const errVal = kernelView.getUint32(CH_ERRNO, true); + if (retVal !== 0 || errVal !== 0) return null; + + const statView = new DataView(this.kernelMemory!.buffer, statPtr); + const dev = statView.getBigUint64(0, true); + const ino = statView.getBigUint64(8, true); + const size64 = statView.getBigUint64(32, true); + const size = size64 > BigInt(Number.MAX_SAFE_INTEGER) + ? Number.MAX_SAFE_INTEGER + : Number(size64); + const key = dev !== 0n || ino !== 0n ? 
`${dev.toString()}:${ino.toString()}` : ""; + return { key, size }; + } + + private getFdPathForSharedMapping(channel: ChannelInfo, fd: number): string | null { + const getFdPath = this.kernelInstance!.exports.kernel_get_fd_path as + ((pid: number, fd: number, bufPtr: KernelPointer, bufLen: number) => number) | undefined; + if (!getFdPath) return null; + + const bufPtr = this.scratchOffset + CH_DATA; + const maxLen = Math.min(4096, CH_DATA_SIZE); + const result = getFdPath(channel.pid, fd, this.toKernelPtr(bufPtr), maxLen); + if (result <= 0) return null; + + const kernelBuf = new Uint8Array(this.kernelMemory!.buffer); + return new TextDecoder().decode(kernelBuf.slice(bufPtr, bufPtr + result)); + } + + private getOrCreateSharedMmapBacking( + key: string, + path: string, + writable: boolean, + ): SharedMmapBacking | null { + const existing = this.sharedMmapBackings.get(key); + if (existing) { + if (writable && !existing.writable) { + const upgraded = this.openSharedMmapBackingHandle(path, true); + if (upgraded === null) return null; + try { + this.io.close(existing.handle); + } catch { + // Keep going: the replacement handle is already open. + } + existing.handle = upgraded; + existing.writable = true; + } + return existing; + } + + const handle = this.openSharedMmapBackingHandle(path, writable); + if (handle === null) return null; + const backing: SharedMmapBacking = { + key, + path, + handle, + anonymous: false, + writable, + pages: new Map(), + dirtyPages: new Set(), + refCount: 0, + version: 0, + }; + this.sharedMmapBackings.set(key, backing); + this.invalidateSharedMmapFdCache(); + return backing; + } + + private openSharedMmapBackingHandle(path: string, writable: boolean): number | null { + try { + return this.io.open(path, writable ? 
O_RDWR : O_RDONLY, 0); + } catch { + return null; + } + } + + private ensureBackingRangeLoaded(backing: SharedMmapBacking, offset: number, len: number): void { + if (len <= 0) return; + const firstPage = Math.floor(offset / FILE_PAGE_SIZE); + const lastPage = Math.floor((offset + len - 1) / FILE_PAGE_SIZE); + for (let page = firstPage; page <= lastPage; page++) { + this.ensureBackingPageLoaded(backing, page); + } + } + + private ensureBackingPageLoaded(backing: SharedMmapBacking, page: number): Uint8Array { + const existing = backing.pages.get(page); + if (existing) return existing; + const data = this.readBackingPageFromFile(backing, page); + backing.pages.set(page, data); + return data; + } + + private readBackingPageFromFile(backing: SharedMmapBacking, page: number): Uint8Array { + const data = new Uint8Array(FILE_PAGE_SIZE); + if (backing.handle < 0) { + return data; + } + try { + const bytesRead = this.io.read( + backing.handle, + data, + page * FILE_PAGE_SIZE, + FILE_PAGE_SIZE, + ); + if (bytesRead > 0 && bytesRead < FILE_PAGE_SIZE) { + data.fill(0, bytesRead); + } + } catch { + // Sparse EOF or a transient host read error leaves the page zero-filled. 
+ } + return data; + } + + private readBackingRange(backing: SharedMmapBacking, offset: number, len: number): Uint8Array { + const out = new Uint8Array(len); + let copied = 0; + while (copied < len) { + const absolute = offset + copied; + const page = Math.floor(absolute / FILE_PAGE_SIZE); + const pageOffset = absolute % FILE_PAGE_SIZE; + const n = Math.min(FILE_PAGE_SIZE - pageOffset, len - copied); + const pageData = this.ensureBackingPageLoaded(backing, page); + out.set(pageData.subarray(pageOffset, pageOffset + n), copied); + copied += n; + } + return out; + } + + private copyRangeToBacking( + backing: SharedMmapBacking, + offset: number, + bytes: Uint8Array, + markDirty: boolean, + ): void { + let copied = 0; + while (copied < bytes.length) { + const absolute = offset + copied; + const page = Math.floor(absolute / FILE_PAGE_SIZE); + const pageOffset = absolute % FILE_PAGE_SIZE; + const n = Math.min(FILE_PAGE_SIZE - pageOffset, bytes.length - copied); + const pageData = this.ensureBackingPageLoaded(backing, page); + pageData.set(bytes.subarray(copied, copied + n), pageOffset); + if (markDirty) { + backing.dirtyPages.add(page); + } else { + backing.dirtyPages.delete(page); + } + copied += n; + } + } + + private rangeDiffersFromSnapshot( + processMem: Uint8Array, + memOffset: number, + snapshot: Uint8Array, + snapshotOffset: number, + len: number, + ): boolean { + const BufferCtor = (globalThis as { Buffer?: typeof Buffer }).Buffer; + if (BufferCtor?.compare && BufferCtor?.from) { + try { + const processView = BufferCtor.from( + processMem.buffer, + processMem.byteOffset + memOffset, + len, + ); + const snapshotView = BufferCtor.from( + snapshot.buffer, + snapshot.byteOffset + snapshotOffset, + len, + ); + return BufferCtor.compare(processView, snapshotView) !== 0; + } catch { + // Browser builds do not provide Buffer. Fall through to typed arrays. 
+ } + } + + const processByteOffset = processMem.byteOffset + memOffset; + const snapshotByteOffset = snapshot.byteOffset + snapshotOffset; + if (((processByteOffset | snapshotByteOffset | len) & 3) === 0) { + const processWords = new Uint32Array(processMem.buffer, processByteOffset, len / 4); + const snapshotWords = new Uint32Array(snapshot.buffer, snapshotByteOffset, len / 4); + for (let i = 0; i < processWords.length; i++) { + if (processWords[i] !== snapshotWords[i]) { + return true; + } + } + return false; + } + + for (let i = 0; i < len; i++) { + if (processMem[memOffset + i] !== snapshot[snapshotOffset + i]) { + return true; + } + } + return false; + } + + private synchronizeSharedMappingsForSyscallBoundary(channel: ChannelInfo, syscallNr: number): void { + const includeAnonymous = this.syscallSynchronizesAnonymousSharedMemory(syscallNr); + this.syncSharedMappingsFromProcess(channel, includeAnonymous); + this.refreshSharedMappingsToProcess(channel, includeAnonymous); + } + + private syscallSynchronizesAnonymousSharedMemory(syscallNr: number): boolean { + // Anonymous MAP_SHARED mappings are ordinary shared memory: writes made by + // a process before it enters the kernel must be visible to peers that + // subsequently enter the kernel. Centralized Kandelo processes use + // separate Wasm memories, so every syscall boundary for the *current* + // process is our coherence point. This intentionally does not scrape + // other live processes from a peer's syscall; doing so can publish + // mid-update shared-memory state that the writer has not synchronized. 
+ void syscallNr; + return true; + } + + private syncSharedMappingsFromProcess( + channel: ChannelInfo, + includeAnonymous = true, + options: { force?: boolean } = {}, + ): void { + const pidMap = this.sharedMappings.get(channel.pid); + if (!pidMap || pidMap.size === 0) return; + const processMem = new Uint8Array(channel.memory.buffer); + + for (const [mapAddr, mapping] of pidMap) { + if (!mapping.writable) continue; + const backing = this.sharedMmapBackings.get(mapping.backingKey); + if (!backing) continue; + if (backing.anonymous && !includeAnonymous) continue; + if ( + !options.force + && backing.anonymous + && backing.refCount <= 1 + && mapping.version === backing.version + ) { + continue; + } + if (mapAddr + mapping.len > processMem.length) continue; + + let changed = false; + for (let offset = 0; offset < mapping.len; offset += FILE_PAGE_SIZE) { + const n = Math.min(FILE_PAGE_SIZE, mapping.len - offset); + if (!this.rangeDiffersFromSnapshot( + processMem, + mapAddr + offset, + mapping.snapshot, + offset, + n, + )) { + continue; + } + // Copy only bytes this process actually changed relative to its last + // shared-memory snapshot. MAP_SHARED mappings can be modified by + // multiple processes between syscall boundaries. Copying an entire + // host page from one process when it changed only a small field would + // overwrite disjoint writes already published by another process with + // this process's stale view of that page. 
+ if (this.copyChangedSharedMappingRanges( + backing, + processMem, + mapAddr + offset, + mapping.snapshot, + offset, + mapping.fileOffset + offset, + n, + )) { + changed = true; + } + } + + if (changed) { + backing.version++; + mapping.version = backing.version; + } + } + } + + private copyChangedSharedMappingRanges( + backing: SharedMmapBacking, + processMem: Uint8Array, + processOffset: number, + snapshot: Uint8Array, + snapshotOffset: number, + backingOffset: number, + len: number, + ): boolean { + let changed = false; + let i = 0; + while (i < len) { + while ( + i < len + && processMem[processOffset + i] === snapshot[snapshotOffset + i] + ) { + i++; + } + if (i >= len) break; + + const runStart = i; + do { + i++; + } while ( + i < len + && processMem[processOffset + i] !== snapshot[snapshotOffset + i] + ); + + const bytes = processMem.subarray( + processOffset + runStart, + processOffset + i, + ); + this.copyRangeToBacking( + backing, + backingOffset + runStart, + bytes, + true, + ); + snapshot.set(bytes, snapshotOffset + runStart); + changed = true; + } + return changed; + } + + private refreshSharedMappingsToProcess(channel: ChannelInfo, includeAnonymous = true): void { + const pidMap = this.sharedMappings.get(channel.pid); + if (!pidMap || pidMap.size === 0) return; + const processMem = new Uint8Array(channel.memory.buffer); + + for (const [mapAddr, mapping] of pidMap) { + const backing = this.sharedMmapBackings.get(mapping.backingKey); + if (!backing || mapping.version === backing.version) continue; + if (backing.anonymous && !includeAnonymous) continue; + if (mapAddr + mapping.len > processMem.length) continue; + + const latest = this.readBackingRange(backing, mapping.fileOffset, mapping.len); + processMem.set(latest, mapAddr); + mapping.snapshot = latest.slice(); + mapping.version = backing.version; + } + } + /** * Populate a file-backed mmap region by reading from the file fd via pread. 
* Called after the kernel allocates the anonymous region and the host zeroes it. @@ -7091,13 +7833,9 @@ export class CentralizedKernelWorker { } /** - * Flush MAP_SHARED regions that overlap the msync range back to the file. - * Reads from process memory and writes to the file via pwrite. + * Flush MAP_SHARED regions that overlap the msync/munmap range. */ - private flushSharedMappings( - channel: ChannelInfo, - origArgs: number[], - ): void { + private flushSharedMappings(channel: ChannelInfo, origArgs: number[]): void { const syncAddr = origArgs[0] >>> 0; const syncLen = origArgs[1] >>> 0; const pidMap = this.sharedMappings.get(channel.pid); @@ -7106,6 +7844,7 @@ export class CentralizedKernelWorker { const syncEnd = syncAddr + syncLen; for (const [mapAddr, mapping] of pidMap) { + if (!mapping.writable) continue; const mapEnd = mapAddr + mapping.len; // Check overlap if (mapAddr >= syncEnd || mapEnd <= syncAddr) continue; @@ -7116,69 +7855,275 @@ export class CentralizedKernelWorker { const flushLen = flushEnd - flushStart; if (flushLen <= 0) continue; - // File offset for the flush region const fileOffsetBase = mapping.fileOffset + (flushStart - mapAddr); + const backing = this.sharedMmapBackings.get(mapping.backingKey); + if (backing) { + this.flushBackingRange(backing, fileOffsetBase, flushLen); + } + } + } + + private flushBackingRange(backing: SharedMmapBacking, offset: number, len: number): boolean { + if (len <= 0 || backing.dirtyPages.size === 0) return true; + if (backing.handle < 0) { + backing.dirtyPages.clear(); + return true; + } + const end = offset + len; + let ok = true; + + for (const page of Array.from(backing.dirtyPages).sort((a, b) => a - b)) { + const pageStart = page * FILE_PAGE_SIZE; + const pageEnd = pageStart + FILE_PAGE_SIZE; + if (pageStart >= end || pageEnd <= offset) continue; - // Read from process memory and write to file via pwrite - this.pwriteFromProcessMemory( - channel, mapping.fd, flushStart, flushLen, fileOffsetBase, + const 
writeStart = Math.max(offset, pageStart); + const writeEnd = Math.min(end, pageEnd); + const pageData = this.ensureBackingPageLoaded(backing, page); + const source = pageData.subarray(writeStart - pageStart, writeEnd - pageStart); + if (!this.writeAllToBackingHandle(backing, source, writeStart)) { + ok = false; + continue; + } + if (writeStart <= pageStart && writeEnd >= pageEnd) { + backing.dirtyPages.delete(page); + } + } + return ok; + } + + private writeAllToBackingHandle( + backing: SharedMmapBacking, + source: Uint8Array, + fileOffset: number, + ): boolean { + let written = 0; + while (written < source.length) { + try { + const n = this.io.write( + backing.handle, + source.subarray(written), + fileOffset + written, + source.length - written, + ); + if (n <= 0) return false; + written += n; + } catch { + return false; + } + } + return true; + } + + private flushSharedMappingsBeforeFileSyscall( + channel: ChannelInfo, + syscallNr: number, + origArgs: number[], + ): void { + if (syscallNr === SYS_SENDFILE) { + this.flushSharedBackingForFd(channel, origArgs[0]); + this.flushSharedBackingForFd(channel, origArgs[1]); + return; + } + if (!this.syscallTouchesFdStorageBeforeKernel(syscallNr)) return; + this.flushSharedBackingForFd(channel, origArgs[0]); + } + + private flushSharedBackingForFd(channel: ChannelInfo, fd: number): void { + if (fd < 0) return; + const backing = this.findSharedBackingForFd(channel, fd); + if (backing && backing.dirtyPages.size > 0) { + this.flushBackingRange(backing, 0, Number.MAX_SAFE_INTEGER); + } + } + + private syscallTouchesFdStorageBeforeKernel(syscallNr: number): boolean { + return syscallNr === SYS_READ + || syscallNr === SYS_PREAD + || syscallNr === SYS_READV + || syscallNr === SYS_PREADV + || syscallNr === SYS_WRITE + || syscallNr === SYS_PWRITE + || syscallNr === SYS_WRITEV + || syscallNr === SYS_PWRITEV + || syscallNr === SYS_FSYNC + || syscallNr === SYS_FDATASYNC + || syscallNr === SYS_FTRUNCATE + || syscallNr === SYS_CLOSE; + 
} + + private handleSharedMappingsAfterFileSyscall( + channel: ChannelInfo, + syscallNr: number, + origArgs: number[], + retVal: number, + errVal: number, + ): void { + if (errVal !== 0) return; + if (syscallNr === SYS_CLOSE && retVal === 0) { + this.invalidateSharedMmapFdCache(channel.pid, origArgs[0]); + return; + } + if (syscallNr === SYS_DUP && retVal >= 0) { + this.invalidateSharedMmapFdCache(channel.pid, retVal); + return; + } + if ((syscallNr === SYS_DUP2 || syscallNr === SYS_DUP3) && retVal >= 0) { + this.invalidateSharedMmapFdCache(channel.pid, origArgs[1]); + return; + } + if (syscallNr === SYS_FCNTL && retVal >= 0) { + const cmd = origArgs[1] >>> 0; + if (cmd === F_DUPFD || cmd === F_DUPFD_CLOEXEC || cmd === F_DUPFD_CLOFORK) { + this.invalidateSharedMmapFdCache(channel.pid, retVal); + return; + } + } + if (syscallNr === SYS_PWRITE && retVal > 0) { + this.updateSharedBackingFromProcessBuffer( + channel, + origArgs[0], + origArgs[1] >>> 0, + retVal, + origArgs[3], ); + return; + } + if (syscallNr === SYS_WRITE && retVal > 0) { + this.reloadSharedBackingForFd(channel, origArgs[0]); + return; + } + if ((syscallNr === SYS_WRITEV || syscallNr === SYS_PWRITEV) && retVal > 0) { + this.reloadSharedBackingForFd(channel, origArgs[0]); + return; + } + if (syscallNr === SYS_SENDFILE && retVal > 0) { + this.reloadSharedBackingForFd(channel, origArgs[0]); + return; + } + if (syscallNr === SYS_FTRUNCATE && retVal === 0) { + this.reloadSharedBackingForFd(channel, origArgs[0]); } } - /** - * Write data from process memory to a file via kernel pwrite syscalls. 
- */ - private pwriteFromProcessMemory( + private syncSharedMappingsAfterDirectFileSyscall( + channel: ChannelInfo, + syscallNr: number, + origArgs: number[], + retVal: number, + errVal: number, + ): void { + this.handleSharedMappingsAfterFileSyscall(channel, syscallNr, origArgs, retVal, errVal); + this.syncSharedMappingsFromProcess(channel, false); + this.refreshSharedMappingsToProcess(channel, false); + } + + private updateSharedBackingFromProcessBuffer( channel: ChannelInfo, fd: number, - processAddr: number, + ptr: number, len: number, fileOffset: number, ): void { - const handleChannel = this.kernelInstance!.exports.kernel_handle_channel as - (offset: KernelPointer, pid: number) => number; - const kernelView = new DataView(this.kernelMemory!.buffer, this.scratchOffset); - const kernelMem = new Uint8Array(this.kernelMemory!.buffer); - const dataStart = this.scratchOffset + CH_DATA; + if (len <= 0) return; + const backing = this.findSharedBackingForFd(channel, fd); + if (!backing) return; + const processMem = new Uint8Array(channel.memory.buffer); + if (ptr + len > processMem.length) { + this.reloadSharedBackingRange(backing, fileOffset, len); + return; + } + this.copyRangeToBacking( + backing, + fileOffset, + processMem.subarray(ptr, ptr + len), + false, + ); + backing.version++; + } - let written = 0; - while (written < len) { - const chunkSize = Math.min(CH_DATA_SIZE, len - written); + private reloadSharedBackingForFd(channel: ChannelInfo, fd: number): void { + const backing = this.findSharedBackingForFd(channel, fd); + if (!backing) return; + const loadedPages = Array.from(backing.pages.keys()); + if (loadedPages.length === 0) return; + for (const page of loadedPages) { + backing.pages.set(page, this.readBackingPageFromFile(backing, page)); + backing.dirtyPages.delete(page); + } + backing.version++; + } - // Copy chunk from process memory to kernel scratch data area - const processMem = new Uint8Array(channel.memory.buffer); - kernelMem.set( - 
processMem.subarray(processAddr + written, processAddr + written + chunkSize), - dataStart, - ); + private reloadSharedBackingRange(backing: SharedMmapBacking, offset: number, len: number): void { + if (len <= 0) return; + const firstPage = Math.floor(offset / FILE_PAGE_SIZE); + const lastPage = Math.floor((offset + len - 1) / FILE_PAGE_SIZE); + let reloaded = false; + for (let page = firstPage; page <= lastPage; page++) { + if (!backing.pages.has(page)) continue; + backing.pages.set(page, this.readBackingPageFromFile(backing, page)); + backing.dirtyPages.delete(page); + reloaded = true; + } + if (reloaded) backing.version++; + } - // Set up pwrite syscall in kernel scratch: - // SYS_PWRITE (65): (fd, buf_ptr, count, offset_lo, offset_hi) - const curOffset = fileOffset + written; - kernelView.setUint32(CH_SYSCALL, SYS_PWRITE, true); - kernelView.setBigInt64(CH_ARGS + 0 * CH_ARG_SIZE, BigInt(fd), true); - kernelView.setBigInt64(CH_ARGS + 1 * CH_ARG_SIZE, BigInt(dataStart), true); - kernelView.setBigInt64(CH_ARGS + 2 * CH_ARG_SIZE, BigInt(chunkSize), true); - kernelView.setBigInt64(CH_ARGS + 3 * CH_ARG_SIZE, BigInt(curOffset & 0xffffffff), true); - kernelView.setBigInt64(CH_ARGS + 4 * CH_ARG_SIZE, BigInt(Math.floor(curOffset / 0x100000000) | 0), true); - kernelView.setBigInt64(CH_ARGS + 5 * CH_ARG_SIZE, BigInt(0), true); + private findSharedBackingForFd(channel: ChannelInfo, fd: number): SharedMmapBacking | null { + if (this.sharedMmapBackings.size === 0) return null; + const cacheKey = this.sharedMmapFdCacheKey(channel.pid, fd); + const cached = this.sharedMmapFdCache.get(cacheKey); + if (cached) { + return cached.backingKey === null + ? null + : this.sharedMmapBackings.get(cached.backingKey) ?? 
null; + } - this.currentHandlePid = channel.pid; - this.bindKernelTidForChannel(channel); - try { - handleChannel(this.toKernelPtr(this.scratchOffset), channel.pid); - } catch { - break; + const stat = this.getFdStatForSharedMapping(channel, fd); + if (stat?.key) { + const backing = this.sharedMmapBackings.get(stat.key); + if (backing) { + this.sharedMmapFdCache.set(cacheKey, { backingKey: backing.key }); + return backing; } - this.currentHandlePid = 0; + } + const path = this.getFdPathForSharedMapping(channel, fd); + if (!path) { + this.sharedMmapFdCache.set(cacheKey, { backingKey: null }); + return null; + } + const backing = this.sharedMmapBackings.get(`path:${path}`) ?? null; + this.sharedMmapFdCache.set(cacheKey, { backingKey: backing?.key ?? null }); + return backing; + } - const bytesWritten = Number(kernelView.getBigInt64(CH_RETURN, true)); - if (bytesWritten <= 0) break; + private sharedMmapFdCacheKey(pid: number, fd: number): string { + return `${pid}:${fd}`; + } - written += bytesWritten; - if (bytesWritten < chunkSize) break; + private invalidateSharedMmapFdCache(pid?: number, fd?: number): void { + if (pid === undefined || fd === undefined) { + this.sharedMmapFdCache.clear(); + return; } + this.sharedMmapFdCache.delete(this.sharedMmapFdCacheKey(pid, fd)); + } + + private releaseSharedMapping(mapping: SharedMmapMapping): void { + const backing = this.sharedMmapBackings.get(mapping.backingKey); + if (!backing) return; + backing.refCount = Math.max(0, backing.refCount - 1); + if (backing.refCount > 0) return; + + this.flushBackingRange(backing, 0, Number.MAX_SAFE_INTEGER); + if (backing.handle >= 0) { + try { + this.io.close(backing.handle); + } catch { + // The kernel should not fail teardown because a host close raced. 
+ } + } + this.sharedMmapBackings.delete(backing.key); + this.invalidateSharedMmapFdCache(); } /** @@ -7188,13 +8133,66 @@ export class CentralizedKernelWorker { const pidMap = this.sharedMappings.get(pid); if (!pidMap) return; - const unmapEnd = addr + len; - for (const [mapAddr, mapping] of pidMap) { + const alignedLen = Math.ceil(len / WASM_PAGE_SIZE) * WASM_PAGE_SIZE; + if (alignedLen <= 0) return; + const unmapEnd = addr + alignedLen; + + for (const [mapAddr, mapping] of Array.from(pidMap.entries())) { const mapEnd = mapAddr + mapping.len; - // Remove if fully contained in unmap range - if (mapAddr >= addr && mapEnd <= unmapEnd) { + + const overlapStart = Math.max(addr, mapAddr); + const overlapEnd = Math.min(unmapEnd, mapEnd); + if (overlapStart >= overlapEnd) continue; + + if (overlapStart <= mapAddr && overlapEnd >= mapEnd) { + this.releaseSharedMapping(mapping); + pidMap.delete(mapAddr); + continue; + } + + if (overlapStart <= mapAddr) { + const trim = overlapEnd - mapAddr; + const newAddr = overlapEnd; + const newLen = mapEnd - overlapEnd; pidMap.delete(mapAddr); + if (newLen > 0) { + pidMap.set(newAddr, { + ...mapping, + fileOffset: mapping.fileOffset + trim, + len: newLen, + snapshot: mapping.snapshot.slice(trim), + }); + } else { + this.releaseSharedMapping(mapping); + } + continue; + } + + if (overlapEnd >= mapEnd) { + const newLen = overlapStart - mapAddr; + mapping.len = newLen; + mapping.snapshot = mapping.snapshot.slice(0, newLen); + continue; + } + + const leftLen = overlapStart - mapAddr; + const rightSkip = overlapEnd - mapAddr; + const rightLen = mapEnd - overlapEnd; + const rightAddr = overlapEnd; + const rightMapping: SharedMmapMapping = { + ...mapping, + fileOffset: mapping.fileOffset + rightSkip, + len: rightLen, + snapshot: mapping.snapshot.slice(rightSkip), + }; + mapping.len = leftLen; + mapping.snapshot = mapping.snapshot.slice(0, leftLen); + + const backing = this.sharedMmapBackings.get(mapping.backingKey); + if (backing) { + 
backing.refCount++; } + pidMap.set(rightAddr, rightMapping); } if (pidMap.size === 0) { @@ -7202,6 +8200,258 @@ export class CentralizedKernelWorker { } } + private releaseAllSharedMappingsForProcess(pid: number): void { + const registration = this.processes.get(pid); + if (registration) { + this.syncSharedMappingsFromProcess({ + pid, + memory: registration.memory, + channelOffset: 0, + i32View: new Int32Array(registration.memory.buffer, 0, 1), + consecutiveSyscalls: 0, + }, true, { force: true }); + } + + const pidMap = this.sharedMappings.get(pid); + if (!pidMap) return; + for (const mapping of pidMap.values()) { + const backing = this.sharedMmapBackings.get(mapping.backingKey); + if (backing) this.flushBackingRange(backing, mapping.fileOffset, mapping.len); + this.releaseSharedMapping(mapping); + } + this.sharedMappings.delete(pid); + } + + private inheritSharedMappings(parentPid: number, childPid: number): void { + const parentMap = this.sharedMappings.get(parentPid); + const childRegistration = this.processes.get(childPid); + if (!parentMap || parentMap.size === 0 || !childRegistration) return; + + const childMem = new Uint8Array(childRegistration.memory.buffer); + const childMap = new Map(); + for (const [mapAddr, mapping] of parentMap) { + const backing = this.sharedMmapBackings.get(mapping.backingKey); + if (!backing || mapAddr + mapping.len > childMem.length) continue; + backing.refCount++; + const snapshot = childMem.slice(mapAddr, mapAddr + mapping.len); + childMap.set(mapAddr, { + ...mapping, + snapshot, + version: backing.version, + }); + } + if (childMap.size > 0) { + this.sharedMappings.set(childPid, childMap); + } + } + + inheritProcessSharedMappings(parentPid: number, childPid: number): void { + this.inheritSharedMappings(parentPid, childPid); + this.inheritSysvShmMappings(parentPid, childPid); + } + + private setKernelCurrentPid(pid: number): void { + const setCurrentPid = this.kernelInstance!.exports.kernel_set_current_pid as + ((pid: number) => 
void) | undefined; + if (setCurrentPid) setCurrentPid(pid); + } + + private synchronizeSysvShmMappingsForSyscallBoundary(channel: ChannelInfo): void { + this.syncSysvShmMappingsFromProcess(channel); + this.refreshSysvShmMappingsToProcess(channel); + } + + private syncSysvShmMappingsFromProcess( + channel: ChannelInfo, + options: { force?: boolean } = {}, + ): void { + const pidMap = this.shmMappings.get(channel.pid); + if (!pidMap || pidMap.size === 0) return; + const processMem = new Uint8Array(channel.memory.buffer); + this.setKernelCurrentPid(channel.pid); + + for (const [mapAddr, mapping] of pidMap) { + const segmentVersion = this.shmSegmentVersions.get(mapping.segId) ?? 0; + if ( + !options.force + && mapping.version === segmentVersion + && !this.hasPeerSysvShmMapping(channel.pid, mapAddr, mapping.segId) + ) { + continue; + } + this.syncSysvShmMappingFromProcess(processMem, mapAddr, mapping); + } + } + + private hasPeerSysvShmMapping(pid: number, mapAddr: number, segId: number): boolean { + for (const [otherPid, pidMap] of this.shmMappings) { + for (const [otherAddr, otherMapping] of pidMap) { + if (otherMapping.segId !== segId) continue; + if (otherPid === pid && otherAddr === mapAddr) continue; + return true; + } + } + return false; + } + + private syncSysvShmSegmentFromMappedProcesses(segId: number): void { + for (const [pid, pidMap] of this.shmMappings) { + const registration = this.processes.get(pid); + if (!registration) continue; + const processMem = new Uint8Array(registration.memory.buffer); + this.setKernelCurrentPid(pid); + for (const [mapAddr, mapping] of pidMap) { + if (mapping.segId === segId) { + this.syncSysvShmMappingFromProcess(processMem, mapAddr, mapping); + } + } + } + } + + private syncSysvShmMappingFromProcess( + processMem: Uint8Array, + mapAddr: number, + mapping: SysvShmMapping, + ): void { + if (mapping.readOnly) return; + if (mapAddr + mapping.size > processMem.length) return; + + let changed = false; + for (let offset = 0; offset < 
mapping.size; offset += FILE_PAGE_SIZE) { + const n = Math.min(FILE_PAGE_SIZE, mapping.size - offset); + if (!this.rangeDiffersFromSnapshot( + processMem, + mapAddr + offset, + mapping.snapshot, + offset, + n, + )) { + continue; + } + + const bytes = processMem.subarray(mapAddr + offset, mapAddr + offset + n); + if (!this.writeSysvShmRange(mapping.segId, offset, bytes)) break; + mapping.snapshot.set(bytes, offset); + changed = true; + } + + if (changed) { + const version = (this.shmSegmentVersions.get(mapping.segId) ?? 0) + 1; + this.shmSegmentVersions.set(mapping.segId, version); + mapping.version = version; + } + } + + private refreshSysvShmMappingsToProcess(channel: ChannelInfo): void { + const pidMap = this.shmMappings.get(channel.pid); + if (!pidMap || pidMap.size === 0) return; + const processMem = new Uint8Array(channel.memory.buffer); + + for (const [mapAddr, mapping] of pidMap) { + const version = this.shmSegmentVersions.get(mapping.segId) ?? 0; + if (mapping.version === version) continue; + if (mapAddr + mapping.size > processMem.length) continue; + const latest = this.readSysvShmRange(mapping.segId, 0, mapping.size); + if (!latest) continue; + processMem.set(latest, mapAddr); + mapping.snapshot = latest; + mapping.version = version; + } + } + + private readSysvShmRange(segId: number, offset: number, len: number): Uint8Array | null { + const readChunk = this.kernelInstance!.exports.kernel_ipc_shm_read_chunk as + (shmid: number, offset: number, outPtr: KernelPointer, maxLen: number) => number; + const kernelMem = this.getKernelMem(); + const chunkPtr = this.scratchOffset + CH_DATA; + const out = new Uint8Array(len); + let transferred = 0; + + while (transferred < len) { + const toRead = Math.min(CH_DATA_SIZE, len - transferred); + const nRead = readChunk(segId, offset + transferred, this.toKernelPtr(chunkPtr), toRead); + if (nRead < 0) return null; + if (nRead === 0) break; + out.set(kernelMem.subarray(chunkPtr, chunkPtr + nRead), transferred); + 
transferred += nRead; + } + + return out; + } + + private writeSysvShmRange(segId: number, offset: number, bytes: Uint8Array): boolean { + const writeChunk = this.kernelInstance!.exports.kernel_ipc_shm_write_chunk as + (shmid: number, offset: number, dataPtr: KernelPointer, dataLen: number) => number; + const kernelMem = this.getKernelMem(); + const chunkPtr = this.scratchOffset + CH_DATA; + let transferred = 0; + + while (transferred < bytes.length) { + const toWrite = Math.min(CH_DATA_SIZE, bytes.length - transferred); + kernelMem.set(bytes.subarray(transferred, transferred + toWrite), chunkPtr); + const nWritten = writeChunk(segId, offset + transferred, this.toKernelPtr(chunkPtr), toWrite); + if (nWritten <= 0) return false; + transferred += nWritten; + } + + return true; + } + + private releaseAllSysvShmMappingsForProcess(pid: number): void { + const registration = this.processes.get(pid); + const pidMap = this.shmMappings.get(pid); + if (!pidMap || pidMap.size === 0) return; + + if (registration) { + this.syncSysvShmMappingsFromProcess({ + pid, + memory: registration.memory, + channelOffset: 0, + i32View: new Int32Array(registration.memory.buffer, 0, 1), + consecutiveSyscalls: 0, + }, { force: true }); + } + + this.setKernelCurrentPid(pid); + const kernelShmdt = this.kernelInstance!.exports.kernel_ipc_shmdt as + ((shmid: number) => number) | undefined; + if (kernelShmdt) { + for (const mapping of pidMap.values()) { + kernelShmdt(mapping.segId); + } + } + this.shmMappings.delete(pid); + } + + private inheritSysvShmMappings(parentPid: number, childPid: number): void { + const parentMap = this.shmMappings.get(parentPid); + const childRegistration = this.processes.get(childPid); + if (!parentMap || parentMap.size === 0 || !childRegistration) return; + + const childMem = new Uint8Array(childRegistration.memory.buffer); + const childMap = new Map(); + const kernelShmat = this.kernelInstance!.exports.kernel_ipc_shmat as + ((shmid: number, shmaddr: number, flags: 
number) => number) | undefined; + if (!kernelShmat) return; + + this.setKernelCurrentPid(childPid); + for (const [mapAddr, mapping] of parentMap) { + if (mapAddr + mapping.size > childMem.length) continue; + const flags = mapping.readOnly ? SHM_RDONLY : 0; + const sizeOrErr = kernelShmat(mapping.segId, mapAddr, flags); + if (sizeOrErr < 0) continue; + childMap.set(mapAddr, { + ...mapping, + snapshot: childMem.slice(mapAddr, mapAddr + mapping.size), + version: this.shmSegmentVersions.get(mapping.segId) ?? mapping.version, + }); + } + + if (childMap.size > 0) { + this.shmMappings.set(childPid, childMap); + } + } + /** Set the next child PID to allocate. */ setNextChildPid(pid: number): void { this.nextChildPid = pid; @@ -8159,7 +9409,6 @@ export class CentralizedKernelWorker { } } this.tcpConnections.delete(pid); - this.shmMappings.delete(pid); } // ========================================================================= @@ -8286,8 +9535,13 @@ export class CentralizedKernelWorker { const [shmid, _shmaddr, _flags] = args; // Set current pid for kernel_ipc_* exports - const setCurrentPid = this.kernelInstance!.exports.kernel_set_current_pid as ((pid: number) => void) | undefined; - if (setCurrentPid) setCurrentPid(channel.pid); + this.setKernelCurrentPid(channel.pid); + + // If this segment already has a host-side attachment, publish any + // single-observer writes before the new attachment reads its initial + // bytes from the kernel's segment backing. + this.syncSysvShmSegmentFromMappedProcesses(shmid); + this.setKernelCurrentPid(channel.pid); const kernelShmat = this.kernelInstance!.exports.kernel_ipc_shmat as (shmid: number, shmaddr: number, flags: number) => number; const sizeOrErr = kernelShmat(shmid, _shmaddr, _flags); @@ -8299,11 +9553,13 @@ export class CentralizedKernelWorker { const size = sizeOrErr; // Synthesize mmap to allocate virtual address space for this pid + const readOnly = (_flags & SHM_RDONLY) !== 0; + const prot = readOnly ? 
1 : 3; // PROT_READ, or PROT_READ|PROT_WRITE. const kernelView = new DataView(this.kernelMemory!.buffer, this.scratchOffset); kernelView.setUint32(CH_SYSCALL, SYS_MMAP, true); kernelView.setBigInt64(CH_ARGS + 0 * CH_ARG_SIZE, BigInt(0), true); // addr hint = NULL kernelView.setBigInt64(CH_ARGS + 1 * CH_ARG_SIZE, BigInt(size), true); // length - kernelView.setBigInt64(CH_ARGS + 2 * CH_ARG_SIZE, BigInt(3), true); // prot = PROT_READ|PROT_WRITE + kernelView.setBigInt64(CH_ARGS + 2 * CH_ARG_SIZE, BigInt(prot), true); // prot kernelView.setBigInt64(CH_ARGS + 3 * CH_ARG_SIZE, BigInt(0x22), true); // flags = MAP_PRIVATE|MAP_ANONYMOUS kernelView.setBigInt64(CH_ARGS + 4 * CH_ARG_SIZE, BigInt(-1), true); // fd = -1 kernelView.setBigInt64(CH_ARGS + 5 * CH_ARG_SIZE, BigInt(0), true); // offset = 0 @@ -8330,23 +9586,13 @@ export class CentralizedKernelWorker { } // Grow process memory to cover the allocated address - this.ensureProcessMemoryCovers(channel.pid, channel.memory, SYS_MMAP, addr, [0, size, 3, 0x22, -1, 0]); + this.ensureProcessMemoryCovers(channel.pid, channel.memory, SYS_MMAP, addr, [0, size, prot, 0x22, -1, 0]); // Transfer segment data from kernel to process memory via read_chunk - const readChunk = this.kernelInstance!.exports.kernel_ipc_shm_read_chunk as - (shmid: number, offset: number, outPtr: KernelPointer, maxLen: number) => number; const processMem = new Uint8Array(channel.memory.buffer); - const kernelMem = this.getKernelMem(); - const chunkSize = CH_DATA_SIZE; - const chunkPtr = this.scratchOffset + CH_DATA; - let transferred = 0; - while (transferred < size) { - const remaining = size - transferred; - const toRead = Math.min(remaining, chunkSize); - const nRead = readChunk(shmid, transferred, this.toKernelPtr(chunkPtr), toRead); - if (nRead <= 0) break; - processMem.set(kernelMem.subarray(chunkPtr, chunkPtr + nRead), (addr >>> 0) + transferred); - transferred += nRead; + const snapshot = this.readSysvShmRange(shmid, 0, size); + if (snapshot) { + 
processMem.set(snapshot, addr >>> 0); } // Track the mapping for shmdt @@ -8355,7 +9601,14 @@ export class CentralizedKernelWorker { pidMappings = new Map(); this.shmMappings.set(channel.pid, pidMappings); } - pidMappings.set(addr >>> 0, { segId: shmid, size }); + const mapAddr = addr >>> 0; + pidMappings.set(mapAddr, { + segId: shmid, + size, + readOnly, + snapshot: snapshot ?? processMem.slice(mapAddr, mapAddr + size), + version: this.shmSegmentVersions.get(shmid) ?? 0, + }); this.completeChannelRaw(channel, addr, 0); this.relistenChannel(channel); @@ -8378,25 +9631,12 @@ export class CentralizedKernelWorker { } // Set current pid for kernel exports - const setCurrentPid = this.kernelInstance!.exports.kernel_set_current_pid as ((pid: number) => void) | undefined; - if (setCurrentPid) setCurrentPid(channel.pid); + this.setKernelCurrentPid(channel.pid); - // Sync process memory back to kernel segment via write_chunk - const writeChunk = this.kernelInstance!.exports.kernel_ipc_shm_write_chunk as - (shmid: number, offset: number, dataPtr: KernelPointer, dataLen: number) => number; + // Sync only dirty writable mappings. A read-only attachment must never + // overwrite newer segment contents on detach. 
const processMem = new Uint8Array(channel.memory.buffer); - const kernelMem = this.getKernelMem(); - const chunkSize = CH_DATA_SIZE; - const chunkPtr = this.scratchOffset + CH_DATA; - let transferred = 0; - while (transferred < mapping.size) { - const remaining = mapping.size - transferred; - const toWrite = Math.min(remaining, chunkSize); - kernelMem.set(processMem.subarray(addr + transferred, addr + transferred + toWrite), chunkPtr); - const nWritten = writeChunk(mapping.segId, transferred, this.toKernelPtr(chunkPtr), toWrite); - if (nWritten <= 0) break; - transferred += nWritten; - } + this.syncSysvShmMappingFromProcess(processMem, addr, mapping); // Kernel-side detach bookkeeping const kernelShmdt = this.kernelInstance!.exports.kernel_ipc_shmdt as (shmid: number) => number; diff --git a/host/src/node-kernel-host.ts b/host/src/node-kernel-host.ts index c29214aee..16c713bab 100644 --- a/host/src/node-kernel-host.ts +++ b/host/src/node-kernel-host.ts @@ -95,6 +95,8 @@ export interface SpawnOptions { /** Initial real/effective group ID for the process. */ gid?: number; stdin?: Uint8Array; + /** Stdio fds (0, 1, 2) that should be host-backed pipes, not terminals. */ + pipeStdio?: number[]; /** Optional pre-compiled module for the supplied program bytes. */ programModule?: WebAssembly.Module; pty?: boolean; @@ -217,6 +219,7 @@ export class NodeKernelHost { ptyCols: options?.ptyCols, ptyRows: options?.ptyRows, stdin: options?.stdin, + pipeStdio: options?.pipeStdio, maxAddr: options?.maxAddr, }) as number; diff --git a/host/src/node-kernel-protocol.ts b/host/src/node-kernel-protocol.ts index 853fc19b4..d40fdfa21 100644 --- a/host/src/node-kernel-protocol.ts +++ b/host/src/node-kernel-protocol.ts @@ -63,6 +63,8 @@ export interface SpawnMessage { ptyCols?: number; ptyRows?: number; stdin?: Uint8Array; + /** Stdio fds (0, 1, 2) that should be host-backed pipes, not terminals. 
*/ + pipeStdio?: number[]; /** Limit heap growth to protect thread channel pages */ maxAddr?: number; } diff --git a/host/src/node-kernel-worker-entry.ts b/host/src/node-kernel-worker-entry.ts index 2ecd3310d..4bd3c824f 100644 --- a/host/src/node-kernel-worker-entry.ts +++ b/host/src/node-kernel-worker-entry.ts @@ -15,6 +15,7 @@ */ import { parentPort } from "node:worker_threads"; import { readFileSync, existsSync, mkdtempSync, rmSync } from "node:fs"; +import { createHash } from "node:crypto"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { fileURLToPath } from "node:url"; @@ -105,6 +106,8 @@ interface ProcessInfo { const processes = new Map(); const processTeardowns = new Map>(); const reportedExits = new Set(); +const compiledProgramModules = new Map>(); +const MAX_COMPILED_PROGRAM_MODULES = 16; // Workers terminated by the kernel-worker entry itself (handleExit / // handleExec / handleTerminate). The crash safety-net listener checks @@ -356,6 +359,28 @@ function bufferToArrayBuffer(bytes: Uint8Array): ArrayBuffer { return out; } +function programModuleCacheKey(programBytes: ArrayBuffer): string { + const bytes = new Uint8Array(programBytes); + return `${bytes.byteLength}:${createHash("sha256").update(bytes).digest("hex")}`; +} + +async function getCompiledProgramModule( + programBytes: ArrayBuffer, +): Promise { + const key = programModuleCacheKey(programBytes); + let promise = compiledProgramModules.get(key); + if (!promise) { + promise = WebAssembly.compile(programBytes); + compiledProgramModules.set(key, promise); + promise.catch(() => compiledProgramModules.delete(key)); + if (compiledProgramModules.size > MAX_COMPILED_PROGRAM_MODULES) { + const oldest = compiledProgramModules.keys().next().value; + if (oldest) compiledProgramModules.delete(oldest); + } + } + return promise; +} + function resolveExecLocal(path: string): ArrayBuffer | null { const mapped = execPrograms[path]; if (mapped && existsSync(mapped)) { @@ -654,9 +679,15 
@@ function handleSpawn(msg: SpawnMessage) { kernelWorker.onPtyOutput(ptyIdx, (data: Uint8Array) => { post({ type: "pty_output", pid, data }); }); - } else if (msg.stdin) { - const stdinData = msg.stdin instanceof Uint8Array ? msg.stdin : new Uint8Array(msg.stdin); - kernelWorker.setStdinData(pid, stdinData); + } else { + if (msg.pipeStdio) { + kernelWorker.setStdioPipes(pid, msg.pipeStdio); + } + if (msg.stdin) { + const stdinData = + msg.stdin instanceof Uint8Array ? msg.stdin : new Uint8Array(msg.stdin); + kernelWorker.setStdinData(pid, stdinData); + } } const initData: CentralizedWorkerInitMessage = { @@ -749,6 +780,7 @@ async function handleFork( maxAddr: childLayout.maxAddr, mmapBase: childLayout.mmapBase, }); + kernelWorker.inheritProcessSharedMappings(parentPid, childPid); const FORK_BUF_SIZE = FORK_SAVE_BUFFER_SIZE; const forkBufAddr = threadFork @@ -1070,6 +1102,7 @@ async function handleClone( if (threads) { const idx = threads.indexOf(threadEntry); if (idx >= 0) threads.splice(idx, 1); + if (threads.length === 0) threadWorkers.delete(pid); } }; const terminateThreadEntry = (): Promise => { diff --git a/host/src/platform/node.ts b/host/src/platform/node.ts index 8bf55dce2..58a133272 100644 --- a/host/src/platform/node.ts +++ b/host/src/platform/node.ts @@ -13,6 +13,15 @@ import type { PlatformIO, StatResult, StatfsResult } from "../types"; import { nativeStatfs, translateOpenFlags } from "../vfs/host-fs"; import { NativeMetadataOverlay } from "./native-metadata"; +const UTIME_NOW = 0x3fffffff; +const UTIME_OMIT = 0x3ffffffe; + +function makeFsError(code: string, message: string): Error & { code: string } { + const err = new Error(`${code}: ${message}`) as Error & { code: string }; + err.code = code; + return err; +} + export class NodePlatformIO implements PlatformIO { private dirHandles = new Map(); private nextDirHandle = 1; diff --git a/host/src/worker-main.ts b/host/src/worker-main.ts index 8a9ad298f..569ebe4ca 100644 --- a/host/src/worker-main.ts 
+++ b/host/src/worker-main.ts @@ -10,7 +10,7 @@ import type { CentralizedThreadInitMessage, WorkerToHostMessage, } from "./worker-protocol"; -import { DynamicLinker, type LoadedSharedLibrary } from "./dylink"; +import { DynamicLinker, type LoadedSharedLibrary, type SideModuleForkState } from "./dylink"; import { extractAbiVersion } from "./constants"; import { ABI_SYSCALLS, @@ -180,6 +180,10 @@ export interface DlopenSupport { * fork-child path AFTER setupChannelBase and BEFORE the wpk_fork * rewind into _start. */ replayDlopens: () => void; + /** Finish side-module fork unwind after the main module has unwound. */ + completeSideModuleForkUnwind: () => void; + /** Begin side-module fork rewind before re-entering the main module. */ + beginSideModuleForkRewind: () => void; } /** @@ -203,13 +207,18 @@ function buildDlopenImports( ): DlopenSupport { let linker: DynamicLinker | null = null; const loadedLibraries = new Map(); + let activeSideFork: SideModuleForkState | null = null; const decoder = new TextDecoder(); const encoder = new TextEncoder(); const n = (v: number | bigint): number => typeof v === "bigint" ? Number(v) : v; const forkBufAddr = channelOffset - FORK_BUF_SIZE; const headOffset = ptrWidth === 8 ? DLOPEN_HEAD_OFFSET_WASM64 : DLOPEN_HEAD_OFFSET_WASM32; + const sideForkOffset = ptrWidth === 8 + ? DLOPEN_ACTIVE_SIDE_FORK_OFFSET_WASM64 + : DLOPEN_ACTIVE_SIDE_FORK_OFFSET_WASM32; const headSlot = forkBufAddr - headOffset; + const activeSideForkSlot = forkBufAddr - sideForkOffset; const entrySize = ptrWidth === 8 ? 
DLOPEN_ENTRY_SIZE_WASM64 : DLOPEN_ENTRY_SIZE_WASM32; const readPtr = (view: DataView, addr: number): number => @@ -283,6 +292,21 @@ function buildDlopenImports( globalSymbols, got: new Map(), loadedLibraries, + sideModuleFork: { + setActiveFork: (state) => { + activeSideFork = state; + writePtr(new DataView(memory.buffer), activeSideForkSlot, state.forkBufAddr); + }, + clearActiveFork: (state) => { + if (activeSideFork?.forkBufAddr === state.forkBufAddr) { + activeSideFork = null; + } + const view = new DataView(memory.buffer); + if (readPtr(view, activeSideForkSlot) === state.forkBufAddr) { + writePtr(view, activeSideForkSlot, 0); + } + }, + }, }); return linker; }; @@ -291,7 +315,12 @@ function buildDlopenImports( // entry is one mmap block: struct, then name UTF-8 (padded to 8-byte // alignment), then the side-module wasm bytes. Pointers are absolute // — fork's memcpy preserves the parent's address space. - const persistArchiveEntry = (name: string, bytes: Uint8Array, memoryBase: number): void => { + const persistArchiveEntry = ( + name: string, + bytes: Uint8Array, + memoryBase: number, + sideForkBufAddr: number, + ): void => { const nameBytes = encoder.encode(name); const nameLen = nameBytes.length; const nameAligned = (nameLen + 7) & ~7; @@ -309,6 +338,7 @@ function buildDlopenImports( view.setBigUint64(entry + 24, BigInt(bytesPtr), true); view.setBigUint64(entry + 32, BigInt(bytes.length), true); view.setBigUint64(entry + 40, BigInt(memoryBase), true); + view.setBigUint64(entry + 48, BigInt(sideForkBufAddr), true); } else { view.setUint32(entry + 0, 0, true); view.setUint32(entry + 4, namePtr, true); @@ -316,6 +346,7 @@ function buildDlopenImports( view.setUint32(entry + 12, bytesPtr, true); view.setUint32(entry + 16, bytes.length, true); view.setUint32(entry + 20, memoryBase, true); + view.setUint32(entry + 24, sideForkBufAddr, true); } new Uint8Array(memory.buffer, namePtr, nameLen).set(nameBytes); @@ -349,7 +380,7 @@ function buildDlopenImports( const lk 
= getLinker(); while (cursor !== 0) { - let next: number, namePtr: number, nameLen: number, bytesPtr: number, bytesLen: number, memoryBase: number; + let next: number, namePtr: number, nameLen: number, bytesPtr: number, bytesLen: number, memoryBase: number, sideForkBufAddr: number; if (ptrWidth === 8) { next = Number(view.getBigUint64(cursor + 0, true)); namePtr = Number(view.getBigUint64(cursor + 8, true)); @@ -357,6 +388,7 @@ function buildDlopenImports( bytesPtr = Number(view.getBigUint64(cursor + 24, true)); bytesLen = Number(view.getBigUint64(cursor + 32, true)); memoryBase = Number(view.getBigUint64(cursor + 40, true)); + sideForkBufAddr = Number(view.getBigUint64(cursor + 48, true)); } else { next = view.getUint32(cursor + 0, true); namePtr = view.getUint32(cursor + 4, true); @@ -364,6 +396,7 @@ function buildDlopenImports( bytesPtr = view.getUint32(cursor + 12, true); bytesLen = view.getUint32(cursor + 16, true); memoryBase = view.getUint32(cursor + 20, true); + sideForkBufAddr = view.getUint32(cursor + 24, true); } // Copy name + bytes out of shared memory before passing to @@ -376,15 +409,54 @@ function buildDlopenImports( const bytesCopy = new Uint8Array(new Uint8Array(memory.buffer, bytesPtr, bytesLen)); // DynamicLinker.dlopenSync returns 0 on error, >0 on success. 
- const handle = lk.dlopenSync(name, bytesCopy, { memoryBase }); + const handle = lk.dlopenSync(name, bytesCopy, { memoryBase, forkBufAddr: sideForkBufAddr || undefined }); if (handle === 0) { throw new Error(`dlopen(${name}): ${lk.dlerror() || "unknown"}`); } + const loaded = loadedLibraries.get(name); + if (loaded?.forkBufAddr && loaded.forkBufAddr === readPtr(view, activeSideForkSlot)) { + activeSideFork = { name, instance: loaded.instance, forkBufAddr: loaded.forkBufAddr }; + } cursor = next; } }; + const findActiveSideFork = (): SideModuleForkState | null => { + if (activeSideFork) return activeSideFork; + const view = new DataView(memory.buffer); + const activeForkBufAddr = readPtr(view, activeSideForkSlot); + if (activeForkBufAddr === 0) return null; + for (const loaded of loadedLibraries.values()) { + if (loaded.forkBufAddr === activeForkBufAddr) { + activeSideFork = { + name: loaded.name, + instance: loaded.instance, + forkBufAddr: loaded.forkBufAddr, + }; + return activeSideFork; + } + } + return null; + }; + + const completeSideModuleForkUnwind = (): void => { + const sideFork = findActiveSideFork(); + if (!sideFork) return; + const state = (sideFork.instance.exports.wpk_fork_state as (() => number) | undefined)?.(); + if (state === 1) { + (sideFork.instance.exports.wpk_fork_unwind_end as () => void)(); + } + }; + + const beginSideModuleForkRewind = (): void => { + const sideFork = findActiveSideFork(); + if (!sideFork) return; + (sideFork.instance.exports.wpk_fork_rewind_begin as (addr: number) => void)( + sideFork.forkBufAddr, + ); + }; + const imports: Record = { __wasm_dlopen: (bytesPtr: number, bytesLen: number, namePtr: number, nameLen: number): number => { @@ -408,7 +480,7 @@ function buildDlopenImports( if (!loaded) { throw new Error(`__wasm_dlopen(${name}): handle=${handle} but loadedLibraries lookup failed`); } - persistArchiveEntry(name, bytesCopy, loaded.memoryBase); + persistArchiveEntry(name, bytesCopy, loaded.memoryBase, loaded.forkBufAddr 
?? 0); } return handle; }, @@ -437,7 +509,7 @@ function buildDlopenImports( }, }; - return { imports, replayDlopens }; + return { imports, replayDlopens, completeSideModuleForkUnwind, beginSideModuleForkRewind }; } /** @@ -696,8 +768,10 @@ const FORK_BUF_SIZE = FORK_SAVE_BUFFER_SIZE; // wpk_fork rewind. const DLOPEN_HEAD_OFFSET_WASM32 = 12; const DLOPEN_HEAD_OFFSET_WASM64 = 24; -const DLOPEN_ENTRY_SIZE_WASM32 = 24; -const DLOPEN_ENTRY_SIZE_WASM64 = 48; +const DLOPEN_ACTIVE_SIDE_FORK_OFFSET_WASM32 = 16; +const DLOPEN_ACTIVE_SIDE_FORK_OFFSET_WASM64 = 32; +const DLOPEN_ENTRY_SIZE_WASM32 = 28; +const DLOPEN_ENTRY_SIZE_WASM64 = 56; const WPK_FORK_EXPORTS = [ "wpk_fork_unwind_begin", @@ -1002,6 +1076,7 @@ export async function centralizedWorkerMain( } replayedForkChildDlopens = true; } + dlopenSupport.beginSideModuleForkRewind(); needsRewind = false; } @@ -1021,6 +1096,7 @@ export async function centralizedWorkerMain( if (forkState === 1) { // Unwind completed (fork) — finalize and send SYS_FORK. unwindEnd(); + dlopenSupport.completeSideModuleForkUnwind(); // Send SYS_FORK through the channel now that memory has the // fork save buffer populated (saved_globals + frames). 
diff --git a/host/test/centralized-test-helper.ts b/host/test/centralized-test-helper.ts index ac371ef9f..4665d9f23 100644 --- a/host/test/centralized-test-helper.ts +++ b/host/test/centralized-test-helper.ts @@ -328,6 +328,7 @@ async function runOnMainThread(options: RunProgramOptions): Promise { + beforeAll(() => mkdirSync(BUILD_DIR, { recursive: true })); + const io = () => new NodePlatformIO(); + + it("fork called by a side module resumes parent and child at the call site", { timeout: 30_000 }, async () => { + const soPath = buildSharedLib(` + extern int fork(void); + extern void exit(int); + int side_fork(void) { + int pid = fork(); + if (pid == 0) exit(0); + return pid; + } + `, "libforkinside", true); + + const wasmPath = buildMainProgram(` + #include + #include + #include + typedef int (*side_fork_fn)(void); + int main(int argc, char **argv) { + void *lib = dlopen(argv[1], RTLD_NOW); + if (!lib) { fprintf(stderr, "dlopen: %s\\n", dlerror()); return 1; } + side_fork_fn side_fork = (side_fork_fn)dlsym(lib, "side_fork"); + if (!side_fork) { fprintf(stderr, "dlsym: %s\\n", dlerror()); return 1; } + int pid = side_fork(); + if (pid < 0) { fprintf(stderr, "side fork failed: %d\\n", pid); return 1; } + int status = 0; + if (waitpid(pid, &status, 0) != pid) { fprintf(stderr, "waitpid failed\\n"); return 1; } + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { fprintf(stderr, "bad child status %d\\n", status); return 1; } + puts("ok"); + return 0; + } + `, "test-fork-from-side"); + + const result = await runCentralizedProgram({ + programPath: wasmPath, + argv: ["fork-from-side-main", soPath], + timeout: 30_000, + io: io(), + }); + expect(result.exitCode).toBe(0); + expect(result.stdout).toContain("ok"); + }); +}); diff --git a/host/test/ifhwaddr.test.ts b/host/test/ifhwaddr.test.ts index 08c4eae00..8dba4b587 100644 --- a/host/test/ifhwaddr.test.ts +++ b/host/test/ifhwaddr.test.ts @@ -6,6 +6,7 @@ describe("SIOCGIFCONF / SIOCGIFHWADDR", () => { it("returns a 
virtual MAC address via ioctl", async () => { const result = await runCentralizedProgram({ programPath: resolveBinary("programs/ifhwaddr.wasm"), + useDefaultRootfs: false, timeout: 10_000, }); diff --git a/host/test/mmap-shared.test.ts b/host/test/mmap-shared.test.ts index 98c94be9b..d1b338821 100644 --- a/host/test/mmap-shared.test.ts +++ b/host/test/mmap-shared.test.ts @@ -1,7 +1,102 @@ import { describe, it, expect } from "vitest"; +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; import { runCentralizedProgram } from "./centralized-test-helper"; import { resolveBinary } from "../src/binary-resolver"; import { NodePlatformIO } from "../src/platform/node"; +import { CentralizedKernelWorker } from "../src/kernel-worker"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const repoRoot = join(__dirname, "../.."); +const crossProcessFixture = join(repoRoot, "examples/mmap_shared_cross_process.wasm"); +const anonymousForkFixture = join(repoRoot, "examples/mmap_shared_anonymous_fork.wasm"); +const munmapReuseFixture = join(repoRoot, "examples/mmap_shared_munmap_reuse.wasm"); +const largePwriteFixture = join(repoRoot, "examples/mmap_shared_large_pwrite.wasm"); +const itIfCrossProcessFixture = existsSync(crossProcessFixture) ? it : it.skip; +const itIfAnonymousForkFixture = existsSync(anonymousForkFixture) ? it : it.skip; +const itIfMunmapReuseFixture = existsSync(munmapReuseFixture) ? it : it.skip; +const itIfLargePwriteFixture = existsSync(largePwriteFixture) ? 
it : it.skip; + +function createAnonymousSharedMmapHarness(refCount: number) { + const pid = 211; + const mapAddr = 0x3000; + const len = 4096; + const backingKey = "anon:test"; + const memory = new WebAssembly.Memory({ initial: 1, maximum: 1, shared: true }); + const backing = { + key: backingKey, + path: "", + handle: -1, + anonymous: true, + writable: true, + pages: new Map([[0, new Uint8Array(len)]]), + dirtyPages: new Set(), + refCount, + version: 0, + }; + const mapping = { + fd: -1, + fileOffset: 0, + len, + writable: true, + backingKey, + snapshot: new Uint8Array(len), + version: 0, + }; + const kw = Object.assign(Object.create(CentralizedKernelWorker.prototype), { + sharedMappings: new Map([[pid, new Map([[mapAddr, mapping]])]]), + sharedMmapBackings: new Map([[backingKey, backing]]), + }) as CentralizedKernelWorker; + const channel = { + pid, + memory, + channelOffset: 0, + i32View: new Int32Array(memory.buffer, 0, 1), + consecutiveSyscalls: 0, + }; + return { + backing, + channel, + kw, + mapAddr, + mapping, + processMem: new Uint8Array(memory.buffer), + }; +} + +describe("anonymous MAP_SHARED host synchronization", () => { + it("skips single-observer boundary publishes and publishes on forced handoff", () => { + const { backing, channel, kw, mapAddr, mapping, processMem } = + createAnonymousSharedMmapHarness(1); + + processMem[mapAddr + 23] = 0x4d; + (kw as any).syncSharedMappingsFromProcess(channel, true); + + expect(backing.version).toBe(0); + expect(backing.dirtyPages.size).toBe(0); + expect(mapping.snapshot[23]).toBe(0); + + (kw as any).syncSharedMappingsFromProcess(channel, true, { force: true }); + + expect(backing.version).toBe(1); + expect(backing.dirtyPages.has(0)).toBe(true); + expect(mapping.snapshot[23]).toBe(0x4d); + expect(backing.pages.get(0)![23]).toBe(0x4d); + }); + + it("publishes ordinary boundaries when another mapping observes the backing", () => { + const { backing, channel, kw, mapAddr, mapping, processMem } = + 
createAnonymousSharedMmapHarness(2); + + processMem[mapAddr + 31] = 0x91; + (kw as any).syncSharedMappingsFromProcess(channel, true); + + expect(backing.version).toBe(1); + expect(mapping.snapshot[31]).toBe(0x91); + expect(backing.pages.get(0)![31]).toBe(0x91); + }); +}); describe("MAP_SHARED mmap + msync", () => { it("writes through MAP_SHARED mapping and flushes with msync", async () => { @@ -18,4 +113,50 @@ describe("MAP_SHARED mmap + msync", () => { expect(result.stdout).toContain("mremap ok"); expect(result.stdout).toContain("PASS"); }); + + itIfCrossProcessFixture("keeps file-backed MAP_SHARED mappings coherent across processes", async () => { + const result = await runCentralizedProgram({ + programPath: crossProcessFixture, + io: new NodePlatformIO(), + timeout: 10000, + }); + expect(result.exitCode).toBe(0); + expect(result.stdout).toContain("inherited mapping coherent"); + expect(result.stdout).toContain("separate mapping coherent"); + expect(result.stdout).toContain("PASS"); + }); + + itIfAnonymousForkFixture("keeps anonymous MAP_SHARED mappings coherent after fork", async () => { + const result = await runCentralizedProgram({ + programPath: anonymousForkFixture, + io: new NodePlatformIO(), + timeout: 10000, + }); + expect(result.exitCode).toBe(0); + expect(result.stdout).toContain("inherited anonymous mapping coherent"); + expect(result.stdout).toContain("reused anonymous backing coherent"); + expect(result.stdout).toContain("PASS"); + }); + + itIfMunmapReuseFixture("drops page-rounded MAP_SHARED mappings before anonymous address reuse", async () => { + const result = await runCentralizedProgram({ + programPath: munmapReuseFixture, + io: new NodePlatformIO(), + timeout: 10000, + }); + expect(result.exitCode).toBe(0); + expect(result.stdout).toContain("partial munmap cleanup ok"); + expect(result.stdout).toContain("PASS"); + }); + + itIfLargePwriteFixture("refreshes MAP_SHARED mappings after large pwrite", async () => { + const result = await 
runCentralizedProgram({ + programPath: largePwriteFixture, + io: new NodePlatformIO(), + timeout: 10000, + }); + expect(result.exitCode).toBe(0); + expect(result.stdout).toContain("large pwrite mapping coherent"); + expect(result.stdout).toContain("PASS"); + }); }); diff --git a/host/test/multi-worker.test.ts b/host/test/multi-worker.test.ts index 1183a957f..bb604a004 100644 --- a/host/test/multi-worker.test.ts +++ b/host/test/multi-worker.test.ts @@ -19,6 +19,7 @@ import { CH_DATA, CH_ERRNO, CH_RETURN, + HOST_INTERCEPTED_SYSCALLS, } from "../src/generated/abi"; const MAX_PAGES = 1024; // 64 MiB: enough to prove initial < maximum. @@ -80,6 +81,9 @@ describe("CentralizedKernelWorker Process Management", () => { cleanupPendingSelectRetries: vi.fn(), cleanupUdpBindings: vi.fn(), cleanupTcpListeners: vi.fn(), + sharedMappings: new Map(), + sharedMmapBackings: new Map(), + shmMappings: new Map(), hostReaped: new Set([pid]), }) as CentralizedKernelWorker; @@ -91,6 +95,24 @@ describe("CentralizedKernelWorker Process Management", () => { expect((kw as any).hostReaped.has(pid)).toBe(false); }); + it("marks selected stdio descriptors as host-backed pipes", () => { + const setStdioPipe = vi.fn(() => 0); + const kw = Object.assign(Object.create(CentralizedKernelWorker.prototype), { + kernelInstance: { + exports: { + kernel_set_stdio_pipe: setStdioPipe, + }, + }, + }); + + kw.setStdioPipes(321, [0, 1, 2, -1, 3]); + + expect(setStdioPipe).toHaveBeenCalledTimes(3); + expect(setStdioPipe).toHaveBeenNthCalledWith(1, 321, 0); + expect(setStdioPipe).toHaveBeenNthCalledWith(2, 321, 1); + expect(setStdioPipe).toHaveBeenNthCalledWith(3, 321, 2); + }); + it("lets the host terminate pthread workers without waking SYS_EXIT back into guest code", () => { const pid = 123; const mainChannelOffset = WASM_PAGE_SIZE; @@ -207,6 +229,7 @@ describe("CentralizedKernelWorker Process Management", () => { resolveClone = resolve; }); }); + const channel = { pid, channelOffset: mainChannelOffset, memory 
}; const kw = Object.assign(Object.create(CentralizedKernelWorker.prototype), { callbacks: { onClone }, @@ -219,8 +242,9 @@ describe("CentralizedKernelWorker Process Management", () => { scratchOffset: 0, currentHandlePid: 0, processes: new Map([ - [pid, { channels: [{ channelOffset: mainChannelOffset }] }], + [pid, { channels: [channel] }], ]), + activeChannels: [channel], threadCtidPtrs, completeChannel: vi.fn(), bindKernelTidForChannel: vi.fn(), @@ -236,7 +260,7 @@ describe("CentralizedKernelWorker Process Management", () => { }); (kw as any).handleClone( - { pid, channelOffset: mainChannelOffset, memory }, + channel, [0, stackPtr, 0, tlsPtr, ctidPtr, 0], ); @@ -354,6 +378,62 @@ describe("CentralizedKernelWorker Process Management", () => { kw.unregisterProcess(100); }); + it("retries fork pid allocation when the kernel still owns a zombie pid", async () => { + const parentPid = 77; + const memory = new WebAssembly.Memory({ + initial: 4, + maximum: 4, + shared: true, + }); + const channel = { + pid: parentPid, + channelOffset: WASM_PAGE_SIZE, + memory, + }; + const kernelForkProcess = vi.fn((_parent: number, child: number) => + child === 100 ? 
-17 : 0, + ); + const completeChannel = vi.fn(); + const onFork = vi.fn(() => Promise.resolve([WASM_PAGE_SIZE])); + const kw = Object.assign(Object.create(CentralizedKernelWorker.prototype), { + callbacks: { onFork }, + nextChildPid: 100, + processes: new Map([[parentPid, { channels: [channel] }]]), + threadForkContexts: new Map(), + tcpListenerTargets: new Map(), + epollInterests: new Map(), + sharedMappings: new Map(), + sharedMmapBackings: new Map(), + shmMappings: new Map(), + shmSegmentVersions: new Map(), + inheritSharedMappings: vi.fn(), + completeChannel, + kernelInstance: { + exports: { + kernel_fork_process: kernelForkProcess, + kernel_clear_fork_child: vi.fn(() => 0), + kernel_reset_signal_mask: vi.fn(() => 0), + kernel_set_current_pid: vi.fn(), + }, + }, + }); + + (kw as any).handleFork(channel, [0]); + await Promise.resolve(); + + expect(kernelForkProcess).toHaveBeenNthCalledWith(1, parentPid, 100); + expect(kernelForkProcess).toHaveBeenNthCalledWith(2, parentPid, 101); + expect(onFork).toHaveBeenCalledWith(parentPid, 101, memory, undefined); + expect(completeChannel).toHaveBeenCalledWith( + channel, + HOST_INTERCEPTED_SYSCALLS.SYS_FORK, + [0], + undefined, + 101, + 0, + ); + }); + it("should throw when registering duplicate PID", async () => { const kw = new CentralizedKernelWorker( { maxWorkers: 4, dataBufferSize: 65536, useSharedMemory: true }, diff --git a/host/test/select-timeout-retry.test.ts b/host/test/select-timeout-retry.test.ts new file mode 100644 index 000000000..4656d4845 --- /dev/null +++ b/host/test/select-timeout-retry.test.ts @@ -0,0 +1,218 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { + CH_ERRNO, + CH_RETURN, +} from "../src/generated/abi"; +import { CentralizedKernelWorker } from "../src/kernel-worker"; + +describe("centralized select/pselect timeout retries", () => { + afterEach(() => { + vi.useRealTimers(); + }); + + it("preserves a finite pselect6 deadline across retry wakes", () => { + 
vi.useFakeTimers(); + vi.setSystemTime(0); + + const kernelMemory = createSharedMemory(); + const processMemory = createSharedMemory(); + const scratchOffset = 128; + const handleChannel = vi.fn(() => { + const kernelView = new DataView(kernelMemory.buffer, scratchOffset); + kernelView.setBigInt64(CH_RETURN, -1n, true); + kernelView.setUint32(CH_ERRNO, 11, true); + return 0; + }); + const worker = createWorkerHarness({ kernel_handle_channel: handleChannel }); + worker.kernelMemory = kernelMemory; + worker.scratchOffset = scratchOffset; + + const channel = createChannel(42, processMemory); + worker.processes = new Map([ + [42, { pid: 42, memory: processMemory, channels: [channel], ptrWidth: 4 }], + ]); + worker.activeChannels = [channel]; + + const readfdsPtr = 1024; + const tsPtr = 2048; + const processView = new DataView(processMemory.buffer); + processView.setUint8(readfdsPtr, 1); + processView.setBigInt64(tsPtr, 0n, true); + processView.setBigInt64(tsPtr + 8, 10_000_000n, true); + + const origArgs = [1, readfdsPtr, 0, 0, tsPtr, 0]; + worker.handlePselect6(channel, origArgs); + expect(worker.completeChannel).not.toHaveBeenCalled(); + expect(handleChannel).toHaveBeenCalledTimes(1); + + vi.advanceTimersByTime(5); + worker.wakeAllBlockedRetries(); + expect(worker.completeChannel).not.toHaveBeenCalled(); + expect(handleChannel).toHaveBeenCalledTimes(2); + + vi.advanceTimersByTime(4); + expect(worker.completeChannel).not.toHaveBeenCalled(); + + vi.advanceTimersByTime(1); + expect(worker.completeChannel).toHaveBeenCalledWith( + channel, + expect.any(Number), + origArgs, + undefined, + 0, + 0, + ); + }); + + it("interrupts host-side epoll_pwait emulation when a handler signal is pending", () => { + const kernelMemory = createSharedMemory(); + const processMemory = createSharedMemory(); + const scratchOffset = 128; + const handleChannel = vi.fn(() => { + const kernelView = new DataView(kernelMemory.buffer, scratchOffset); + kernelView.setBigInt64(CH_RETURN, 0n, true); + 
kernelView.setUint32(CH_ERRNO, 0, true); + return 0; + }); + const worker = createWorkerHarness({ + kernel_handle_channel: handleChannel, + kernel_get_process_exit_status: () => 0, + }); + worker.kernelMemory = kernelMemory; + worker.scratchOffset = scratchOffset; + worker.epollInterests = new Map([ + ["42:7", [{ fd: 3, events: 0x001, data: 99n }]], + ]); + worker.dequeueSignalForDelivery = vi.fn(() => 15); + worker.completeChannelRaw = vi.fn(); + worker.relistenChannel = vi.fn(); + + const channel = createChannel(42, processMemory); + worker.processes = new Map([ + [42, { pid: 42, memory: processMemory, channels: [channel], ptrWidth: 4 }], + ]); + worker.activeChannels = [channel]; + + worker.handleEpollPwait(channel, 241, [7, 4096, 1, 1000, 0, 8]); + + expect(worker.dequeueSignalForDelivery).toHaveBeenCalledWith(channel); + expect(worker.completeChannelRaw).toHaveBeenCalledWith(channel, -4, 4); + expect(worker.relistenChannel).toHaveBeenCalledWith(channel); + expect(worker.pendingPollRetries.size).toBe(0); + }); + + it("merges disjoint MAP_SHARED writes from live processes", () => { + const worker = createWorkerHarness({}); + const mem1 = createSharedMemory(); + const mem2 = createSharedMemory(); + const addr = 1024; + const len = 16; + const key = "anon:test"; + const backing = { + key, + path: "", + handle: -1, + anonymous: true, + writable: true, + pages: new Map(), + dirtyPages: new Set(), + refCount: 2, + version: 0, + }; + const makeMapping = () => ({ + fd: -1, + fileOffset: 0, + len, + writable: true, + backingKey: key, + snapshot: new Uint8Array(len), + version: 0, + }); + + new Uint8Array(mem1.buffer)[addr] = "A".charCodeAt(0); + new Uint8Array(mem2.buffer)[addr + 1] = "B".charCodeAt(0); + const channel1 = createChannel(1, mem1); + const channel2 = createChannel(2, mem2); + worker.processes = new Map([ + [1, { pid: 1, memory: mem1, channels: [channel1], ptrWidth: 4 }], + [2, { pid: 2, memory: mem2, channels: [channel2], ptrWidth: 4 }], + ]); + 
worker.sharedMmapBackings = new Map([[key, backing]]); + worker.sharedMappings = new Map([ + [1, new Map([[addr, makeMapping()]])], + [2, new Map([[addr, makeMapping()]])], + ]); + + worker.syncSharedMappingsFromProcess(channel1, true); + worker.syncSharedMappingsFromProcess(channel2, true); + const latest = worker.readBackingRange(backing, 0, len); + expect(String.fromCharCode(latest[0], latest[1])).toBe("AB"); + + worker.refreshSharedMappingsToProcess(channel1, true); + expect(String.fromCharCode( + new Uint8Array(mem1.buffer)[addr], + new Uint8Array(mem1.buffer)[addr + 1], + )).toBe("AB"); + }); +}); + +function createWorkerHarness(exports: Record): any { + return Object.assign(Object.create(CentralizedKernelWorker.prototype), { + kernelInstance: { exports }, + kernel: { + toKernelPtr(value: number | bigint): number { + return Number(value); + }, + }, + kernelMemory: createSharedMemory(), + scratchOffset: 128, + config: {}, + callbacks: {}, + processes: new Map(), + activeChannels: [], + syscallRing: new Map(), + channelTids: new Map(), + threadForkContexts: new Map(), + stdinFinite: new Set(), + stdinBuffers: new Map(), + alarmTimers: new Map(), + posixTimers: new Map(), + pendingSleeps: new Map(), + pendingPollRetries: new Map(), + pendingSelectRetries: new Map(), + pendingPipeReaders: new Map(), + pendingPipeWriters: new Map(), + socketTimeoutTimers: new Map(), + pendingCancels: new Set(), + tcpListeners: new Map(), + tcpListenerTargets: new Map(), + tcpListenerRRIndex: new Map(), + sharedMappings: new Map(), + sharedMmapBackings: new Map(), + tcpConnections: new Map(), + shmMappings: new Map(), + usePolling: false, + completeChannel: vi.fn(), + dequeueSignalForDelivery: vi.fn(), + bindKernelTidForChannel: vi.fn(), + assertKernelStackContext: vi.fn(), + }); +} + +function createSharedMemory(pages = 1): WebAssembly.Memory { + return new WebAssembly.Memory({ + initial: pages, + maximum: pages, + shared: true, + }); +} + +function createChannel(pid: number, 
memory: WebAssembly.Memory): any { + return { + pid, + memory, + channelOffset: 0, + i32View: new Int32Array(memory.buffer, 0), + handling: false, + }; +} diff --git a/host/test/sysv-ipc.test.ts b/host/test/sysv-ipc.test.ts index 5ad55c9e1..93a178622 100644 --- a/host/test/sysv-ipc.test.ts +++ b/host/test/sysv-ipc.test.ts @@ -2,16 +2,132 @@ * Tests for SysV IPC: message queues, semaphores, and shared memory. * Verifies that the SharedIpcTable is properly wired up in the kernel worker. */ -import { describe, it, expect } from "vitest"; +import { describe, it, expect, vi } from "vitest"; import { join, dirname } from "node:path"; import { existsSync } from "node:fs"; import { fileURLToPath } from "node:url"; import { runCentralizedProgram } from "./centralized-test-helper"; +import { CentralizedKernelWorker } from "../src/kernel-worker"; const __dirname = dirname(fileURLToPath(import.meta.url)); const ipcBinary = join(__dirname, "../../examples/sysv_ipc_test.wasm"); const hasBinary = existsSync(ipcBinary); +function createSysvSyncHarness() { + const pid = 101; + const segId = 7; + const mapAddr = 0x2000; + const size = 4096; + const processMemory = new WebAssembly.Memory({ initial: 1, maximum: 1, shared: true }); + const kernelMemory = new WebAssembly.Memory({ initial: 2 }); + const backing = new Uint8Array(size); + const writes: Array<{ segId: number; offset: number; bytes: Uint8Array }> = []; + + const writeChunk = vi.fn((shmid: number, offset: number, dataPtr: number, dataLen: number) => { + const kernelMem = new Uint8Array(kernelMemory.buffer); + const bytes = kernelMem.slice(dataPtr, dataPtr + dataLen); + backing.set(bytes, offset); + writes.push({ segId: shmid, offset, bytes }); + return dataLen; + }); + + const readChunk = vi.fn((shmid: number, offset: number, outPtr: number, maxLen: number) => { + expect(shmid).toBe(segId); + const len = Math.min(maxLen, backing.length - offset); + new Uint8Array(kernelMemory.buffer).set(backing.subarray(offset, offset + 
len), outPtr); + return len; + }); + + const kw = Object.assign(Object.create(CentralizedKernelWorker.prototype), { + kernel: { toKernelPtr: (value: number | bigint) => Number(value) }, + kernelMemory, + kernelInstance: { + exports: { + kernel_set_current_pid: vi.fn(), + kernel_ipc_shm_write_chunk: writeChunk, + kernel_ipc_shm_read_chunk: readChunk, + kernel_ipc_shmdt: vi.fn(() => 0), + }, + }, + processes: new Map([ + [pid, { memory: processMemory, ptrWidth: 4 }], + ]), + shmMappings: new Map([ + [ + pid, + new Map([ + [ + mapAddr, + { + segId, + size, + readOnly: false, + snapshot: new Uint8Array(size), + version: 0, + }, + ], + ]), + ], + ]), + shmSegmentVersions: new Map([[segId, 0]]), + scratchOffset: 0, + }) as CentralizedKernelWorker; + + const channel = { + pid, + memory: processMemory, + channelOffset: 0, + i32View: new Int32Array(processMemory.buffer, 0, 1), + consecutiveSyscalls: 0, + }; + + return { + pid, + segId, + mapAddr, + processMemory, + processMem: new Uint8Array(processMemory.buffer), + kw, + channel, + writes, + writeChunk, + }; +} + +describe("SysV shared-memory host synchronization", () => { + it("skips single-observer syscall-boundary publishes but forces observer handoff publishes", () => { + const { + pid, + segId, + mapAddr, + processMem, + kw, + channel, + writes, + writeChunk, + } = createSysvSyncHarness(); + + processMem[mapAddr + 17] = 0x7b; + + (kw as any).synchronizeSysvShmMappingsForSyscallBoundary(channel); + expect(writeChunk).not.toHaveBeenCalled(); + + (kw as any).syncSysvShmSegmentFromMappedProcesses(segId); + expect(writes).toHaveLength(1); + expect(writes[0]!.segId).toBe(segId); + expect(writes[0]!.offset).toBe(0); + expect(writes[0]!.bytes[17]).toBe(0x7b); + + writes.length = 0; + writeChunk.mockClear(); + processMem[mapAddr + 41] = 0xa5; + + (kw as any).releaseAllSysvShmMappingsForProcess(pid); + expect(writeChunk).toHaveBeenCalledTimes(1); + expect(writes[0]!.bytes[41]).toBe(0xa5); + }); +}); + 
describe.skipIf(!hasBinary)("SysV IPC", () => { it("message queues, semaphores, shared memory", async () => { const result = await runCentralizedProgram({ diff --git a/host/test/vfs.test.ts b/host/test/vfs.test.ts index 12288acde..a24d95a16 100644 --- a/host/test/vfs.test.ts +++ b/host/test/vfs.test.ts @@ -1,5 +1,5 @@ -import { describe, it, expect, beforeEach, afterEach } from "vitest"; -import { mkdtempSync, writeFileSync, readFileSync, rmSync } from "node:fs"; +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; +import { mkdirSync, mkdtempSync, writeFileSync, readFileSync, rmSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { VirtualPlatformIO } from "../src/vfs/vfs"; @@ -356,8 +356,14 @@ describe("HostFileSystem path traversal", () => { }); it("rejects paths with embedded .. sequences", () => { - const hfs = new HostFileSystem("/tmp/sandbox"); - expect(() => hfs.stat("/subdir/../../etc/passwd")).toThrow("EACCES"); + const root = mkdtempSync(join(tmpdir(), "kandelo-vfs-")); + try { + mkdirSync(join(root, "subdir")); + const hfs = new HostFileSystem(root); + expect(() => hfs.stat("/subdir/../../etc/passwd")).toThrow("EACCES"); + } finally { + rmSync(root, { recursive: true, force: true }); + } }); }); diff --git a/libc/glue/abi_constants.h b/libc/glue/abi_constants.h index 690d0b988..8fb227203 100644 --- a/libc/glue/abi_constants.h +++ b/libc/glue/abi_constants.h @@ -4,7 +4,7 @@ #define WASM_POSIX_ABI_CONSTANTS_H /* Mirrors wasm_posix_shared::ABI_VERSION. */ -#define WASM_POSIX_ABI_VERSION 15u +#define WASM_POSIX_ABI_VERSION 16u /* Default process-wasm pthread slot declaration. */ #define WASM_POSIX_THREAD_SLOT_DECL_DEFAULT -1