From 035514cb5582c8ff09a7026f83475a538d70acf9 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 10:24:23 +0000 Subject: [PATCH 01/16] buildroot: enable busybox init and prefer /sbin/init Enables BR2_PACKAGE_BUSYBOX + BR2_INIT_BUSYBOX so buildroot installs /sbin/init as a symlink to /bin/busybox. Kernel's load_init_bytes now searches /sbin/init first; /bin/init (Rust fallback) stays in the overlay so a single-line reorder reverts the swap if bring-up gets stuck. Co-Authored-By: Claude Opus 4.7 (1M context) --- configs/solaya_riscv64_buildroot_defconfig.in | 19 +++++++++++-------- crates/kernel/src/processes/process_table.rs | 10 ++++------ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/configs/solaya_riscv64_buildroot_defconfig.in b/configs/solaya_riscv64_buildroot_defconfig.in index 9513d581..1b603a73 100644 --- a/configs/solaya_riscv64_buildroot_defconfig.in +++ b/configs/solaya_riscv64_buildroot_defconfig.in @@ -35,14 +35,17 @@ BR2_TOOLCHAIN_EXTERNAL_BOOTLIN_RISCV64_LP64D_MUSL_STABLE=y # are separate follow-up features. BR2_STATIC_LIBS=y -# --- Init system: Solaya's Rust init stays PID 1 ------------------------- -# Buildroot provides no init (we don't use busybox init yet — needs kernel -# AF_UNIX socketpair + shebang execve support). Our Rust init.rs runs -# as PID 1 from /bin/init and execs /bin/dash directly. -BR2_INIT_NONE=y -# BR2_PACKAGE_BUSYBOX is not set — dropping busybox entirely also -# auto-enables BR2_PACKAGE_BUSYBOX_SHOW_OTHERS, which is required by the -# coreutils package selection below. +# --- Init system: busybox init as PID 1 ---------------------------------- +# BR2_INIT_BUSYBOX installs /sbin/init as a symlink to /bin/busybox, and +# busybox reads /etc/inittab (provided via the overlay at +# configs/overlay/etc/inittab). The kernel's load_init_bytes tries +# /sbin/init first, so busybox wins; /bin/init (our Rust fallback) stays +# in the overlay for one-line rollback during the initial bring-up. 
+BR2_INIT_BUSYBOX=y +BR2_PACKAGE_BUSYBOX=y +# Rely on the default buildroot busybox config (includes init + sh + the +# usual applets). Ship a BR2_PACKAGE_BUSYBOX_CONFIG_FRAGMENT_FILES only +# if the bring-up loop proves we need a specific applet toggled. # --- Shell + coreutils --------------------------------------------------- BR2_PACKAGE_DASH=y diff --git a/crates/kernel/src/processes/process_table.rs b/crates/kernel/src/processes/process_table.rs index c626c471..7f3e159b 100644 --- a/crates/kernel/src/processes/process_table.rs +++ b/crates/kernel/src/processes/process_table.rs @@ -55,13 +55,11 @@ pub fn init() { /// Source the PID-1 ELF image from the initramfs-populated rootfs. /// -/// /bin/init is Solaya's Rust init (delivered via the buildroot overlay). -/// /sbin/init would be busybox if we ever flipped back to it — keeping -/// both paths means swapping busybox in is a one-line reorder plus the -/// AF_UNIX socketpair + shebang execve kernel work that's currently on -/// the follow-up list. +/// /sbin/init is buildroot's busybox (symlink to /bin/busybox). /bin/init +/// is Solaya's Rust init, kept as a fallback during the busybox bring-up +/// so reverting is a one-line reorder. fn load_init_bytes() -> Arc<[u8]> { - const INIT_PATHS: &[&str] = &["/bin/init", "/sbin/init", "/init"]; + const INIT_PATHS: &[&str] = &["/sbin/init", "/bin/init", "/init"]; for path in INIT_PATHS { let Ok(node) = fs::resolve_path(path) else { continue; From a169414944df1f64775f05e2df588ccd97d7efe3 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 10:28:31 +0000 Subject: [PATCH 02/16] net: return EAFNOSUPPORT/EPROTONOSUPPORT instead of panicking do_socket was asserting on AF != AF_INET and panicking on unknown socket types. A syscall driven by userspace input must never panic the kernel. Busybox init calls socket(AF_UNIX, ...) 
during startup and hit the assert immediately; returning the proper errnos lets init proceed so we can see what it actually needs next. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/syscalls/net_ops.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/kernel/src/syscalls/net_ops.rs b/crates/kernel/src/syscalls/net_ops.rs index 5fe4a2a5..c2e8f201 100644 --- a/crates/kernel/src/syscalls/net_ops.rs +++ b/crates/kernel/src/syscalls/net_ops.rs @@ -25,15 +25,14 @@ impl LinuxSyscallHandler { typ: c_int, _protocol: c_int, ) -> Result { - assert!( - domain == AF_INET, - "socket: only AF_INET supported (got domain={domain})" - ); + if domain != AF_INET { + return Err(Errno::EAFNOSUPPORT); + } let masked_type = typ & !SOCK_CLOEXEC; let descriptor = match masked_type { SOCK_DGRAM => FileDescriptor::UnboundUdpSocket, SOCK_STREAM => FileDescriptor::UnboundTcpSocket, - _ => panic!("socket: unsupported type {typ:#x}"), + _ => return Err(Errno::EPROTONOSUPPORT), }; let fd = self .current_process From 3a0bad0442ef033176b912d315321f535af2755f Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 10:32:23 +0000 Subject: [PATCH 03/16] exec: add shebang support for script execution Booting a busybox userspace crashed the kernel with "Cannot parse ELF file: MagicNumberWrong" in do_execve when PID 1 tried to exec /etc/init.d/rcS (a `#!/bin/dash` script). do_execve now peeks the first two bytes of the file, and on a `#!` header resolves the interpreter path + optional arg and re-execs against it, following up to MAX_SHEBANG_DEPTH=4 layers (ELOOP beyond). Malformed shebangs or non-ELF final files return ENOEXEC instead of panicking, so userspace input can no longer crash the kernel. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/syscalls/exec_ops.rs | 127 +++++++++++++++++++++++-- 1 file changed, 120 insertions(+), 7 deletions(-) diff --git a/crates/kernel/src/syscalls/exec_ops.rs b/crates/kernel/src/syscalls/exec_ops.rs index 4375ac0e..848c9a0b 100644 --- a/crates/kernel/src/syscalls/exec_ops.rs +++ b/crates/kernel/src/syscalls/exec_ops.rs @@ -51,7 +51,6 @@ impl LinuxSyscallHandler { })?; let mut buf = ConsumableBuffer::new(&filename_bytes); let filename_str = buf.consume_str().ok_or(Errno::EFAULT)?; - let name = filename_str.rsplit('/').next().unwrap_or(filename_str); let argv_buffers = self.read_string_array(argv)?; let mut args: Vec<&str> = Vec::new(); @@ -78,15 +77,25 @@ impl LinuxSyscallHandler { let old_cwd_str = self.get_process().with_lock(|p| String::from(p.cwd())); - // Resolve the filename against the VFS: absolute paths walk the - // mount tree, relative paths get rebased against cwd. Errors - // (ENOENT, EACCES, ELOOP, EIO, E2BIG) propagate to userspace as-is. - let vfs_bytes = try_read_from_vfs(filename_str, &old_cwd_str)?; + // Resolve the filename (plus any shebang layers) against the VFS. + // Errors (ENOENT, EACCES, ELOOP, EIO, E2BIG, ENOEXEC) propagate to + // userspace as-is. + let (vfs_bytes, final_argv) = resolve_shebang(filename_str, &args, &old_cwd_str)?; let elf_arc: Arc<[u8]> = Arc::<[u8]>::from(vfs_bytes.as_slice()); - let elf = ElfFile::parse(&elf_arc).expect("Cannot parse ELF file"); + // After shebang resolution, argv[0] is the interpreter path (or the + // original filename if no shebang); the binary's basename becomes + // the process name. 
+ let resolved_path = &final_argv[0]; + let name = resolved_path + .rsplit('/') + .next() + .unwrap_or(resolved_path.as_str()); + let args_refs: Vec<&str> = final_argv.iter().skip(1).map(String::as_str).collect(); + + let elf = ElfFile::parse(&elf_arc).map_err(|_| Errno::ENOEXEC)?; let loaded = - loader::load_elf(&elf, name, &args, &env_strs).expect("ELF loading must succeed"); + loader::load_elf(&elf, name, &args_refs, &env_strs).expect("ELF loading must succeed"); let process_name = Arc::new(String::from(name)); let old_process = self.get_process(); @@ -137,6 +146,110 @@ impl LinuxSyscallHandler { } } +/// Linux caps shebang recursion at 4 layers (`BINPRM_MAX_RECURSION`). +const MAX_SHEBANG_DEPTH: usize = 4; + +/// Maximum bytes of the first line we inspect for a `#!` header. Matches +/// Linux's `BINPRM_BUF_SIZE` envelope: `#!` + 255 interpreter bytes + `\n`. +const SHEBANG_MAX_LINE: usize = 257; + +/// Parse a `#!` line. Returns `(interpreter, optional_arg)` where the +/// optional arg is the remainder of the line after the interpreter token, +/// treated as a single argument (matching Linux behavior — no splitting). +/// +/// Returns `Err(Errno::ENOEXEC)` if the shebang header is malformed (no +/// newline within the bound, empty interpreter, ...). +fn parse_shebang(bytes: &[u8]) -> Result<(String, Option<String>), Errno> { + let scan_len = bytes.len().min(SHEBANG_MAX_LINE); + let line_end = bytes[..scan_len] + .iter() + .position(|&b| b == b'\n') + .ok_or(Errno::ENOEXEC)?; + // Skip the `#!` prefix then leading spaces/tabs. + let after_bang = &bytes[2..line_end]; + let start = after_bang + .iter() + .position(|&b| b != b' ' && b != b'\t') + .ok_or(Errno::ENOEXEC)?; + let rest = &after_bang[start..]; + let interp_end = rest + .iter() + .position(|&b| b == b' ' || b == b'\t') + .unwrap_or(rest.len()); + if interp_end == 0 { + return Err(Errno::ENOEXEC); + } + let interpreter = core::str::from_utf8(&rest[..interp_end]) + .map_err(|_| Errno::ENOEXEC)? 
+ .to_string(); + let arg_region = &rest[interp_end..]; + let arg_start = arg_region + .iter() + .position(|&b| b != b' ' && b != b'\t') + .unwrap_or(arg_region.len()); + let trimmed = &arg_region[arg_start..]; + let trimmed_end = trimmed + .iter() + .rposition(|&b| b != b' ' && b != b'\t') + .map(|p| p + 1) + .unwrap_or(0); + let optional_arg = if trimmed_end == 0 { + None + } else { + Some( + core::str::from_utf8(&trimmed[..trimmed_end]) + .map_err(|_| Errno::ENOEXEC)? + .to_string(), + ) + }; + Ok((interpreter, optional_arg)) +} + +/// Read the file at `filename`, following up to `MAX_SHEBANG_DEPTH` layers +/// of `#!` indirection. Returns the final binary bytes plus the full argv +/// (argv[0] is the resolved interpreter or original filename, followed by +/// any shebang-contributed args, then the caller's `trailing_args`). +fn resolve_shebang( + filename: &str, + trailing_args: &[&str], + cwd: &str, +) -> Result<(Vec<u8>, Vec<String>), Errno> { + let mut current_path = String::from(filename); + // Per-layer (optional_arg, script_path) in discovery order (outermost + // first). On exit, innermost interpreter path is `current_path`. + let mut layers: Vec<(Option<String>, String)> = Vec::new(); + let bytes = loop { + let bytes = try_read_from_vfs(&current_path, cwd)?; + if bytes.len() < 2 || &bytes[..2] != b"#!" { + break bytes; + } + if layers.len() >= MAX_SHEBANG_DEPTH { + return Err(Errno::ELOOP); + } + let (interpreter, optional_arg) = parse_shebang(&bytes)?; + layers.push((optional_arg, current_path)); + current_path = interpreter; + }; + // Assemble argv. Linux semantics: + // argv[0] = innermost interpreter (the actual binary) + // then, unwinding innermost-layer first: + // if that layer had an optional arg: push it + // push that layer's script path + // then trailing_args (argv[1..] 
from the original execve call) + let mut argv: Vec<String> = Vec::with_capacity(1 + layers.len() * 2 + trailing_args.len()); + argv.push(current_path); + for (opt_arg, script) in layers.into_iter().rev() { + if let Some(a) = opt_arg { + argv.push(a); + } + argv.push(script); + } + for a in trailing_args { + argv.push(String::from(*a)); + } + Ok((bytes, argv)) +} + /// Read the whole file at `filename` into memory, preserving the VFS /// errno so userspace can distinguish ENOENT / EACCES / ELOOP / EIO. /// From ee70b93e6cb2cb632651edc5b09bebacd1c9e0d8 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 10:41:11 +0000 Subject: [PATCH 04/16] signal: implement rt_sigtimedwait for busybox init Busybox init's event loop spins on rt_sigtimedwait waiting for SIGCHLD (and SIGHUP/SIGUSR*/SIGTERM). Without the syscall we returned ENOSYS, so init hot-looped and dash respawned forever. Implement the NULL-info infinite-wait path (and a zero-timeout poll path) that busybox uses: dequeue the lowest pending signal in the caller's set, or block on a new per-thread signal_waker that send_signal wakes regardless of sigmask (critical because callers typically block the set beforehand). SIGKILL/SIGSTOP are stripped from the wait mask. Non-NULL siginfo and finite non-zero timeouts are rejected with EINVAL for now. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/processes/process_table.rs | 2 + crates/kernel/src/processes/signal.rs | 9 +++ crates/kernel/src/processes/thread.rs | 22 ++++++ crates/kernel/src/syscalls/linux.rs | 12 +++ crates/kernel/src/syscalls/signal_ops.rs | 79 +++++++++++++++++++- 5 files changed, 122 insertions(+), 2 deletions(-) diff --git a/crates/kernel/src/processes/process_table.rs b/crates/kernel/src/processes/process_table.rs index 7f3e159b..2515357b 100644 --- a/crates/kernel/src/processes/process_table.rs +++ b/crates/kernel/src/processes/process_table.rs @@ -245,6 +245,8 @@ impl ProcessTable { return false; } t.raise_signal(sig); + // Wake any waiter (e.g. rt_sigtimedwait) regardless of sigmask. + t.wake_signal_waker(); // SIGCONT resumes stopped threads if sig == headers::syscall_types::SIGCONT && t.get_state() == ThreadState::Stopped { t.clear_pending_stop_signals(); diff --git a/crates/kernel/src/processes/signal.rs b/crates/kernel/src/processes/signal.rs index 0219301c..abe813d5 100644 --- a/crates/kernel/src/processes/signal.rs +++ b/crates/kernel/src/processes/signal.rs @@ -56,6 +56,15 @@ impl PendingSignals { } Some(deliverable.trailing_zeros()) } + + /// Lowest-numbered pending signal that is in `set`. 
+ pub fn first_in(&self, set: u64) -> Option { + let matched = self.0 & set; + if matched == 0 { + return None; + } + Some(matched.trailing_zeros()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/crates/kernel/src/processes/thread.rs b/crates/kernel/src/processes/thread.rs index bb8ffa3c..456ea9ae 100644 --- a/crates/kernel/src/processes/thread.rs +++ b/crates/kernel/src/processes/thread.rs @@ -27,6 +27,7 @@ use core::{ fmt::Debug, ptr::null_mut, sync::atomic::{AtomicU64, Ordering}, + task::Waker, }; use headers::{ errno::Errno, @@ -108,6 +109,7 @@ pub struct Thread { pub stopped_notified: bool, pub stop_signal: u32, thread_name: Option, + signal_waker: Option, } impl core::fmt::Display for Thread { @@ -258,6 +260,7 @@ impl Thread { stopped_notified: false, stop_signal: 0, thread_name: None, + signal_waker: None, })) } @@ -523,6 +526,25 @@ impl Thread { Some(sig) } + /// Lowest-numbered pending signal that is in `set`, regardless of sigmask. + pub fn first_pending_in_set(&self, set: u64) -> Option { + self.signal_state.pending.first_in(set) + } + + pub fn clear_pending(&mut self, sig: u32) { + self.signal_state.pending.clear(sig); + } + + pub fn register_signal_waker(&mut self, waker: Waker) { + self.signal_waker = Some(waker); + } + + pub fn wake_signal_waker(&mut self) { + if let Some(w) = self.signal_waker.take() { + w.wake(); + } + } + pub fn get_sigaction_raw(&self, sig: u32) -> &sigaction { &self.signal_state.sigaction[sig as usize] } diff --git a/crates/kernel/src/syscalls/linux.rs b/crates/kernel/src/syscalls/linux.rs index 2412f623..9f1bef53 100644 --- a/crates/kernel/src/syscalls/linux.rs +++ b/crates/kernel/src/syscalls/linux.rs @@ -108,6 +108,7 @@ linux_syscalls! 
{ SYSCALL_NR_RT_SIGACTION => rt_sigaction(sig: c_uint, act: Option<*const sigaction>, oact: Option<*mut sigaction>, sigsetsize: usize); SYSCALL_NR_RT_SIGPROCMASK => rt_sigprocmask(how: c_uint, set: Option<*const sigset_t>, oldset: Option<*mut sigset_t>, sigsetsize: usize); SYSCALL_NR_RT_SIGRETURN => rt_sigreturn(); + SYSCALL_NR_RT_SIGTIMEDWAIT => rt_sigtimedwait(set: *const sigset_t, info: Option<*mut u8>, timeout: Option<*const timespec>, sigsetsize: usize); SYSCALL_NR_SENDFILE => sendfile(out_fd: c_int, in_fd: c_int, offset: Option<*mut isize>, count: usize); SYSCALL_NR_SENDTO => sendto(fd: c_int, buf: *const u8, len: usize, flags: c_int, dest_addr: *const u8, addrlen: c_uint); SYSCALL_NR_SETGID => setgid(gid: c_uint); @@ -505,6 +506,17 @@ impl LinuxSyscalls for LinuxSyscallHandler { self.do_rt_sigreturn() } + async fn rt_sigtimedwait( + &mut self, + set: LinuxUserspaceArg<*const sigset_t>, + info: LinuxUserspaceArg>, + timeout: LinuxUserspaceArg>, + sigsetsize: usize, + ) -> Result { + self.do_rt_sigtimedwait(set, info, timeout, sigsetsize) + .await + } + async fn sigaltstack( &mut self, uss: LinuxUserspaceArg>, diff --git a/crates/kernel/src/syscalls/signal_ops.rs b/crates/kernel/src/syscalls/signal_ops.rs index a18d1dca..53967457 100644 --- a/crates/kernel/src/syscalls/signal_ops.rs +++ b/crates/kernel/src/syscalls/signal_ops.rs @@ -1,12 +1,21 @@ -use core::ffi::{c_int, c_uint}; +use core::{ + ffi::{c_int, c_uint}, + future::Future, + pin::Pin, + task::{Context, Poll}, +}; use headers::{ errno::Errno, syscall_types::{ _NSIG, SIG_BLOCK, SIG_SETMASK, SIG_UNBLOCK, SIGKILL, SIGSTOP, sigaction, sigset_t, stack_t, + timespec, }, }; -use crate::{processes::process_table, syscalls::linux_validator::LinuxUserspaceArg}; +use crate::{ + processes::{process_table, thread::ThreadRef}, + syscalls::linux_validator::LinuxUserspaceArg, +}; use abi::pid::Tid; use super::linux::LinuxSyscallHandler; @@ -122,4 +131,70 @@ impl LinuxSyscallHandler { })?; Ok(0) } + + pub(super) async 
fn do_rt_sigtimedwait( + &self, + set: LinuxUserspaceArg<*const sigset_t>, + info: LinuxUserspaceArg>, + timeout: LinuxUserspaceArg>, + sigsetsize: usize, + ) -> Result { + if sigsetsize != core::mem::size_of::() { + return Err(Errno::EINVAL); + } + // NULL-info is the only supported caller path for now. + if info.arg_nonzero() { + return Err(Errno::EINVAL); + } + let set = set.validate_ptr()?; + // SIGKILL/SIGSTOP cannot be waited for — strip them from the wait set. + let wait_mask = set.sig[0] & !(1u64 << SIGKILL) & !(1u64 << SIGSTOP); + + if let Some(t) = timeout.validate_ptr()? { + if t.tv_sec == 0 && t.tv_nsec == 0 { + // Poll: dequeue a matching pending signal, or EAGAIN. + return self.current_thread.with_lock(|mut th| { + match th.first_pending_in_set(wait_mask) { + Some(sig) => { + th.clear_pending(sig); + Ok(sig as isize) + } + None => Err(Errno::EAGAIN), + } + }); + } + // Finite non-zero timeouts are out of scope for now. + return Err(Errno::EINVAL); + } + + SigTimedWait { + thread: self.current_thread.clone(), + wait_mask, + } + .await + } +} + +struct SigTimedWait { + thread: ThreadRef, + wait_mask: u64, +} + +impl Future for SigTimedWait { + type Output = Result; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.thread.with_lock(|mut t| { + if let Some(sig) = t.first_pending_in_set(self.wait_mask) { + t.clear_pending(sig); + return Poll::Ready(Ok(sig as isize)); + } + // If an unblocked signal not in set is pending, the scheduler's + // Interrupt path will deliver EINTR after we return Pending. + // Register our waker so send_signal wakes us for blocked signals + // in `set`. 
+ t.register_signal_waker(cx.waker().clone()); + Poll::Pending + }) + } } From 152e66f9445a9215dd5a30ce14064b527dc3d777 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 10:43:08 +0000 Subject: [PATCH 05/16] qemu-infra: sync on shell prompt instead of Rust-init banners The boot-synchronization expectations for "init process started" and "starting shell" were printed by the deprecated Rust init. Busybox init (now PID 1 via /sbin/init) does not emit them, so every system test was hanging on boot. Drop those markers and let the shell prompt be the sync point; keep the dhcpd marker when the test boots with networking since our Rust dhcpd still runs via inittab's ::wait:/bin/dhcpd entry. Co-Authored-By: Claude Opus 4.7 (1M context) --- qemu-infra/src/qemu.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/qemu-infra/src/qemu.rs b/qemu-infra/src/qemu.rs index 26108d22..76c8c03a 100644 --- a/qemu-infra/src/qemu.rs +++ b/qemu-infra/src/qemu.rs @@ -179,13 +179,15 @@ impl QemuInstance { stdout.assert_read_until("kernel_init done!").await?; // After kernel_init, async kernel tasks (like ext2 mount) run concurrently - // with the init process. Accumulate boot output to check if the ext2 init - // message was already seen before the prompt. - let mut boot_tail = stdout.assert_read_until("init process started").await?; + // with busybox init. Accumulate boot output to check if the ext2 init + // message was already seen before the prompt. Busybox init is now PID 1 + // (see crates/kernel/src/processes/process_table.rs load_init_bytes); + // it does not emit Rust-init's old banner lines, so we sync on dhcpd + // (when networked) and the shell prompt from dash's `console::respawn`. 
+ let mut boot_tail = Vec::new(); if network_port.is_some() { boot_tail.extend(stdout.assert_read_until("dhcpd: configured ip").await?); } - boot_tail.extend(stdout.assert_read_until("starting shell").await?); boot_tail.extend(stdout.assert_read_until(PROMPT).await?); if has_block_device { From 5680ed4da7083bd90c61b14a0587a1774bee0014 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 11:19:49 +0000 Subject: [PATCH 06/16] fs: map open(/dev/console) to FileDescriptor::Tty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Busybox init's console::respawn child closes FDs 0/1/2 and reopens /dev/console before exec'ing dash. Before this change the reopened fd was FileDescriptor::VfsFile, whose read path drains ConsoleCharDevice synchronously and returns EAGAIN when the TTY buffer is empty — so dash's blocking read on stdin never unblocked and typed input was silently dropped. do_openat now recognises the console char device via Arc identity (new CONSOLE_CHAR_DEVICE handle + as-char-device VfsNode hook) and produces FileDescriptor::Tty, which blocks via the async ReadTty future like the initial FDs in FdTable::new. Two supporting fixes fell out: - send_signal called ThreadWaker::wake() while holding the target thread's lock; wake() re-locks the same thread, which deadlocked on the same CPU the first time busybox dash self-suspended via kill(0, SIGTTIN). Take the waker out first, drop the lock, then wake. - busybox init calls setsid() in the spawned child before opening the console, so dash's pgid (= its tid) never matches the TTY's default fg_pgid (=1) and dash loops on SIGTTIN to stop itself until foregrounded. On the openat path, set fg_pgid to the opener's pgid. Proper TIOCSCTTY/ctty handling remains deferred to #250. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/fs/devfs.rs | 4 ++++ crates/kernel/src/fs/vfs.rs | 10 +++++++++- crates/kernel/src/io/uart.rs | 18 +++++++++++++++++ crates/kernel/src/processes/process_table.rs | 19 +++++++++++------- crates/kernel/src/processes/thread.rs | 10 ++++++---- crates/kernel/src/syscalls/fs_ops.rs | 21 +++++++++++++++++--- 6 files changed, 67 insertions(+), 15 deletions(-) diff --git a/crates/kernel/src/fs/devfs.rs b/crates/kernel/src/fs/devfs.rs index a26eaf35..9a285a95 100644 --- a/crates/kernel/src/fs/devfs.rs +++ b/crates/kernel/src/fs/devfs.rs @@ -96,6 +96,10 @@ impl VfsNode for CharNode { fn truncate(&self, _length: usize) -> Result<(), Errno> { Ok(()) } + + fn char_device(&self) -> Option> { + Some(self.device.clone()) + } } struct DisplayNode { diff --git a/crates/kernel/src/fs/vfs.rs b/crates/kernel/src/fs/vfs.rs index 91d998ac..0c063ecc 100644 --- a/crates/kernel/src/fs/vfs.rs +++ b/crates/kernel/src/fs/vfs.rs @@ -5,7 +5,7 @@ use alloc::{ vec::Vec, }; use core::sync::atomic::{AtomicU64, Ordering}; -use driver_api::BlockDevice; +use driver_api::{BlockDevice, CharDevice}; use headers::errno::Errno; use hal::spinlock::Spinlock; @@ -179,6 +179,14 @@ pub trait VfsNode: Send + Sync { None } + /// If this node is backed by a character device, return an + /// `Arc`. Used by `openat` to recognise `/dev/console` + /// and produce a blocking `FileDescriptor::Tty` instead of the default + /// non-blocking `VfsFile`. 
+ fn char_device(&self) -> Option> { + None + } + fn atime(&self) -> (i64, u32) { (0, 0) } diff --git a/crates/kernel/src/io/uart.rs b/crates/kernel/src/io/uart.rs index 6b2c2091..a7ba1c69 100644 --- a/crates/kernel/src/io/uart.rs +++ b/crates/kernel/src/io/uart.rs @@ -10,9 +10,26 @@ use core::sync::atomic::{AtomicU8, Ordering}; use alloc::sync::Arc; use driver_api::{CharDevice, IoError, IrqHandler}; use headers::errno::Errno; +use klib::runtime_initialized::RuntimeInitializedData; pub use console::uart::CONSOLE_UART; +/// The console `CharDevice` Arc registered in devfs as `/dev/console`. +/// Set by [`register_console_char_device`]; used by [`is_console_char_device`] +/// to recognise reopened console FDs in `openat` so they become +/// `FileDescriptor::Tty` (which blocks correctly) instead of the default +/// `FileDescriptor::VfsFile` path (which drains the TTY buffer synchronously +/// and returns EAGAIN, so blocking reads never unblock). +static CONSOLE_CHAR_DEVICE: RuntimeInitializedData> = + RuntimeInitializedData::new(); + +/// Returns true if `dev` is the same `Arc` as the one registered for +/// `/dev/console`. Arc-identity comparison — safe against future aliasing +/// (e.g. if we ever expose the same CharDevice under multiple devfs names). +pub fn is_console_char_device(dev: &Arc) -> bool { + CONSOLE_CHAR_DEVICE.is_initialized() && Arc::ptr_eq(dev, &CONSOLE_CHAR_DEVICE) +} + /// `CharDevice` adapter for the console UART. /// /// Carries the TTY line discipline internally: `write` goes through the TTY @@ -53,6 +70,7 @@ impl CharDevice for ConsoleCharDevice { /// devfs. Called once during kernel init. 
pub fn register_console_char_device() { let device: Arc = Arc::new(ConsoleCharDevice); + CONSOLE_CHAR_DEVICE.initialize(device.clone()); crate::drivers::registry::().register(device.clone()); crate::fs::devfs::register_char_device("console", device); } diff --git a/crates/kernel/src/processes/process_table.rs b/crates/kernel/src/processes/process_table.rs index 2515357b..a73bff9a 100644 --- a/crates/kernel/src/processes/process_table.rs +++ b/crates/kernel/src/processes/process_table.rs @@ -240,25 +240,30 @@ impl ProcessTable { pub fn send_signal(&mut self, tid: Tid, sig: u32) { if let Some(thread) = self.threads.get(&tid).cloned() { - let should_enqueue = thread.with_lock(|mut t| { + // Collect the signal waker while we hold the thread lock, but + // call wake() only after releasing it — ThreadWaker::wake re-locks + // the same thread, which would deadlock on the same CPU. + let (should_enqueue, signal_waker) = thread.with_lock(|mut t| { if matches!(t.get_state(), ThreadState::Zombie(_)) { - return false; + return (false, None); } t.raise_signal(sig); - // Wake any waiter (e.g. rt_sigtimedwait) regardless of sigmask. 
- t.wake_signal_waker(); + let waker = t.take_signal_waker(); // SIGCONT resumes stopped threads if sig == headers::syscall_types::SIGCONT && t.get_state() == ThreadState::Stopped { t.clear_pending_stop_signals(); t.set_state(ThreadState::Runnable); - return true; + return (true, waker); } if t.has_pending_unblocked_signal() && t.get_state() == ThreadState::Waiting { t.set_state(ThreadState::Runnable); - return true; + return (true, waker); } - false + (false, waker) }); + if let Some(w) = signal_waker { + w.wake(); + } if should_enqueue { RUN_QUEUE.lock().push_back(thread); } diff --git a/crates/kernel/src/processes/thread.rs b/crates/kernel/src/processes/thread.rs index 456ea9ae..a952930b 100644 --- a/crates/kernel/src/processes/thread.rs +++ b/crates/kernel/src/processes/thread.rs @@ -539,10 +539,12 @@ impl Thread { self.signal_waker = Some(waker); } - pub fn wake_signal_waker(&mut self) { - if let Some(w) = self.signal_waker.take() { - w.wake(); - } + /// Detach the registered signal waker, if any. The caller must invoke + /// `wake()` on the returned `Waker` AFTER releasing the thread lock — + /// `ThreadWaker::wake` re-locks the same thread, so calling it while + /// holding the thread lock deadlocks. + pub fn take_signal_waker(&mut self) -> Option { + self.signal_waker.take() } pub fn get_sigaction_raw(&self, sig: u32) -> &sigaction { diff --git a/crates/kernel/src/syscalls/fs_ops.rs b/crates/kernel/src/syscalls/fs_ops.rs index 4115ddfa..4de2b06f 100644 --- a/crates/kernel/src/syscalls/fs_ops.rs +++ b/crates/kernel/src/syscalls/fs_ops.rs @@ -64,11 +64,26 @@ impl LinuxSyscallHandler { return Err(Errno::ENOTDIR); } - let fd_abs = compose_abs(&base_abs, &raw_path); - let open_file = fs::open_file::open(node, flags, fd_abs); + let descriptor = if let Some(dev) = node.char_device() + && crate::io::uart::is_console_char_device(&dev) + { + // Implicit-ctty: give the opener's pgid the TTY's foreground + // group. 
busybox init's child calls setsid() before opening the + // console; without this, dash's job-control startup sees + // fg_pgid != getpgrp() and self-stops via SIGTTIN. Proper + // TIOCSCTTY on-open is #250 — this keeps things unblocked. + let caller_pgid = self.current_process.with_lock(|p| p.pgid()); + crate::io::tty_device::console_tty() + .lock() + .set_fg_pgid(caller_pgid); + FileDescriptor::Tty(crate::io::tty_device::console_tty().clone()) + } else { + let fd_abs = compose_abs(&base_abs, &raw_path); + FileDescriptor::VfsFile(fs::open_file::open(node, flags, fd_abs)) + }; let fd = self .current_process - .with_lock(|p| p.fd_table().allocate(FileDescriptor::VfsFile(open_file)))?; + .with_lock(|p| p.fd_table().allocate(descriptor))?; Ok(fd as isize) } From 72911959aa547059e0bbcd1734e606d72d92148f Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 11:47:30 +0000 Subject: [PATCH 07/16] syscalls: implement reboot(2) and switch shutdown test to halt for busybox init Add a Linux-compatible reboot(2) handler (SYSCALL_NR_REBOOT = 142) so that busybox init's boot-time reboot(LINUX_REBOOT_CMD_CAD_OFF) no longer hits the UNIMPLEMENTED syscall path, and so userspace has a direct route to a clean system shutdown. Magic values follow the public man 2 reboot contract: magic1 must be 0xfee1dead and magic2 one of the documented constants. CAD_OFF/CAD_ON are no-ops; HALT/POWER_OFF print "shutting down system" and call qemu_exit::exit_success(); RESTART delegates to platform::reset::trigger_reset(). RESTART2, SW_SUSPEND, and KEXEC return EINVAL. Credential checks are deferred until we grow a capability subsystem. The shutdown system test now uses busybox's "halt -n" instead of dash's "exit". With busybox init as PID 1, dash's exit is caught by the console respawn entry in inittab, so the process table never empties. 
"halt -n" skips the unimplemented sync(2) call that plain "halt" makes, and reaches the shutdown path via init, producing the same literal "shutting down system" message the test already waits for. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/syscalls/linux.rs | 11 ++++++++ crates/kernel/src/syscalls/process_ops.rs | 33 ++++++++++++++++++++++- system-tests/src/tests/basics.rs | 2 +- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/crates/kernel/src/syscalls/linux.rs b/crates/kernel/src/syscalls/linux.rs index 9f1bef53..9955dd2d 100644 --- a/crates/kernel/src/syscalls/linux.rs +++ b/crates/kernel/src/syscalls/linux.rs @@ -101,6 +101,7 @@ linux_syscalls! { SYSCALL_NR_PRLIMIT64 => prlimit64(pid: c_int, resource: c_uint, new_limit: Option<*const u8>, old_limit: Option<*mut u8>); SYSCALL_NR_PWRITE64 => pwrite64(fd: c_int, buf: *const u8, count: usize, offset: isize); SYSCALL_NR_READ => read(fd: c_int, buf: *mut u8, count: usize); + SYSCALL_NR_REBOOT => reboot(magic1: c_int, magic2: c_int, cmd: c_uint, _arg: usize); SYSCALL_NR_READLINKAT => readlinkat(dirfd: c_int, pathname: *const u8, buf: *mut u8, bufsiz: usize); SYSCALL_NR_READV => readv(fd: c_int, iov: *const iovec, iovcnt: c_int); SYSCALL_NR_RECVFROM => recvfrom(fd: c_int, buf: *mut u8, len: usize, flags: c_int, src_addr: Option<*mut u8>, addrlen: Option<*mut c_uint>); @@ -529,6 +530,16 @@ impl LinuxSyscalls for LinuxSyscallHandler { self.do_kill(pid, sig) } + async fn reboot( + &mut self, + magic1: c_int, + magic2: c_int, + cmd: c_uint, + _arg: usize, + ) -> Result { + self.do_reboot(magic1, magic2, cmd) + } + async fn tgkill(&mut self, _tgid: c_int, tid: c_int, sig: c_int) -> Result { let target_tid = Tid::try_from_i32(tid).ok_or(Errno::ESRCH)?; if let Some(sig) = crate::processes::signal::validate_signal(sig)? 
{ diff --git a/crates/kernel/src/syscalls/process_ops.rs b/crates/kernel/src/syscalls/process_ops.rs index c0b296f5..023dfbcd 100644 --- a/crates/kernel/src/syscalls/process_ops.rs +++ b/crates/kernel/src/syscalls/process_ops.rs @@ -1,5 +1,5 @@ use alloc::{collections::BTreeMap, string::String, sync::Arc}; -use core::ffi::{c_int, c_ulong}; +use core::ffi::{c_int, c_uint, c_ulong}; use hal::spinlock::Spinlock; use headers::{ errno::Errno, @@ -8,6 +8,7 @@ use headers::{ use crate::{ cpu::Cpu, + info, memory::VirtAddr, processes::{ process::Process, @@ -21,6 +22,15 @@ use abi::{pid::Tid, syscalls::trap_frame::Register}; use super::linux::LinuxSyscallHandler; +const LINUX_REBOOT_MAGIC1: c_int = 0xfee1deadu32 as c_int; +const LINUX_REBOOT_MAGIC2_SET: &[c_int] = &[0x28121969, 0x05121996, 0x16041998, 0x20112000]; + +const LINUX_REBOOT_CMD_CAD_OFF: c_uint = 0x00000000; +const LINUX_REBOOT_CMD_CAD_ON: c_uint = 0x89abcdef; +const LINUX_REBOOT_CMD_HALT: c_uint = 0xcdef0123; +const LINUX_REBOOT_CMD_POWER_OFF: c_uint = 0x4321fedc; +const LINUX_REBOOT_CMD_RESTART: c_uint = 0x01234567; + impl LinuxSyscallHandler { pub(super) async fn clone_fork(&mut self, stack: usize) -> Result { let parent_regs = Cpu::read_trap_frame(); @@ -204,4 +214,25 @@ impl LinuxSyscallHandler { crate::debug!("Exit process with status: {status}\n"); Ok(0) } + + // TODO: require CAP_SYS_BOOT once credentials gain capability bits + pub(super) fn do_reboot( + &self, + magic1: c_int, + magic2: c_int, + cmd: c_uint, + ) -> Result { + if magic1 != LINUX_REBOOT_MAGIC1 || !LINUX_REBOOT_MAGIC2_SET.contains(&magic2) { + return Err(Errno::EINVAL); + } + match cmd { + LINUX_REBOOT_CMD_CAD_OFF | LINUX_REBOOT_CMD_CAD_ON => Ok(0), + LINUX_REBOOT_CMD_HALT | LINUX_REBOOT_CMD_POWER_OFF => { + info!("No more processes to schedule, shutting down system"); + crate::test::qemu_exit::exit_success(); + } + LINUX_REBOOT_CMD_RESTART => crate::platform::reset::trigger_reset(), + _ => Err(Errno::EINVAL), + } + } } diff --git 
a/system-tests/src/tests/basics.rs b/system-tests/src/tests/basics.rs index 523b7f9a..1b8a2c3d 100644 --- a/system-tests/src/tests/basics.rs +++ b/system-tests/src/tests/basics.rs @@ -23,7 +23,7 @@ async fn shutdown() -> anyhow::Result<()> { let mut solaya = QemuInstance::start().await?; solaya - .run_prog_waiting_for("exit", "shutting down system") + .run_prog_waiting_for("halt -n", "shutting down system") .await?; assert!(solaya.wait_for_qemu_to_exit().await?.success()); From 31f6f0c1dbc568ac3e3b709adbd269e9b0eb04cc Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 11:53:38 +0000 Subject: [PATCH 08/16] syscalls: stub sync(2) so busybox halt runs cleanly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Solaya has no writeback caches — filesystem operations either hit tmpfs or the virtio-blk synchronous path, both of which persist immediately. A no-op sync(2) is semantically correct for this kernel and unblocks busybox's halt applet, which otherwise traps on an unimplemented syscall before reaching reboot(HALT). The shutdown system test can now invoke plain `halt` instead of `halt -n`. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/syscalls/linux.rs | 6 ++++++ system-tests/src/tests/basics.rs | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/kernel/src/syscalls/linux.rs b/crates/kernel/src/syscalls/linux.rs index 9955dd2d..591efe91 100644 --- a/crates/kernel/src/syscalls/linux.rs +++ b/crates/kernel/src/syscalls/linux.rs @@ -133,6 +133,7 @@ linux_syscalls! 
{ SYSCALL_NR_STATFS => statfs(pathname: *const u8, buf: *mut u8); SYSCALL_NR_STATX => statx(dirfd: c_int, pathname: *const u8, flags: c_int, mask: c_uint, statxbuf: *mut u8); SYSCALL_NR_SYMLINKAT => symlinkat(target: *const u8, newdirfd: c_int, linkpath: *const u8); + SYSCALL_NR_SYNC => sync(); SYSCALL_NR_SYSINFO => sysinfo(info: *mut u8); SYSCALL_NR_TGKILL => tgkill(tgid: c_int, tid: c_int, sig: c_int); SYSCALL_NR_TKILL => tkill(tid: c_int, sig: c_int); @@ -767,6 +768,11 @@ impl LinuxSyscalls for LinuxSyscallHandler { self.do_sysinfo(info) } + // No-op: Solaya has no writeback caches, so there's nothing to flush. + async fn sync(&mut self) -> Result { + Ok(0) + } + async fn getrlimit( &mut self, resource: c_uint, diff --git a/system-tests/src/tests/basics.rs b/system-tests/src/tests/basics.rs index 1b8a2c3d..6d3e5c05 100644 --- a/system-tests/src/tests/basics.rs +++ b/system-tests/src/tests/basics.rs @@ -23,7 +23,7 @@ async fn shutdown() -> anyhow::Result<()> { let mut solaya = QemuInstance::start().await?; solaya - .run_prog_waiting_for("halt -n", "shutting down system") + .run_prog_waiting_for("halt", "shutting down system") .await?; assert!(solaya.wait_for_qemu_to_exit().await?.success()); From 0ce812bf3cfbf6b0d9f253abec1fb2270fb21168 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 11:56:28 +0000 Subject: [PATCH 09/16] userspace: delete Rust init now that busybox is PID 1 Removes userspace/src/bin/init.rs and drops /bin/init from the kernel's INIT_PATHS search list. Busybox (/sbin/init) is the only supported PID 1; /init stays as the conventional initramfs fallback. Also cleans up stale defconfig commentary that referenced the fallback path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- configs/solaya_riscv64_buildroot_defconfig.in | 11 +++++------ crates/kernel/src/processes/process_table.rs | 10 +++++----- userspace/src/bin/init.rs | 12 ------------ 3 files changed, 10 insertions(+), 23 deletions(-) delete mode 100644 userspace/src/bin/init.rs diff --git a/configs/solaya_riscv64_buildroot_defconfig.in b/configs/solaya_riscv64_buildroot_defconfig.in index 1b603a73..9579f12f 100644 --- a/configs/solaya_riscv64_buildroot_defconfig.in +++ b/configs/solaya_riscv64_buildroot_defconfig.in @@ -36,16 +36,15 @@ BR2_TOOLCHAIN_EXTERNAL_BOOTLIN_RISCV64_LP64D_MUSL_STABLE=y BR2_STATIC_LIBS=y # --- Init system: busybox init as PID 1 ---------------------------------- -# BR2_INIT_BUSYBOX installs /sbin/init as a symlink to /bin/busybox, and -# busybox reads /etc/inittab (provided via the overlay at -# configs/overlay/etc/inittab). The kernel's load_init_bytes tries -# /sbin/init first, so busybox wins; /bin/init (our Rust fallback) stays -# in the overlay for one-line rollback during the initial bring-up. +# BR2_INIT_BUSYBOX installs /sbin/init as a symlink to /bin/busybox. +# Busybox reads /etc/inittab from the overlay at +# configs/overlay/etc/inittab. The kernel's load_init_bytes finds +# /sbin/init first. BR2_INIT_BUSYBOX=y BR2_PACKAGE_BUSYBOX=y # Rely on the default buildroot busybox config (includes init + sh + the # usual applets). Ship a BR2_PACKAGE_BUSYBOX_CONFIG_FRAGMENT_FILES only -# if the bring-up loop proves we need a specific applet toggled. +# if a specific applet needs to be toggled. 
# --- Shell + coreutils --------------------------------------------------- BR2_PACKAGE_DASH=y diff --git a/crates/kernel/src/processes/process_table.rs b/crates/kernel/src/processes/process_table.rs index a73bff9a..29e72458 100644 --- a/crates/kernel/src/processes/process_table.rs +++ b/crates/kernel/src/processes/process_table.rs @@ -55,11 +55,11 @@ pub fn init() { /// Source the PID-1 ELF image from the initramfs-populated rootfs. /// -/// /sbin/init is buildroot's busybox (symlink to /bin/busybox). /bin/init -/// is Solaya's Rust init, kept as a fallback during the busybox bring-up -/// so reverting is a one-line reorder. +/// `/sbin/init` is buildroot's busybox (symlink to `/bin/busybox`). +/// `/init` stays in the search list as the conventional initramfs +/// fallback. fn load_init_bytes() -> Arc<[u8]> { - const INIT_PATHS: &[&str] = &["/sbin/init", "/bin/init", "/init"]; + const INIT_PATHS: &[&str] = &["/sbin/init", "/init"]; for path in INIT_PATHS { let Ok(node) = fs::resolve_path(path) else { continue; @@ -73,7 +73,7 @@ fn load_init_bytes() -> Arc<[u8]> { crate::info!("init: loaded PID 1 from {path} ({n} bytes)"); return Arc::<[u8]>::from(buf); } - panic!("init: no /sbin/init, /bin/init, or /init in the root filesystem"); + panic!("init: no /sbin/init or /init in the root filesystem"); } pub struct ProcessTable { diff --git a/userspace/src/bin/init.rs b/userspace/src/bin/init.rs deleted file mode 100644 index ad3b0e66..00000000 --- a/userspace/src/bin/init.rs +++ /dev/null @@ -1,12 +0,0 @@ -use userspace::spawn::spawn; - -fn main() { - println!("init process started"); - if let Ok(mut child) = spawn("/bin/dhcpd", &[]) { - let _ = child.wait(); - } - println!("starting shell"); - let mut child = spawn("/bin/dash", &[]).expect("Failed to spawn shell"); - child.wait().expect("Failed to wait for shell"); - println!("Initial shell has exited..."); -} From 266b16aa83def5bed9d6e0bd74cbc647034781a8 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 
Apr 2026 11:57:57 +0000 Subject: [PATCH 10/16] doc: reflect busybox as PID 1 in CLAUDE.md and doc/ai Updates the userspace architecture section and the boot-sequence test expectations to describe the busybox init flow (read /sbin/init, inittab from overlay, dhcpd wait, dash respawn on console) now that the Rust init is gone. Removes references to the deleted init.rs binary from the BUILD.md userspace build chain. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 16 ++++++++++------ doc/ai/BUILD.md | 16 +++++++++------- doc/ai/TESTING.md | 6 +++--- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 5821c393..e4c91de6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -187,14 +187,18 @@ Userspace is a buildroot-produced rootfs cpio, loaded at boot via QEMU `-initrd` (path from DTB `/chosen/linux,initrd-{start,end}`) and extracted into a tmpfs-backed `/` in `initramfs::extract()`: -- **dash** + **GNU coreutils** come from the buildroot package set - (`cmake/buildroot.cmake`, `configs/solaya_riscv64_buildroot_defconfig.in`). -- Solaya's Rust binaries (init, dhcpd, tcp_echo, webserver, test +- **busybox** (init + sh + applets) and **dash** + **GNU coreutils** come + from the buildroot package set (`cmake/buildroot.cmake`, + `configs/solaya_riscv64_buildroot_defconfig.in`). +- Solaya's Rust userspace binaries (dhcpd, tcp_echo, webserver, test fixtures like `prog1`/`*-test`) are built by `userspace-rust` and layered on top via `BR2_ROOTFS_OVERLAY` — they end up at `/bin/`. -- PID 1 is Solaya's Rust `init` (read from `/bin/init` by - `process_table::load_init_bytes`), which execs `/bin/dhcpd` once then - spawns `/bin/dash`. +- PID 1 is busybox (read from `/sbin/init` by + `process_table::load_init_bytes`, which resolves the buildroot symlink + to `/bin/busybox`). 
Busybox reads `/etc/inittab` + (`configs/overlay/etc/inittab`): runs `/etc/init.d/rcS`, waits on + `/bin/dhcpd` to configure the network, then respawns + `/bin/dash -i` on the serial console. ### Adding a userspace program diff --git a/doc/ai/BUILD.md b/doc/ai/BUILD.md index 05512b82..98deb784 100644 --- a/doc/ai/BUILD.md +++ b/doc/ai/BUILD.md @@ -64,20 +64,22 @@ just build Userspace is a buildroot-produced **cpio initramfs**, not a compile-time embedding: -1. `userspace-rust` builds Solaya's Rust binaries (init, dhcpd, tests) - into `build/userspace/artifacts/`. +1. `userspace-rust` builds Solaya's Rust binaries (dhcpd, tests) into + `build/userspace/artifacts/`. 2. `buildroot-overlay` copies those into `.buildroot/overlay/bin/`. -3. `buildroot-all` runs buildroot, which cross-builds dash + GNU - coreutils using the Bootlin prebuilt musl GCC toolchain +3. `buildroot-all` runs buildroot, which cross-builds busybox, dash, + and GNU coreutils using the Bootlin prebuilt musl GCC toolchain (`BR2_TOOLCHAIN_EXTERNAL_BOOTLIN`), then layers our overlay on top and emits `.buildroot/output/images/rootfs.cpio`. 4. `qemu_wrapper.sh` passes the cpio via `-initrd`; kernel reads `/chosen/linux,initrd-{start,end}` from the DTB, reserves the range in the page allocator, and `initramfs::extract()` unpacks it into the tmpfs-backed root. -5. `process_table::init` reads `/bin/init` from the VFS and runs it as - PID 1. That's Solaya's Rust `init`, which execs `/bin/dhcpd` and - `/bin/dash` by absolute path (both live in the cpio). +5. `process_table::init` reads `/sbin/init` from the VFS (buildroot + symlinks it to `/bin/busybox`) and runs busybox as PID 1. Busybox + reads `/etc/inittab` (shipped via overlay), runs `/etc/init.d/rcS`, + waits on `/bin/dhcpd` to configure the network, then respawns + `/bin/dash -i` on the console. Kernel unit tests no longer embed userspace fixtures; all userspace coverage lives in `system-tests/`, which boot the full image in QEMU. 
diff --git a/doc/ai/TESTING.md b/doc/ai/TESTING.md index f8fe419e..ee25200e 100644 --- a/doc/ai/TESTING.md +++ b/doc/ai/TESTING.md @@ -102,9 +102,9 @@ Reads from stdout until finding the needle string. QemuInstance::start() automatically waits for: 1. "Hello World from Solaya!" 2. "kernel_init done!" -3. "init process started" -4. "starting shell" -5. Shell prompt ("$ ") +3. "dhcpd: configured ip" (only when the test enables networking) +4. Shell prompt ("$ ") — emitted by dash once busybox init's + `console::respawn` entry has spawned the interactive shell. ### Example Tests From 426a135baa9d8ca4b0c12f31bbafb06e577b59be Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 17:25:34 +0000 Subject: [PATCH 11/16] docs: trim bloated comments and fix stale /bin/init ref Drop comments that narrate the recent busybox switch or restate what the code already says; keep only the non-obvious invariants. Also update qemu_wrapper.sh to reference /sbin/init (the current PID-1 path) in its "no rootfs" error note. Co-Authored-By: Claude Opus 4.7 (1M context) --- configs/solaya_riscv64_buildroot_defconfig.in | 9 ++------- crates/kernel/src/syscalls/exec_ops.rs | 6 ------ crates/kernel/src/syscalls/fs_ops.rs | 8 +++----- qemu-infra/src/qemu.rs | 9 +++------ qemu_wrapper.sh | 2 +- 5 files changed, 9 insertions(+), 25 deletions(-) diff --git a/configs/solaya_riscv64_buildroot_defconfig.in b/configs/solaya_riscv64_buildroot_defconfig.in index 9579f12f..b0c60248 100644 --- a/configs/solaya_riscv64_buildroot_defconfig.in +++ b/configs/solaya_riscv64_buildroot_defconfig.in @@ -36,15 +36,10 @@ BR2_TOOLCHAIN_EXTERNAL_BOOTLIN_RISCV64_LP64D_MUSL_STABLE=y BR2_STATIC_LIBS=y # --- Init system: busybox init as PID 1 ---------------------------------- -# BR2_INIT_BUSYBOX installs /sbin/init as a symlink to /bin/busybox. -# Busybox reads /etc/inittab from the overlay at -# configs/overlay/etc/inittab. The kernel's load_init_bytes finds -# /sbin/init first. 
+# Installs /sbin/init as a symlink to /bin/busybox; inittab comes from +# configs/overlay/etc/inittab. BR2_INIT_BUSYBOX=y BR2_PACKAGE_BUSYBOX=y -# Rely on the default buildroot busybox config (includes init + sh + the -# usual applets). Ship a BR2_PACKAGE_BUSYBOX_CONFIG_FRAGMENT_FILES only -# if a specific applet needs to be toggled. # --- Shell + coreutils --------------------------------------------------- BR2_PACKAGE_DASH=y diff --git a/crates/kernel/src/syscalls/exec_ops.rs b/crates/kernel/src/syscalls/exec_ops.rs index 848c9a0b..1a47a53d 100644 --- a/crates/kernel/src/syscalls/exec_ops.rs +++ b/crates/kernel/src/syscalls/exec_ops.rs @@ -77,15 +77,9 @@ impl LinuxSyscallHandler { let old_cwd_str = self.get_process().with_lock(|p| String::from(p.cwd())); - // Resolve the filename (plus any shebang layers) against the VFS. - // Errors (ENOENT, EACCES, ELOOP, EIO, E2BIG, ENOEXEC) propagate to - // userspace as-is. let (vfs_bytes, final_argv) = resolve_shebang(filename_str, &args, &old_cwd_str)?; let elf_arc: Arc<[u8]> = Arc::<[u8]>::from(vfs_bytes.as_slice()); - // After shebang resolution, argv[0] is the interpreter path (or the - // original filename if no shebang); the binary's basename becomes - // the process name. let resolved_path = &final_argv[0]; let name = resolved_path .rsplit('/') diff --git a/crates/kernel/src/syscalls/fs_ops.rs b/crates/kernel/src/syscalls/fs_ops.rs index 4de2b06f..90c3405b 100644 --- a/crates/kernel/src/syscalls/fs_ops.rs +++ b/crates/kernel/src/syscalls/fs_ops.rs @@ -67,11 +67,9 @@ impl LinuxSyscallHandler { let descriptor = if let Some(dev) = node.char_device() && crate::io::uart::is_console_char_device(&dev) { - // Implicit-ctty: give the opener's pgid the TTY's foreground - // group. busybox init's child calls setsid() before opening the - // console; without this, dash's job-control startup sees - // fg_pgid != getpgrp() and self-stops via SIGTTIN. Proper - // TIOCSCTTY on-open is #250 — this keeps things unblocked. 
+ // Implicit-ctty stop-gap: grant the opener's pgid the console's + // fg_pgid so dash's job-control startup doesn't self-stop via + // SIGTTIN. Proper TIOCSCTTY-on-open is tracked in issue #262. let caller_pgid = self.current_process.with_lock(|p| p.pgid()); crate::io::tty_device::console_tty() .lock() diff --git a/qemu-infra/src/qemu.rs b/qemu-infra/src/qemu.rs index 76c8c03a..0ce08b61 100644 --- a/qemu-infra/src/qemu.rs +++ b/qemu-infra/src/qemu.rs @@ -178,12 +178,9 @@ impl QemuInstance { stdout.assert_read_until("Hello World from Solaya!").await?; stdout.assert_read_until("kernel_init done!").await?; - // After kernel_init, async kernel tasks (like ext2 mount) run concurrently - // with busybox init. Accumulate boot output to check if the ext2 init - // message was already seen before the prompt. Busybox init is now PID 1 - // (see crates/kernel/src/processes/process_table.rs load_init_bytes); - // it does not emit Rust-init's old banner lines, so we sync on dhcpd - // (when networked) and the shell prompt from dash's `console::respawn`. + // Sync on dhcpd (when networked) + shell prompt. Accumulate boot + // output so we can tell whether "ext2: init complete" fired before + // the prompt arrived. let mut boot_tail = Vec::new(); if network_port.is_some() { boot_tail.extend(stdout.assert_read_until("dhcpd: configured ip").await?); diff --git a/qemu_wrapper.sh b/qemu_wrapper.sh index e0bfb06c..5733e4ea 100755 --- a/qemu_wrapper.sh +++ b/qemu_wrapper.sh @@ -132,7 +132,7 @@ fi # Fall back to SOLAYA_INITRD env var if --initrd wasn't passed — lets the # CMake run/test targets inject the buildroot cpio without every caller # having to know the flag. Booting without a rootfs is not a supported -# mode (kernel panics on missing /bin/init), so require one of --initrd +# mode (kernel panics on missing /sbin/init), so require one of --initrd # or SOLAYA_INITRD and fail hard if the referenced file is missing — # silent fallback used to mask an unbuilt buildroot tree. 
if [[ -z "$INITRD_PATH" ]]; then From 260bb0a0053609f7a126e2c307987e53e9f5936f Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 17:26:21 +0000 Subject: [PATCH 12/16] signal: merge PendingSignals first_unblocked/first_in into one method first_unblocked and first_in did the same bitmask-then-ctz work with different input mask interpretations. Collapse into first_matching(), have callers pass !sigmask for delivery and the set directly for sigtimedwait. Also drop duplicated sigmask-access boilerplate in has_pending_unblocked_signal and take_next_pending_signal by routing them through peek_first_unblocked_signal. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/processes/signal.rs | 16 +++++----------- crates/kernel/src/processes/thread.rs | 14 ++++---------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/crates/kernel/src/processes/signal.rs b/crates/kernel/src/processes/signal.rs index abe813d5..d066e39d 100644 --- a/crates/kernel/src/processes/signal.rs +++ b/crates/kernel/src/processes/signal.rs @@ -49,17 +49,11 @@ impl PendingSignals { self.0 &= !(1u64 << sig); } - pub fn first_unblocked(&self, mask: u64) -> Option { - let deliverable = self.0 & !mask; - if deliverable == 0 { - return None; - } - Some(deliverable.trailing_zeros()) - } - - /// Lowest-numbered pending signal that is in `set`. - pub fn first_in(&self, set: u64) -> Option { - let matched = self.0 & set; + /// Lowest-numbered pending signal that intersects `allowed`. + /// Callers pass `!sigmask` for delivery or the sigtimedwait set + /// directly to wait for blocked signals. 
+ pub fn first_matching(&self, allowed: u64) -> Option { + let matched = self.0 & allowed; if matched == 0 { return None; } diff --git a/crates/kernel/src/processes/thread.rs b/crates/kernel/src/processes/thread.rs index a952930b..df0d847b 100644 --- a/crates/kernel/src/processes/thread.rs +++ b/crates/kernel/src/processes/thread.rs @@ -505,30 +505,24 @@ impl Thread { } pub fn has_pending_unblocked_signal(&self) -> bool { - self.signal_state - .pending - .first_unblocked(self.signal_state.sigmask.sig[0]) - .is_some() + self.peek_first_unblocked_signal().is_some() } pub fn peek_first_unblocked_signal(&self) -> Option { self.signal_state .pending - .first_unblocked(self.signal_state.sigmask.sig[0]) + .first_matching(!self.signal_state.sigmask.sig[0]) } pub fn take_next_pending_signal(&mut self) -> Option { - let sig = self - .signal_state - .pending - .first_unblocked(self.signal_state.sigmask.sig[0])?; + let sig = self.peek_first_unblocked_signal()?; self.signal_state.pending.clear(sig); Some(sig) } /// Lowest-numbered pending signal that is in `set`, regardless of sigmask. pub fn first_pending_in_set(&self, set: u64) -> Option { - self.signal_state.pending.first_in(set) + self.signal_state.pending.first_matching(set) } pub fn clear_pending(&mut self, sig: u32) { From 97f9590a9ceb2e51a933dc880cd25564853ebd6c Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 17:28:18 +0000 Subject: [PATCH 13/16] exec: peek shebang header before full-file read, route load_elf errno MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A multi-MiB ELF binary was being pulled into memory on every execve just to check whether its first two bytes were '#!'. 
Split the old try_read_from_vfs into resolve_against_cwd + read_full_node so resolve_shebang can peek up to SHEBANG_MAX_LINE bytes first and only commit to the full read once we know we're not following a shebang chain (and reuse the peek buffer verbatim for files <= the peek size). While here, map loader::load_elf errors to ENOEXEC so a corrupted-but- parseable ELF reports back to userspace instead of panicking the kernel — matching the ElfFile::parse path that already does this. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/syscalls/exec_ops.rs | 41 ++++++++++++++++---------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/crates/kernel/src/syscalls/exec_ops.rs b/crates/kernel/src/syscalls/exec_ops.rs index 1a47a53d..a66fbfcf 100644 --- a/crates/kernel/src/syscalls/exec_ops.rs +++ b/crates/kernel/src/syscalls/exec_ops.rs @@ -89,7 +89,7 @@ impl LinuxSyscallHandler { let elf = ElfFile::parse(&elf_arc).map_err(|_| Errno::ENOEXEC)?; let loaded = - loader::load_elf(&elf, name, &args_refs, &env_strs).expect("ELF loading must succeed"); + loader::load_elf(&elf, name, &args_refs, &env_strs).map_err(|_| Errno::ENOEXEC)?; let process_name = Arc::new(String::from(name)); let old_process = self.get_process(); @@ -213,14 +213,25 @@ fn resolve_shebang( // first). On exit, innermost interpreter path is `current_path`. let mut layers: Vec<(Option, String)> = Vec::new(); let bytes = loop { - let bytes = try_read_from_vfs(&current_path, cwd)?; - if bytes.len() < 2 || &bytes[..2] != b"#!" { - break bytes; + let node = resolve_against_cwd(&current_path, cwd)?; + // Peek the shebang header first so a multi-MiB ELF isn't read + // into memory just to check its first two bytes. + let size = node.size(); + let peek_len = size.min(SHEBANG_MAX_LINE); + let mut peek: Vec = alloc::vec![0u8; peek_len]; + let n = node.read(0, &mut peek)?; + peek.truncate(n); + if peek.len() < 2 || &peek[..2] != b"#!"
{ + break if peek.len() == size { + peek + } else { + read_full_node(&node)? + }; } if layers.len() >= MAX_SHEBANG_DEPTH { return Err(Errno::ELOOP); } - let (interpreter, optional_arg) = parse_shebang(&bytes)?; + let (interpreter, optional_arg) = parse_shebang(&peek)?; layers.push((optional_arg, current_path)); current_path = interpreter; }; @@ -244,13 +255,9 @@ fn resolve_shebang( Ok((bytes, argv)) } -/// Read the whole file at `filename` into memory, preserving the VFS -/// errno so userspace can distinguish ENOENT / EACCES / ELOOP / EIO. -/// -/// Path resolution follows execve(2): absolute paths resolve against the -/// VFS root, relative paths against `cwd`. No PATH search — shells are -/// expected to do that themselves (dash/busybox ash both do). -fn try_read_from_vfs(filename: &str, cwd: &str) -> Result, Errno> { +/// Resolve `filename` against the VFS root (absolute) or `cwd` (relative). +/// execve(2) does no PATH search — shells handle that. +fn resolve_against_cwd(filename: &str, cwd: &str) -> Result { let absolute: String = if filename.starts_with('/') { filename.to_string() } else if cwd.ends_with('/') { @@ -258,11 +265,13 @@ fn try_read_from_vfs(filename: &str, cwd: &str) -> Result, Errno> { } else { alloc::format!("{cwd}/{filename}") }; - let node = fs::resolve_path(&absolute)?; + fs::resolve_path(&absolute) +} + +/// Read an entire VFS node into memory. 64 MiB cap keeps a rogue or +/// corrupt entry from exhausting the heap (~10× our largest binary). +fn read_full_node(node: &fs::vfs::VfsNodeRef) -> Result, Errno> { let size = node.size(); - // Refuse outlandish sizes to avoid a rogue or corrupt VFS entry - // allocating the whole heap; 64 MiB is ~10× the largest userspace - // binary we produce. 
if size > 64 * 1024 * 1024 { return Err(Errno::E2BIG); } From 8e5145f37485de9b23101337077a54175a445830 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 17:29:40 +0000 Subject: [PATCH 14/16] driver-api: expose CharDevice::is_tty instead of console Arc identity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit openat's "is this /dev/console?" check was comparing the node's CharDevice Arc against a second global Arc stored just for this purpose. Push the decision into the trait: default is_tty() == false, override on ConsoleCharDevice. Drops the CONSOLE_CHAR_DEVICE static, the is_console_char_device helper, and the RuntimeInitializedData import — fs_ops::openat just asks the device. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/driver-api/src/lib.rs | 6 +++++ crates/kernel/src/io/uart.rs | 35 ++++++---------------------- crates/kernel/src/syscalls/fs_ops.rs | 2 +- 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/crates/driver-api/src/lib.rs b/crates/driver-api/src/lib.rs index ed324246..cb6cbf7a 100644 --- a/crates/driver-api/src/lib.rs +++ b/crates/driver-api/src/lib.rs @@ -162,6 +162,12 @@ pub trait CharDevice: Send + Sync { /// Write `data`. Returns the number of bytes written (typically /// `data.len()` for synchronous console-style devices). fn write(&self, data: &[u8]) -> Result; + + /// True if this device is a terminal. `openat` uses this to wrap the + /// fd in `FileDescriptor::Tty` instead of the default VfsFile. + fn is_tty(&self) -> bool { + false + } } /// Static framebuffer description, returned by `DisplayDevice::framebuffer`. 
diff --git a/crates/kernel/src/io/uart.rs b/crates/kernel/src/io/uart.rs index a7ba1c69..e0df6fad 100644 --- a/crates/kernel/src/io/uart.rs +++ b/crates/kernel/src/io/uart.rs @@ -10,34 +10,12 @@ use core::sync::atomic::{AtomicU8, Ordering}; use alloc::sync::Arc; use driver_api::{CharDevice, IoError, IrqHandler}; use headers::errno::Errno; -use klib::runtime_initialized::RuntimeInitializedData; pub use console::uart::CONSOLE_UART; -/// The console `CharDevice` Arc registered in devfs as `/dev/console`. -/// Set by [`register_console_char_device`]; used by [`is_console_char_device`] -/// to recognise reopened console FDs in `openat` so they become -/// `FileDescriptor::Tty` (which blocks correctly) instead of the default -/// `FileDescriptor::VfsFile` path (which drains the TTY buffer synchronously -/// and returns EAGAIN, so blocking reads never unblock). -static CONSOLE_CHAR_DEVICE: RuntimeInitializedData> = - RuntimeInitializedData::new(); - -/// Returns true if `dev` is the same `Arc` as the one registered for -/// `/dev/console`. Arc-identity comparison — safe against future aliasing -/// (e.g. if we ever expose the same CharDevice under multiple devfs names). -pub fn is_console_char_device(dev: &Arc) -> bool { - CONSOLE_CHAR_DEVICE.is_initialized() && Arc::ptr_eq(dev, &CONSOLE_CHAR_DEVICE) -} - -/// `CharDevice` adapter for the console UART. -/// -/// Carries the TTY line discipline internally: `write` goes through the TTY -/// `process_output` path (handles ONLCR, echo, etc.) before hitting the -/// UART; `read` drains cooked bytes from the TTY input buffer. -/// -/// The TTY itself still lives in `io/tty_device` and is wired up at init. -/// Fully decoupling TTY from UART stays deferred (#250 item #5). +/// `CharDevice` adapter for the console UART. Write goes through the TTY +/// line discipline (ONLCR, echo, ...) before the UART; read drains cooked +/// input bytes. 
pub struct ConsoleCharDevice; impl CharDevice for ConsoleCharDevice { @@ -64,13 +42,14 @@ impl CharDevice for ConsoleCharDevice { } Ok(data.len()) } + + fn is_tty(&self) -> bool { + true + } } -/// Register the console UART as a `CharDevice` in both the registry and -/// devfs. Called once during kernel init. pub fn register_console_char_device() { let device: Arc = Arc::new(ConsoleCharDevice); - CONSOLE_CHAR_DEVICE.initialize(device.clone()); crate::drivers::registry::().register(device.clone()); crate::fs::devfs::register_char_device("console", device); } diff --git a/crates/kernel/src/syscalls/fs_ops.rs b/crates/kernel/src/syscalls/fs_ops.rs index 90c3405b..5d5b6e76 100644 --- a/crates/kernel/src/syscalls/fs_ops.rs +++ b/crates/kernel/src/syscalls/fs_ops.rs @@ -65,7 +65,7 @@ impl LinuxSyscallHandler { } let descriptor = if let Some(dev) = node.char_device() - && crate::io::uart::is_console_char_device(&dev) + && dev.is_tty() { // Implicit-ctty stop-gap: grant the opener's pgid the console's // fg_pgid so dash's job-control startup doesn't self-stop via From c5ecf28305d52fe479fb77711094a9289146eb99 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 17:33:56 +0000 Subject: [PATCH 15/16] headers: bindgen LINUX_REBOOT_* constants from linux/reboot.h Per CLAUDE.md's "reuse Linux/musl header definitions" rule, pull the reboot magic + command constants from linux/reboot.h via the bindgen driver instead of redefining them inline. MAGIC2_SET now reuses the four bindgen'd LINUX_REBOOT_MAGIC2* values, and the magic compare widens to u32 (via i32::cast_unsigned) to match the generated type while preserving the raw 32-bit bit patterns the syscall ABI expects. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/syscalls/process_ops.rs | 25 ++++++++++++++--------- tools/bindgen-driver/src/main.rs | 1 + 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/crates/kernel/src/syscalls/process_ops.rs b/crates/kernel/src/syscalls/process_ops.rs index 023dfbcd..cff970e1 100644 --- a/crates/kernel/src/syscalls/process_ops.rs +++ b/crates/kernel/src/syscalls/process_ops.rs @@ -3,7 +3,12 @@ use core::ffi::{c_int, c_uint, c_ulong}; use hal::spinlock::Spinlock; use headers::{ errno::Errno, - syscall_types::{CLONE_CHILD_CLEARTID, CLONE_PARENT_SETTID, CLONE_SETTLS}, + syscall_types::{ + CLONE_CHILD_CLEARTID, CLONE_PARENT_SETTID, CLONE_SETTLS, LINUX_REBOOT_CMD_CAD_OFF, + LINUX_REBOOT_CMD_CAD_ON, LINUX_REBOOT_CMD_HALT, LINUX_REBOOT_CMD_POWER_OFF, + LINUX_REBOOT_CMD_RESTART, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_MAGIC2A, + LINUX_REBOOT_MAGIC2B, LINUX_REBOOT_MAGIC2C, + }, }; use crate::{ @@ -22,14 +27,12 @@ use abi::{pid::Tid, syscalls::trap_frame::Register}; use super::linux::LinuxSyscallHandler; -const LINUX_REBOOT_MAGIC1: c_int = 0xfee1deadu32 as c_int; -const LINUX_REBOOT_MAGIC2_SET: &[c_int] = &[0x28121969, 0x05121996, 0x16041998, 0x20112000]; - -const LINUX_REBOOT_CMD_CAD_OFF: c_uint = 0x00000000; -const LINUX_REBOOT_CMD_CAD_ON: c_uint = 0x89abcdef; -const LINUX_REBOOT_CMD_HALT: c_uint = 0xcdef0123; -const LINUX_REBOOT_CMD_POWER_OFF: c_uint = 0x4321fedc; -const LINUX_REBOOT_CMD_RESTART: c_uint = 0x01234567; +const LINUX_REBOOT_MAGIC2_SET: &[u32] = &[ + LINUX_REBOOT_MAGIC2, + LINUX_REBOOT_MAGIC2A, + LINUX_REBOOT_MAGIC2B, + LINUX_REBOOT_MAGIC2C, +]; impl LinuxSyscallHandler { pub(super) async fn clone_fork(&mut self, stack: usize) -> Result { @@ -222,7 +225,9 @@ impl LinuxSyscallHandler { magic2: c_int, cmd: c_uint, ) -> Result { - if magic1 != LINUX_REBOOT_MAGIC1 || !LINUX_REBOOT_MAGIC2_SET.contains(&magic2) { + if magic1.cast_unsigned() != LINUX_REBOOT_MAGIC1 + || 
!LINUX_REBOOT_MAGIC2_SET.contains(&magic2.cast_unsigned()) + { return Err(Errno::EINVAL); } match cmd { diff --git a/tools/bindgen-driver/src/main.rs b/tools/bindgen-driver/src/main.rs index d0a4a66d..4bbecfe1 100644 --- a/tools/bindgen-driver/src/main.rs +++ b/tools/bindgen-driver/src/main.rs @@ -183,6 +183,7 @@ fn generate_syscall_types(args: &Args) -> Result<(), Box> .header(linux_h(args, "linux/uio.h")) .header(linux_h(args, "linux/wait.h")) .header(linux_h(args, "linux/prctl.h")) + .header(linux_h(args, "linux/reboot.h")) .generate()?; bindings.write_to_file(args.out_dir.join("syscall_types.rs"))?; Ok(()) From fa97cca8cb8b6ef4714a6f7bb21db787dda6e593 Mon Sep 17 00:00:00 2001 From: Maurice Hieronymus Date: Sun, 19 Apr 2026 17:39:41 +0000 Subject: [PATCH 16/16] signal: replace rt_sigtimedwait waker slot with a sigtimedwait_mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SigTimedWait::poll stored the task's cx.waker() in a dedicated signal_waker field on Thread, and send_signal had to take() + wake() that waker out of band (with the usual "release the lock before waking to avoid the ThreadWaker re-lock deadlock" dance). That slot was never cleared on Poll::Ready, and only held a single Waker so a concurrent sigtimedwait would silently overwrite an earlier one. Replace it with an Option<u64> mask that the Future arms on Pending and disarms in its Drop impl (covering both Ready-return and task cancellation). send_signal just adds `sigtimedwait_matches(sig)` to its existing Waiting->Runnable gate, and set_syscall_task_and_suspend adds `sigtimedwait_pending()` to its race-avoidance check — the existing ThreadWaker / RUN_QUEUE machinery re-polls the future without any custom wake plumbing. Shutdown test (busybox halt -> reboot/sync -> HALT) still passes, as do the remaining 70 system tests.
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kernel/src/processes/process_table.rs | 22 +++++------- crates/kernel/src/processes/thread.rs | 38 +++++++++++++------- crates/kernel/src/syscalls/signal_ops.rs | 18 ++++++---- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/crates/kernel/src/processes/process_table.rs b/crates/kernel/src/processes/process_table.rs index 29e72458..086ae06c 100644 --- a/crates/kernel/src/processes/process_table.rs +++ b/crates/kernel/src/processes/process_table.rs @@ -240,30 +240,26 @@ impl ProcessTable { pub fn send_signal(&mut self, tid: Tid, sig: u32) { if let Some(thread) = self.threads.get(&tid).cloned() { - // Collect the signal waker while we hold the thread lock, but - // call wake() only after releasing it — ThreadWaker::wake re-locks - // the same thread, which would deadlock on the same CPU. - let (should_enqueue, signal_waker) = thread.with_lock(|mut t| { + let should_enqueue = thread.with_lock(|mut t| { if matches!(t.get_state(), ThreadState::Zombie(_)) { - return (false, None); + return false; } t.raise_signal(sig); - let waker = t.take_signal_waker(); + let sigtimedwait_hit = t.sigtimedwait_matches(sig); // SIGCONT resumes stopped threads if sig == headers::syscall_types::SIGCONT && t.get_state() == ThreadState::Stopped { t.clear_pending_stop_signals(); t.set_state(ThreadState::Runnable); - return (true, waker); + return true; } - if t.has_pending_unblocked_signal() && t.get_state() == ThreadState::Waiting { + if (sigtimedwait_hit || t.has_pending_unblocked_signal()) + && t.get_state() == ThreadState::Waiting + { t.set_state(ThreadState::Runnable); - return (true, waker); + return true; } - (false, waker) + false }); - if let Some(w) = signal_waker { - w.wake(); - } if should_enqueue { RUN_QUEUE.lock().push_back(thread); } diff --git a/crates/kernel/src/processes/thread.rs b/crates/kernel/src/processes/thread.rs index df0d847b..17ab3be5 100644 --- a/crates/kernel/src/processes/thread.rs +++ 
b/crates/kernel/src/processes/thread.rs @@ -27,7 +27,6 @@ use core::{ fmt::Debug, ptr::null_mut, sync::atomic::{AtomicU64, Ordering}, - task::Waker, }; use headers::{ errno::Errno, @@ -109,7 +108,10 @@ pub struct Thread { pub stopped_notified: bool, pub stop_signal: u32, thread_name: Option, - signal_waker: Option, + /// Active `rt_sigtimedwait` set, if the thread is currently suspended + /// in that syscall. `send_signal` consults this to wake the thread + /// even for signals that are blocked in `sigmask`. + sigtimedwait_mask: Option, } impl core::fmt::Display for Thread { @@ -260,7 +262,7 @@ impl Thread { stopped_notified: false, stop_signal: 0, thread_name: None, - signal_waker: None, + sigtimedwait_mask: None, })) } @@ -302,11 +304,12 @@ impl Thread { self.state = ThreadState::Runnable; return true; } - if self.has_pending_unblocked_signal() { + if self.has_pending_unblocked_signal() || self.sigtimedwait_pending() { // A signal arrived while the thread was Running (before the // syscall yielded). Same reasoning as the wakeup_pending branch: // drop to Runnable so the scheduler can re-pick us and deliver - // the signal via the normal path. + // the signal via the normal path. sigtimedwait_pending covers + // blocked signals the thread is explicitly waiting for. self.state = ThreadState::Runnable; return true; } @@ -529,16 +532,25 @@ impl Thread { self.signal_state.pending.clear(sig); } - pub fn register_signal_waker(&mut self, waker: Waker) { - self.signal_waker = Some(waker); + pub fn set_sigtimedwait_mask(&mut self, mask: u64) { + self.sigtimedwait_mask = Some(mask); } - /// Detach the registered signal waker, if any. The caller must invoke - /// `wake()` on the returned `Waker` AFTER releasing the thread lock — - /// `ThreadWaker::wake` re-locks the same thread, so calling it while - /// holding the thread lock deadlocks. 
- pub fn take_signal_waker(&mut self) -> Option { - self.signal_waker.take() + pub fn clear_sigtimedwait_mask(&mut self) { + self.sigtimedwait_mask = None; + } + + /// True when a raised signal should wake a thread parked in + /// `rt_sigtimedwait`, regardless of the thread's sigmask. + pub fn sigtimedwait_matches(&self, sig: u32) -> bool { + self.sigtimedwait_mask + .is_some_and(|m| m & (1u64 << sig) != 0) + } + + /// True if any pending signal matches the active `rt_sigtimedwait` set. + pub fn sigtimedwait_pending(&self) -> bool { + self.sigtimedwait_mask + .is_some_and(|m| self.signal_state.pending.first_matching(m).is_some()) } pub fn get_sigaction_raw(&self, sig: u32) -> &sigaction { diff --git a/crates/kernel/src/syscalls/signal_ops.rs b/crates/kernel/src/syscalls/signal_ops.rs index 53967457..40ee37bb 100644 --- a/crates/kernel/src/syscalls/signal_ops.rs +++ b/crates/kernel/src/syscalls/signal_ops.rs @@ -4,6 +4,7 @@ use core::{ pin::Pin, task::{Context, Poll}, }; + use headers::{ errno::Errno, syscall_types::{ @@ -183,18 +184,23 @@ struct SigTimedWait { impl Future for SigTimedWait { type Output = Result; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll { self.thread.with_lock(|mut t| { if let Some(sig) = t.first_pending_in_set(self.wait_mask) { t.clear_pending(sig); return Poll::Ready(Ok(sig as isize)); } - // If an unblocked signal not in set is pending, the scheduler's - // Interrupt path will deliver EINTR after we return Pending. - // Register our waker so send_signal wakes us for blocked signals - // in `set`. - t.register_signal_waker(cx.waker().clone()); + // Arm the sigtimedwait mask so send_signal wakes us for a + // matching signal even when it's blocked in sigmask; the + // unblocked-signal path stays responsible for EINTR delivery. 
+ t.set_sigtimedwait_mask(self.wait_mask); Poll::Pending }) } } + +impl Drop for SigTimedWait { + fn drop(&mut self) { + self.thread.lock().clear_sigtimedwait_mask(); + } +}