diff --git a/CLAUDE.md b/CLAUDE.md index 5821c393..e4c91de6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -187,14 +187,18 @@ Userspace is a buildroot-produced rootfs cpio, loaded at boot via QEMU `-initrd` (path from DTB `/chosen/linux,initrd-{start,end}`) and extracted into a tmpfs-backed `/` in `initramfs::extract()`: -- **dash** + **GNU coreutils** come from the buildroot package set - (`cmake/buildroot.cmake`, `configs/solaya_riscv64_buildroot_defconfig.in`). -- Solaya's Rust binaries (init, dhcpd, tcp_echo, webserver, test +- **busybox** (init + sh + applets) and **dash** + **GNU coreutils** come + from the buildroot package set (`cmake/buildroot.cmake`, + `configs/solaya_riscv64_buildroot_defconfig.in`). +- Solaya's Rust userspace binaries (dhcpd, tcp_echo, webserver, test fixtures like `prog1`/`*-test`) are built by `userspace-rust` and layered on top via `BR2_ROOTFS_OVERLAY` — they end up at `/bin/`. -- PID 1 is Solaya's Rust `init` (read from `/bin/init` by - `process_table::load_init_bytes`), which execs `/bin/dhcpd` once then - spawns `/bin/dash`. +- PID 1 is busybox (read from `/sbin/init` by + `process_table::load_init_bytes`, which resolves the buildroot symlink + to `/bin/busybox`). Busybox reads `/etc/inittab` + (`configs/overlay/etc/inittab`): runs `/etc/init.d/rcS`, waits on + `/bin/dhcpd` to configure the network, then respawns + `/bin/dash -i` on the serial console. ### Adding a userspace program diff --git a/configs/solaya_riscv64_buildroot_defconfig.in b/configs/solaya_riscv64_buildroot_defconfig.in index 9513d581..b0c60248 100644 --- a/configs/solaya_riscv64_buildroot_defconfig.in +++ b/configs/solaya_riscv64_buildroot_defconfig.in @@ -35,14 +35,11 @@ BR2_TOOLCHAIN_EXTERNAL_BOOTLIN_RISCV64_LP64D_MUSL_STABLE=y # are separate follow-up features. 
BR2_STATIC_LIBS=y -# --- Init system: Solaya's Rust init stays PID 1 ------------------------- -# Buildroot provides no init (we don't use busybox init yet — needs kernel -# AF_UNIX socketpair + shebang execve support). Our Rust init.rs runs -# as PID 1 from /bin/init and execs /bin/dash directly. -BR2_INIT_NONE=y -# BR2_PACKAGE_BUSYBOX is not set — dropping busybox entirely also -# auto-enables BR2_PACKAGE_BUSYBOX_SHOW_OTHERS, which is required by the -# coreutils package selection below. +# --- Init system: busybox init as PID 1 ---------------------------------- +# Installs /sbin/init as a symlink to /bin/busybox; inittab comes from +# configs/overlay/etc/inittab. +BR2_INIT_BUSYBOX=y +BR2_PACKAGE_BUSYBOX=y # --- Shell + coreutils --------------------------------------------------- BR2_PACKAGE_DASH=y diff --git a/crates/driver-api/src/lib.rs b/crates/driver-api/src/lib.rs index ed324246..cb6cbf7a 100644 --- a/crates/driver-api/src/lib.rs +++ b/crates/driver-api/src/lib.rs @@ -162,6 +162,12 @@ pub trait CharDevice: Send + Sync { /// Write `data`. Returns the number of bytes written (typically /// `data.len()` for synchronous console-style devices). fn write(&self, data: &[u8]) -> Result; + + /// True if this device is a terminal. `openat` uses this to wrap the + /// fd in `FileDescriptor::Tty` instead of the default VfsFile. + fn is_tty(&self) -> bool { + false + } } /// Static framebuffer description, returned by `DisplayDevice::framebuffer`. 
diff --git a/crates/kernel/src/fs/devfs.rs b/crates/kernel/src/fs/devfs.rs index a26eaf35..9a285a95 100644 --- a/crates/kernel/src/fs/devfs.rs +++ b/crates/kernel/src/fs/devfs.rs @@ -96,6 +96,10 @@ impl VfsNode for CharNode { fn truncate(&self, _length: usize) -> Result<(), Errno> { Ok(()) } + + fn char_device(&self) -> Option> { + Some(self.device.clone()) + } } struct DisplayNode { diff --git a/crates/kernel/src/fs/vfs.rs b/crates/kernel/src/fs/vfs.rs index 91d998ac..0c063ecc 100644 --- a/crates/kernel/src/fs/vfs.rs +++ b/crates/kernel/src/fs/vfs.rs @@ -5,7 +5,7 @@ use alloc::{ vec::Vec, }; use core::sync::atomic::{AtomicU64, Ordering}; -use driver_api::BlockDevice; +use driver_api::{BlockDevice, CharDevice}; use headers::errno::Errno; use hal::spinlock::Spinlock; @@ -179,6 +179,14 @@ pub trait VfsNode: Send + Sync { None } + /// If this node is backed by a character device, return an + /// `Arc`. Used by `openat` to recognise `/dev/console` + /// and produce a blocking `FileDescriptor::Tty` instead of the default + /// non-blocking `VfsFile`. + fn char_device(&self) -> Option> { + None + } + fn atime(&self) -> (i64, u32) { (0, 0) } diff --git a/crates/kernel/src/io/uart.rs b/crates/kernel/src/io/uart.rs index 6b2c2091..e0df6fad 100644 --- a/crates/kernel/src/io/uart.rs +++ b/crates/kernel/src/io/uart.rs @@ -13,14 +13,9 @@ use headers::errno::Errno; pub use console::uart::CONSOLE_UART; -/// `CharDevice` adapter for the console UART. -/// -/// Carries the TTY line discipline internally: `write` goes through the TTY -/// `process_output` path (handles ONLCR, echo, etc.) before hitting the -/// UART; `read` drains cooked bytes from the TTY input buffer. -/// -/// The TTY itself still lives in `io/tty_device` and is wired up at init. -/// Fully decoupling TTY from UART stays deferred (#250 item #5). +/// `CharDevice` adapter for the console UART. Write goes through the TTY +/// line discipline (ONLCR, echo, ...) before the UART; read drains cooked +/// input bytes. 
pub struct ConsoleCharDevice; impl CharDevice for ConsoleCharDevice { @@ -47,10 +42,12 @@ impl CharDevice for ConsoleCharDevice { } Ok(data.len()) } + + fn is_tty(&self) -> bool { + true + } } -/// Register the console UART as a `CharDevice` in both the registry and -/// devfs. Called once during kernel init. pub fn register_console_char_device() { let device: Arc = Arc::new(ConsoleCharDevice); crate::drivers::registry::().register(device.clone()); diff --git a/crates/kernel/src/processes/process_table.rs b/crates/kernel/src/processes/process_table.rs index c626c471..086ae06c 100644 --- a/crates/kernel/src/processes/process_table.rs +++ b/crates/kernel/src/processes/process_table.rs @@ -55,13 +55,11 @@ pub fn init() { /// Source the PID-1 ELF image from the initramfs-populated rootfs. /// -/// /bin/init is Solaya's Rust init (delivered via the buildroot overlay). -/// /sbin/init would be busybox if we ever flipped back to it — keeping -/// both paths means swapping busybox in is a one-line reorder plus the -/// AF_UNIX socketpair + shebang execve kernel work that's currently on -/// the follow-up list. +/// `/sbin/init` is buildroot's busybox (symlink to `/bin/busybox`). +/// `/init` stays in the search list as the conventional initramfs +/// fallback. 
fn load_init_bytes() -> Arc<[u8]> { - const INIT_PATHS: &[&str] = &["/bin/init", "/sbin/init", "/init"]; + const INIT_PATHS: &[&str] = &["/sbin/init", "/init"]; for path in INIT_PATHS { let Ok(node) = fs::resolve_path(path) else { continue; @@ -75,7 +73,7 @@ fn load_init_bytes() -> Arc<[u8]> { crate::info!("init: loaded PID 1 from {path} ({n} bytes)"); return Arc::<[u8]>::from(buf); } - panic!("init: no /sbin/init, /bin/init, or /init in the root filesystem"); + panic!("init: no /sbin/init or /init in the root filesystem"); } pub struct ProcessTable { @@ -247,13 +245,16 @@ impl ProcessTable { return false; } t.raise_signal(sig); + let sigtimedwait_hit = t.sigtimedwait_matches(sig); // SIGCONT resumes stopped threads if sig == headers::syscall_types::SIGCONT && t.get_state() == ThreadState::Stopped { t.clear_pending_stop_signals(); t.set_state(ThreadState::Runnable); return true; } - if t.has_pending_unblocked_signal() && t.get_state() == ThreadState::Waiting { + if (sigtimedwait_hit || t.has_pending_unblocked_signal()) + && t.get_state() == ThreadState::Waiting + { t.set_state(ThreadState::Runnable); return true; } diff --git a/crates/kernel/src/processes/signal.rs b/crates/kernel/src/processes/signal.rs index 0219301c..d066e39d 100644 --- a/crates/kernel/src/processes/signal.rs +++ b/crates/kernel/src/processes/signal.rs @@ -49,12 +49,15 @@ impl PendingSignals { self.0 &= !(1u64 << sig); } - pub fn first_unblocked(&self, mask: u64) -> Option { - let deliverable = self.0 & !mask; - if deliverable == 0 { + /// Lowest-numbered pending signal that intersects `allowed`. + /// Callers pass `!sigmask` for delivery or the sigtimedwait set + /// directly to wait for blocked signals. 
+ pub fn first_matching(&self, allowed: u64) -> Option { + let matched = self.0 & allowed; + if matched == 0 { return None; } - Some(deliverable.trailing_zeros()) + Some(matched.trailing_zeros()) } } diff --git a/crates/kernel/src/processes/thread.rs b/crates/kernel/src/processes/thread.rs index bb8ffa3c..17ab3be5 100644 --- a/crates/kernel/src/processes/thread.rs +++ b/crates/kernel/src/processes/thread.rs @@ -108,6 +108,10 @@ pub struct Thread { pub stopped_notified: bool, pub stop_signal: u32, thread_name: Option, + /// Active `rt_sigtimedwait` set, if the thread is currently suspended + /// in that syscall. `send_signal` consults this to wake the thread + /// even for signals that are blocked in `sigmask`. + sigtimedwait_mask: Option, } impl core::fmt::Display for Thread { @@ -258,6 +262,7 @@ impl Thread { stopped_notified: false, stop_signal: 0, thread_name: None, + sigtimedwait_mask: None, })) } @@ -299,11 +304,12 @@ impl Thread { self.state = ThreadState::Runnable; return true; } - if self.has_pending_unblocked_signal() { + if self.has_pending_unblocked_signal() || self.sigtimedwait_pending() { // A signal arrived while the thread was Running (before the // syscall yielded). Same reasoning as the wakeup_pending branch: // drop to Runnable so the scheduler can re-pick us and deliver - // the signal via the normal path. + // the signal via the normal path. sigtimedwait_pending covers + // blocked signals the thread is explicitly waiting for. 
self.state = ThreadState::Runnable; return true; } @@ -502,27 +508,51 @@ impl Thread { } pub fn has_pending_unblocked_signal(&self) -> bool { - self.signal_state - .pending - .first_unblocked(self.signal_state.sigmask.sig[0]) - .is_some() + self.peek_first_unblocked_signal().is_some() } pub fn peek_first_unblocked_signal(&self) -> Option { self.signal_state .pending - .first_unblocked(self.signal_state.sigmask.sig[0]) + .first_matching(!self.signal_state.sigmask.sig[0]) } pub fn take_next_pending_signal(&mut self) -> Option { - let sig = self - .signal_state - .pending - .first_unblocked(self.signal_state.sigmask.sig[0])?; + let sig = self.peek_first_unblocked_signal()?; self.signal_state.pending.clear(sig); Some(sig) } + /// Lowest-numbered pending signal that is in `set`, regardless of sigmask. + pub fn first_pending_in_set(&self, set: u64) -> Option { + self.signal_state.pending.first_matching(set) + } + + pub fn clear_pending(&mut self, sig: u32) { + self.signal_state.pending.clear(sig); + } + + pub fn set_sigtimedwait_mask(&mut self, mask: u64) { + self.sigtimedwait_mask = Some(mask); + } + + pub fn clear_sigtimedwait_mask(&mut self) { + self.sigtimedwait_mask = None; + } + + /// True when a raised signal should wake a thread parked in + /// `rt_sigtimedwait`, regardless of the thread's sigmask. + pub fn sigtimedwait_matches(&self, sig: u32) -> bool { + self.sigtimedwait_mask + .is_some_and(|m| m & (1u64 << sig) != 0) + } + + /// True if any pending signal matches the active `rt_sigtimedwait` set. 
+ pub fn sigtimedwait_pending(&self) -> bool { + self.sigtimedwait_mask + .is_some_and(|m| self.signal_state.pending.first_matching(m).is_some()) + } + pub fn get_sigaction_raw(&self, sig: u32) -> &sigaction { &self.signal_state.sigaction[sig as usize] } diff --git a/crates/kernel/src/syscalls/exec_ops.rs b/crates/kernel/src/syscalls/exec_ops.rs index 4375ac0e..a66fbfcf 100644 --- a/crates/kernel/src/syscalls/exec_ops.rs +++ b/crates/kernel/src/syscalls/exec_ops.rs @@ -51,7 +51,6 @@ impl LinuxSyscallHandler { })?; let mut buf = ConsumableBuffer::new(&filename_bytes); let filename_str = buf.consume_str().ok_or(Errno::EFAULT)?; - let name = filename_str.rsplit('/').next().unwrap_or(filename_str); let argv_buffers = self.read_string_array(argv)?; let mut args: Vec<&str> = Vec::new(); @@ -78,15 +77,19 @@ impl LinuxSyscallHandler { let old_cwd_str = self.get_process().with_lock(|p| String::from(p.cwd())); - // Resolve the filename against the VFS: absolute paths walk the - // mount tree, relative paths get rebased against cwd. Errors - // (ENOENT, EACCES, ELOOP, EIO, E2BIG) propagate to userspace as-is. 
- let vfs_bytes = try_read_from_vfs(filename_str, &old_cwd_str)?; + let (vfs_bytes, final_argv) = resolve_shebang(filename_str, &args, &old_cwd_str)?; let elf_arc: Arc<[u8]> = Arc::<[u8]>::from(vfs_bytes.as_slice()); - let elf = ElfFile::parse(&elf_arc).expect("Cannot parse ELF file"); + let resolved_path = &final_argv[0]; + let name = resolved_path + .rsplit('/') + .next() + .unwrap_or(resolved_path.as_str()); + let args_refs: Vec<&str> = final_argv.iter().skip(1).map(String::as_str).collect(); + + let elf = ElfFile::parse(&elf_arc).map_err(|_| Errno::ENOEXEC)?; let loaded = - loader::load_elf(&elf, name, &args, &env_strs).expect("ELF loading must succeed"); + loader::load_elf(&elf, name, &args_refs, &env_strs).map_err(|_| Errno::ENOEXEC)?; let process_name = Arc::new(String::from(name)); let old_process = self.get_process(); @@ -137,13 +140,124 @@ impl LinuxSyscallHandler { } } -/// Read the whole file at `filename` into memory, preserving the VFS -/// errno so userspace can distinguish ENOENT / EACCES / ELOOP / EIO. +/// Linux caps shebang recursion at 4 layers (`BINPRM_MAX_RECURSION`). +const MAX_SHEBANG_DEPTH: usize = 4; + +/// Maximum bytes of the first line we inspect for a `#!` header. Matches +/// Linux's `BINPRM_BUF_SIZE` envelope: `#!` + 255 interpreter bytes + `\n`. +const SHEBANG_MAX_LINE: usize = 257; + +/// Parse a `#!` line. Returns `(interpreter, optional_arg)` where the +/// optional arg is the remainder of the line after the interpreter token, +/// treated as a single argument (matching Linux behavior — no splitting). /// -/// Path resolution follows execve(2): absolute paths resolve against the -/// VFS root, relative paths against `cwd`. No PATH search — shells are -/// expected to do that themselves (dash/busybox ash both do). -fn try_read_from_vfs(filename: &str, cwd: &str) -> Result, Errno> { +/// Returns `Err(Errno::ENOEXEC)` if the shebang header is malformed (no +/// newline within the bound, empty interpreter, ...). 
+fn parse_shebang(bytes: &[u8]) -> Result<(String, Option), Errno> { + let scan_len = bytes.len().min(SHEBANG_MAX_LINE); + let line_end = bytes[..scan_len] + .iter() + .position(|&b| b == b'\n') + .ok_or(Errno::ENOEXEC)?; + // Skip the `#!` prefix then leading spaces/tabs. + let after_bang = &bytes[2..line_end]; + let start = after_bang + .iter() + .position(|&b| b != b' ' && b != b'\t') + .ok_or(Errno::ENOEXEC)?; + let rest = &after_bang[start..]; + let interp_end = rest + .iter() + .position(|&b| b == b' ' || b == b'\t') + .unwrap_or(rest.len()); + if interp_end == 0 { + return Err(Errno::ENOEXEC); + } + let interpreter = core::str::from_utf8(&rest[..interp_end]) + .map_err(|_| Errno::ENOEXEC)? + .to_string(); + let arg_region = &rest[interp_end..]; + let arg_start = arg_region + .iter() + .position(|&b| b != b' ' && b != b'\t') + .unwrap_or(arg_region.len()); + let trimmed = &arg_region[arg_start..]; + let trimmed_end = trimmed + .iter() + .rposition(|&b| b != b' ' && b != b'\t') + .map(|p| p + 1) + .unwrap_or(0); + let optional_arg = if trimmed_end == 0 { + None + } else { + Some( + core::str::from_utf8(&trimmed[..trimmed_end]) + .map_err(|_| Errno::ENOEXEC)? + .to_string(), + ) + }; + Ok((interpreter, optional_arg)) +} + +/// Read the file at `filename`, following up to `MAX_SHEBANG_DEPTH` layers +/// of `#!` indirection. Returns the final binary bytes plus the full argv +/// (argv[0] is the resolved interpreter or original filename, followed by +/// any shebang-contributed args, then the caller's `trailing_args`). +fn resolve_shebang( + filename: &str, + trailing_args: &[&str], + cwd: &str, +) -> Result<(Vec, Vec), Errno> { + let mut current_path = String::from(filename); + // Per-layer (optional_arg, script_path) in discovery order (outermost + // first). On exit, innermost interpreter path is `current_path`. 
+ let mut layers: Vec<(Option, String)> = Vec::new(); + let bytes = loop { + let node = resolve_against_cwd(&current_path, cwd)?; + // Peek the shebang header first so a multi-MiB ELF isn't read + // into memory just to check its first two bytes. + let size = node.size(); + let peek_len = size.min(SHEBANG_MAX_LINE); + let mut peek: Vec = alloc::vec![0u8; peek_len]; + let n = node.read(0, &mut peek)?; + peek.truncate(n); + if peek.len() < 2 || &peek[..2] != b"#!" { + break if peek.len() == size { + peek + } else { + read_full_node(&node)? + }; + } + if layers.len() >= MAX_SHEBANG_DEPTH { + return Err(Errno::ELOOP); + } + let (interpreter, optional_arg) = parse_shebang(&peek)?; + layers.push((optional_arg, current_path)); + current_path = interpreter; + }; + // Assemble argv. Linux semantics: + // argv[0] = innermost interpreter (the actual binary) + // then, unwinding innermost-layer first: + // if that layer had an optional arg: push it + // push that layer's script path + // then trailing_args (argv[1..] from the original execve call) + let mut argv: Vec = Vec::with_capacity(1 + layers.len() * 2 + trailing_args.len()); + argv.push(current_path); + for (opt_arg, script) in layers.into_iter().rev() { + if let Some(a) = opt_arg { + argv.push(a); + } + argv.push(script); + } + for a in trailing_args { + argv.push(String::from(*a)); + } + Ok((bytes, argv)) +} + +/// Resolve `filename` against the VFS root (absolute) or `cwd` (relative). +/// execve(2) does no PATH search — shells handle that. +fn resolve_against_cwd(filename: &str, cwd: &str) -> Result { let absolute: String = if filename.starts_with('/') { filename.to_string() } else if cwd.ends_with('/') { @@ -151,11 +265,13 @@ fn try_read_from_vfs(filename: &str, cwd: &str) -> Result, Errno> { } else { alloc::format!("{cwd}/{filename}") }; - let node = fs::resolve_path(&absolute)?; + fs::resolve_path(&absolute) +} + +/// Read an entire VFS node into memory. 
64 MiB cap keeps a rogue or +/// corrupt entry from exhausting the heap (~10× our largest binary). +fn read_full_node(node: &fs::vfs::VfsNodeRef) -> Result, Errno> { let size = node.size(); - // Refuse outlandish sizes to avoid a rogue or corrupt VFS entry - // allocating the whole heap; 64 MiB is ~10× the largest userspace - // binary we produce. if size > 64 * 1024 * 1024 { return Err(Errno::E2BIG); } diff --git a/crates/kernel/src/syscalls/fs_ops.rs b/crates/kernel/src/syscalls/fs_ops.rs index 4115ddfa..5d5b6e76 100644 --- a/crates/kernel/src/syscalls/fs_ops.rs +++ b/crates/kernel/src/syscalls/fs_ops.rs @@ -64,11 +64,24 @@ impl LinuxSyscallHandler { return Err(Errno::ENOTDIR); } - let fd_abs = compose_abs(&base_abs, &raw_path); - let open_file = fs::open_file::open(node, flags, fd_abs); + let descriptor = if let Some(dev) = node.char_device() + && dev.is_tty() + { + // Implicit-ctty stop-gap: grant the opener's pgid the console's + // fg_pgid so dash's job-control startup doesn't self-stop via + // SIGTTIN. Proper TIOCSCTTY-on-open is tracked in issue #262. + let caller_pgid = self.current_process.with_lock(|p| p.pgid()); + crate::io::tty_device::console_tty() + .lock() + .set_fg_pgid(caller_pgid); + FileDescriptor::Tty(crate::io::tty_device::console_tty().clone()) + } else { + let fd_abs = compose_abs(&base_abs, &raw_path); + FileDescriptor::VfsFile(fs::open_file::open(node, flags, fd_abs)) + }; let fd = self .current_process - .with_lock(|p| p.fd_table().allocate(FileDescriptor::VfsFile(open_file)))?; + .with_lock(|p| p.fd_table().allocate(descriptor))?; Ok(fd as isize) } diff --git a/crates/kernel/src/syscalls/linux.rs b/crates/kernel/src/syscalls/linux.rs index 2412f623..591efe91 100644 --- a/crates/kernel/src/syscalls/linux.rs +++ b/crates/kernel/src/syscalls/linux.rs @@ -101,6 +101,7 @@ linux_syscalls! 
{ SYSCALL_NR_PRLIMIT64 => prlimit64(pid: c_int, resource: c_uint, new_limit: Option<*const u8>, old_limit: Option<*mut u8>); SYSCALL_NR_PWRITE64 => pwrite64(fd: c_int, buf: *const u8, count: usize, offset: isize); SYSCALL_NR_READ => read(fd: c_int, buf: *mut u8, count: usize); + SYSCALL_NR_REBOOT => reboot(magic1: c_int, magic2: c_int, cmd: c_uint, _arg: usize); SYSCALL_NR_READLINKAT => readlinkat(dirfd: c_int, pathname: *const u8, buf: *mut u8, bufsiz: usize); SYSCALL_NR_READV => readv(fd: c_int, iov: *const iovec, iovcnt: c_int); SYSCALL_NR_RECVFROM => recvfrom(fd: c_int, buf: *mut u8, len: usize, flags: c_int, src_addr: Option<*mut u8>, addrlen: Option<*mut c_uint>); @@ -108,6 +109,7 @@ linux_syscalls! { SYSCALL_NR_RT_SIGACTION => rt_sigaction(sig: c_uint, act: Option<*const sigaction>, oact: Option<*mut sigaction>, sigsetsize: usize); SYSCALL_NR_RT_SIGPROCMASK => rt_sigprocmask(how: c_uint, set: Option<*const sigset_t>, oldset: Option<*mut sigset_t>, sigsetsize: usize); SYSCALL_NR_RT_SIGRETURN => rt_sigreturn(); + SYSCALL_NR_RT_SIGTIMEDWAIT => rt_sigtimedwait(set: *const sigset_t, info: Option<*mut u8>, timeout: Option<*const timespec>, sigsetsize: usize); SYSCALL_NR_SENDFILE => sendfile(out_fd: c_int, in_fd: c_int, offset: Option<*mut isize>, count: usize); SYSCALL_NR_SENDTO => sendto(fd: c_int, buf: *const u8, len: usize, flags: c_int, dest_addr: *const u8, addrlen: c_uint); SYSCALL_NR_SETGID => setgid(gid: c_uint); @@ -131,6 +133,7 @@ linux_syscalls! 
{ SYSCALL_NR_STATFS => statfs(pathname: *const u8, buf: *mut u8); SYSCALL_NR_STATX => statx(dirfd: c_int, pathname: *const u8, flags: c_int, mask: c_uint, statxbuf: *mut u8); SYSCALL_NR_SYMLINKAT => symlinkat(target: *const u8, newdirfd: c_int, linkpath: *const u8); + SYSCALL_NR_SYNC => sync(); SYSCALL_NR_SYSINFO => sysinfo(info: *mut u8); SYSCALL_NR_TGKILL => tgkill(tgid: c_int, tid: c_int, sig: c_int); SYSCALL_NR_TKILL => tkill(tid: c_int, sig: c_int); @@ -505,6 +508,17 @@ impl LinuxSyscalls for LinuxSyscallHandler { self.do_rt_sigreturn() } + async fn rt_sigtimedwait( + &mut self, + set: LinuxUserspaceArg<*const sigset_t>, + info: LinuxUserspaceArg>, + timeout: LinuxUserspaceArg>, + sigsetsize: usize, + ) -> Result { + self.do_rt_sigtimedwait(set, info, timeout, sigsetsize) + .await + } + async fn sigaltstack( &mut self, uss: LinuxUserspaceArg>, @@ -517,6 +531,16 @@ impl LinuxSyscalls for LinuxSyscallHandler { self.do_kill(pid, sig) } + async fn reboot( + &mut self, + magic1: c_int, + magic2: c_int, + cmd: c_uint, + _arg: usize, + ) -> Result { + self.do_reboot(magic1, magic2, cmd) + } + async fn tgkill(&mut self, _tgid: c_int, tid: c_int, sig: c_int) -> Result { let target_tid = Tid::try_from_i32(tid).ok_or(Errno::ESRCH)?; if let Some(sig) = crate::processes::signal::validate_signal(sig)? { @@ -744,6 +768,11 @@ impl LinuxSyscalls for LinuxSyscallHandler { self.do_sysinfo(info) } + // No-op: Solaya has no writeback caches, so there's nothing to flush. 
+ async fn sync(&mut self) -> Result { + Ok(0) + } + async fn getrlimit( &mut self, resource: c_uint, diff --git a/crates/kernel/src/syscalls/net_ops.rs b/crates/kernel/src/syscalls/net_ops.rs index 5fe4a2a5..c2e8f201 100644 --- a/crates/kernel/src/syscalls/net_ops.rs +++ b/crates/kernel/src/syscalls/net_ops.rs @@ -25,15 +25,14 @@ impl LinuxSyscallHandler { typ: c_int, _protocol: c_int, ) -> Result { - assert!( - domain == AF_INET, - "socket: only AF_INET supported (got domain={domain})" - ); + if domain != AF_INET { + return Err(Errno::EAFNOSUPPORT); + } let masked_type = typ & !SOCK_CLOEXEC; let descriptor = match masked_type { SOCK_DGRAM => FileDescriptor::UnboundUdpSocket, SOCK_STREAM => FileDescriptor::UnboundTcpSocket, - _ => panic!("socket: unsupported type {typ:#x}"), + _ => return Err(Errno::EPROTONOSUPPORT), }; let fd = self .current_process diff --git a/crates/kernel/src/syscalls/process_ops.rs b/crates/kernel/src/syscalls/process_ops.rs index c0b296f5..cff970e1 100644 --- a/crates/kernel/src/syscalls/process_ops.rs +++ b/crates/kernel/src/syscalls/process_ops.rs @@ -1,13 +1,19 @@ use alloc::{collections::BTreeMap, string::String, sync::Arc}; -use core::ffi::{c_int, c_ulong}; +use core::ffi::{c_int, c_uint, c_ulong}; use hal::spinlock::Spinlock; use headers::{ errno::Errno, - syscall_types::{CLONE_CHILD_CLEARTID, CLONE_PARENT_SETTID, CLONE_SETTLS}, + syscall_types::{ + CLONE_CHILD_CLEARTID, CLONE_PARENT_SETTID, CLONE_SETTLS, LINUX_REBOOT_CMD_CAD_OFF, + LINUX_REBOOT_CMD_CAD_ON, LINUX_REBOOT_CMD_HALT, LINUX_REBOOT_CMD_POWER_OFF, + LINUX_REBOOT_CMD_RESTART, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_MAGIC2A, + LINUX_REBOOT_MAGIC2B, LINUX_REBOOT_MAGIC2C, + }, }; use crate::{ cpu::Cpu, + info, memory::VirtAddr, processes::{ process::Process, @@ -21,6 +27,13 @@ use abi::{pid::Tid, syscalls::trap_frame::Register}; use super::linux::LinuxSyscallHandler; +const LINUX_REBOOT_MAGIC2_SET: &[u32] = &[ + LINUX_REBOOT_MAGIC2, + LINUX_REBOOT_MAGIC2A, + 
LINUX_REBOOT_MAGIC2B, + LINUX_REBOOT_MAGIC2C, +]; + impl LinuxSyscallHandler { pub(super) async fn clone_fork(&mut self, stack: usize) -> Result { let parent_regs = Cpu::read_trap_frame(); @@ -204,4 +217,27 @@ impl LinuxSyscallHandler { crate::debug!("Exit process with status: {status}\n"); Ok(0) } + + // TODO: require CAP_SYS_BOOT once credentials gain capability bits + pub(super) fn do_reboot( + &self, + magic1: c_int, + magic2: c_int, + cmd: c_uint, + ) -> Result { + if magic1.cast_unsigned() != LINUX_REBOOT_MAGIC1 + || !LINUX_REBOOT_MAGIC2_SET.contains(&magic2.cast_unsigned()) + { + return Err(Errno::EINVAL); + } + match cmd { + LINUX_REBOOT_CMD_CAD_OFF | LINUX_REBOOT_CMD_CAD_ON => Ok(0), + LINUX_REBOOT_CMD_HALT | LINUX_REBOOT_CMD_POWER_OFF => { + info!("No more processes to schedule, shutting down system"); + crate::test::qemu_exit::exit_success(); + } + LINUX_REBOOT_CMD_RESTART => crate::platform::reset::trigger_reset(), + _ => Err(Errno::EINVAL), + } + } } diff --git a/crates/kernel/src/syscalls/signal_ops.rs b/crates/kernel/src/syscalls/signal_ops.rs index a18d1dca..40ee37bb 100644 --- a/crates/kernel/src/syscalls/signal_ops.rs +++ b/crates/kernel/src/syscalls/signal_ops.rs @@ -1,12 +1,22 @@ -use core::ffi::{c_int, c_uint}; +use core::{ + ffi::{c_int, c_uint}, + future::Future, + pin::Pin, + task::{Context, Poll}, +}; + use headers::{ errno::Errno, syscall_types::{ _NSIG, SIG_BLOCK, SIG_SETMASK, SIG_UNBLOCK, SIGKILL, SIGSTOP, sigaction, sigset_t, stack_t, + timespec, }, }; -use crate::{processes::process_table, syscalls::linux_validator::LinuxUserspaceArg}; +use crate::{ + processes::{process_table, thread::ThreadRef}, + syscalls::linux_validator::LinuxUserspaceArg, +}; use abi::pid::Tid; use super::linux::LinuxSyscallHandler; @@ -122,4 +132,75 @@ impl LinuxSyscallHandler { })?; Ok(0) } + + pub(super) async fn do_rt_sigtimedwait( + &self, + set: LinuxUserspaceArg<*const sigset_t>, + info: LinuxUserspaceArg>, + timeout: LinuxUserspaceArg>, + sigsetsize: 
usize, + ) -> Result { + if sigsetsize != core::mem::size_of::() { + return Err(Errno::EINVAL); + } + // NULL-info is the only supported caller path for now. + if info.arg_nonzero() { + return Err(Errno::EINVAL); + } + let set = set.validate_ptr()?; + // SIGKILL/SIGSTOP cannot be waited for — strip them from the wait set. + let wait_mask = set.sig[0] & !(1u64 << SIGKILL) & !(1u64 << SIGSTOP); + + if let Some(t) = timeout.validate_ptr()? { + if t.tv_sec == 0 && t.tv_nsec == 0 { + // Poll: dequeue a matching pending signal, or EAGAIN. + return self.current_thread.with_lock(|mut th| { + match th.first_pending_in_set(wait_mask) { + Some(sig) => { + th.clear_pending(sig); + Ok(sig as isize) + } + None => Err(Errno::EAGAIN), + } + }); + } + // Finite non-zero timeouts are out of scope for now. + return Err(Errno::EINVAL); + } + + SigTimedWait { + thread: self.current_thread.clone(), + wait_mask, + } + .await + } +} + +struct SigTimedWait { + thread: ThreadRef, + wait_mask: u64, +} + +impl Future for SigTimedWait { + type Output = Result; + + fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll { + self.thread.with_lock(|mut t| { + if let Some(sig) = t.first_pending_in_set(self.wait_mask) { + t.clear_pending(sig); + return Poll::Ready(Ok(sig as isize)); + } + // Arm the sigtimedwait mask so send_signal wakes us for a + // matching signal even when it's blocked in sigmask; the + // unblocked-signal path stays responsible for EINTR delivery. + t.set_sigtimedwait_mask(self.wait_mask); + Poll::Pending + }) + } +} + +impl Drop for SigTimedWait { + fn drop(&mut self) { + self.thread.lock().clear_sigtimedwait_mask(); + } } diff --git a/doc/ai/BUILD.md b/doc/ai/BUILD.md index 05512b82..98deb784 100644 --- a/doc/ai/BUILD.md +++ b/doc/ai/BUILD.md @@ -64,20 +64,22 @@ just build Userspace is a buildroot-produced **cpio initramfs**, not a compile-time embedding: -1. `userspace-rust` builds Solaya's Rust binaries (init, dhcpd, tests) - into `build/userspace/artifacts/`. +1. 
`userspace-rust` builds Solaya's Rust binaries (dhcpd, tests) into + `build/userspace/artifacts/`. 2. `buildroot-overlay` copies those into `.buildroot/overlay/bin/`. -3. `buildroot-all` runs buildroot, which cross-builds dash + GNU - coreutils using the Bootlin prebuilt musl GCC toolchain +3. `buildroot-all` runs buildroot, which cross-builds busybox, dash, + and GNU coreutils using the Bootlin prebuilt musl GCC toolchain (`BR2_TOOLCHAIN_EXTERNAL_BOOTLIN`), then layers our overlay on top and emits `.buildroot/output/images/rootfs.cpio`. 4. `qemu_wrapper.sh` passes the cpio via `-initrd`; kernel reads `/chosen/linux,initrd-{start,end}` from the DTB, reserves the range in the page allocator, and `initramfs::extract()` unpacks it into the tmpfs-backed root. -5. `process_table::init` reads `/bin/init` from the VFS and runs it as - PID 1. That's Solaya's Rust `init`, which execs `/bin/dhcpd` and - `/bin/dash` by absolute path (both live in the cpio). +5. `process_table::init` reads `/sbin/init` from the VFS (buildroot + symlinks it to `/bin/busybox`) and runs busybox as PID 1. Busybox + reads `/etc/inittab` (shipped via overlay), runs `/etc/init.d/rcS`, + waits on `/bin/dhcpd` to configure the network, then respawns + `/bin/dash -i` on the console. Kernel unit tests no longer embed userspace fixtures; all userspace coverage lives in `system-tests/`, which boot the full image in QEMU. diff --git a/doc/ai/TESTING.md b/doc/ai/TESTING.md index f8fe419e..ee25200e 100644 --- a/doc/ai/TESTING.md +++ b/doc/ai/TESTING.md @@ -102,9 +102,9 @@ Reads from stdout until finding the needle string. QemuInstance::start() automatically waits for: 1. "Hello World from Solaya!" 2. "kernel_init done!" -3. "init process started" -4. "starting shell" -5. Shell prompt ("$ ") +3. "dhcpd: configured ip" (only when the test enables networking) +4. Shell prompt ("$ ") — emitted by dash once busybox init's + `console::respawn` entry has spawned the interactive shell. 
### Example Tests diff --git a/qemu-infra/src/qemu.rs b/qemu-infra/src/qemu.rs index 26108d22..0ce08b61 100644 --- a/qemu-infra/src/qemu.rs +++ b/qemu-infra/src/qemu.rs @@ -178,14 +178,13 @@ impl QemuInstance { stdout.assert_read_until("Hello World from Solaya!").await?; stdout.assert_read_until("kernel_init done!").await?; - // After kernel_init, async kernel tasks (like ext2 mount) run concurrently - // with the init process. Accumulate boot output to check if the ext2 init - // message was already seen before the prompt. - let mut boot_tail = stdout.assert_read_until("init process started").await?; + // Sync on dhcpd (when networked) + shell prompt. Accumulate boot + // output so we can tell whether "ext2: init complete" fired before + // the prompt arrived. + let mut boot_tail = Vec::new(); if network_port.is_some() { boot_tail.extend(stdout.assert_read_until("dhcpd: configured ip").await?); } - boot_tail.extend(stdout.assert_read_until("starting shell").await?); boot_tail.extend(stdout.assert_read_until(PROMPT).await?); if has_block_device { diff --git a/qemu_wrapper.sh b/qemu_wrapper.sh index e0bfb06c..5733e4ea 100755 --- a/qemu_wrapper.sh +++ b/qemu_wrapper.sh @@ -132,7 +132,7 @@ fi # Fall back to SOLAYA_INITRD env var if --initrd wasn't passed — lets the # CMake run/test targets inject the buildroot cpio without every caller # having to know the flag. Booting without a rootfs is not a supported -# mode (kernel panics on missing /bin/init), so require one of --initrd +# mode (kernel panics on missing /sbin/init), so require one of --initrd # or SOLAYA_INITRD and fail hard if the referenced file is missing — # silent fallback used to mask an unbuilt buildroot tree. 
if [[ -z "$INITRD_PATH" ]]; then diff --git a/system-tests/src/tests/basics.rs b/system-tests/src/tests/basics.rs index 523b7f9a..6d3e5c05 100644 --- a/system-tests/src/tests/basics.rs +++ b/system-tests/src/tests/basics.rs @@ -23,7 +23,7 @@ async fn shutdown() -> anyhow::Result<()> { let mut solaya = QemuInstance::start().await?; solaya - .run_prog_waiting_for("exit", "shutting down system") + .run_prog_waiting_for("halt", "shutting down system") .await?; assert!(solaya.wait_for_qemu_to_exit().await?.success()); diff --git a/tools/bindgen-driver/src/main.rs b/tools/bindgen-driver/src/main.rs index d0a4a66d..4bbecfe1 100644 --- a/tools/bindgen-driver/src/main.rs +++ b/tools/bindgen-driver/src/main.rs @@ -183,6 +183,7 @@ fn generate_syscall_types(args: &Args) -> Result<(), Box> .header(linux_h(args, "linux/uio.h")) .header(linux_h(args, "linux/wait.h")) .header(linux_h(args, "linux/prctl.h")) + .header(linux_h(args, "linux/reboot.h")) .generate()?; bindings.write_to_file(args.out_dir.join("syscall_types.rs"))?; Ok(()) diff --git a/userspace/src/bin/init.rs b/userspace/src/bin/init.rs deleted file mode 100644 index ad3b0e66..00000000 --- a/userspace/src/bin/init.rs +++ /dev/null @@ -1,12 +0,0 @@ -use userspace::spawn::spawn; - -fn main() { - println!("init process started"); - if let Ok(mut child) = spawn("/bin/dhcpd", &[]) { - let _ = child.wait(); - } - println!("starting shell"); - let mut child = spawn("/bin/dash", &[]).expect("Failed to spawn shell"); - child.wait().expect("Failed to wait for shell"); - println!("Initial shell has exited..."); -}