diff --git a/README.md b/README.md index 91c64e2..9caff5f 100644 --- a/README.md +++ b/README.md @@ -98,8 +98,14 @@ Follow child processes created by the tracee (like `strace -f`): ```shell pinchy -f -- sh -c 'ls | wc -l' ``` -The daemon learns about new children from the parent's fork/clone exit, so a -child's very first syscalls may be missed. +Trace lines are annotated with a fixed-width process name field +(`1234 [sh ]`) so interleaved output stays readable. The daemon learns about new children from the parent's +fork/clone exit, so a child's very first syscalls may be missed. + +Show the time spent in each syscall (like `strace -T`): +```shell +pinchy -T -p +``` Other knobs: diff --git a/pinchy-common/src/lib.rs b/pinchy-common/src/lib.rs index 19a11d8..c1597e1 100644 --- a/pinchy-common/src/lib.rs +++ b/pinchy-common/src/lib.rs @@ -101,7 +101,7 @@ pub const Q_XQUOTASYNC: i32 = 0x5807; pub const Q_XGETQSTATV: i32 = 0x5808; pub const Q_XGETNEXTQUOTA: i32 = 0x5809; -pub const WIRE_VERSION: u16 = 4; +pub const WIRE_VERSION: u16 = 5; pub const EFF_STAT_EVENTS_SUBMITTED: u32 = 0; pub const EFF_STAT_BYTES_SUBMITTED: u32 = 1; @@ -582,6 +582,9 @@ pub fn max_compact_payload_size() -> usize { max_payload_size } +// Same as the kernel's TASK_COMM_LEN. +pub const COMM_LEN: usize = 16; + #[repr(C)] #[derive(Clone, Copy, Debug, Default)] pub struct WireEventHeader { @@ -591,6 +594,10 @@ pub struct WireEventHeader { pub pid: u32, pub tid: u32, pub return_value: i64, + // Time spent in the syscall; 0 when no matching enter was seen. + pub duration_ns: u64, + // NUL-padded name of the process, as the kernel sees it. + pub comm: [u8; COMM_LEN], } #[repr(C)] diff --git a/pinchy-ebpf/src/main.rs b/pinchy-ebpf/src/main.rs index 560e632..d39087b 100644 --- a/pinchy-ebpf/src/main.rs +++ b/pinchy-ebpf/src/main.rs @@ -8,7 +8,7 @@ use aya_ebpf::{ macros::{map, tracepoint}, - maps::{Array, HashMap, ProgramArray, RingBuf}, + maps::{Array, HashMap, PerCpuArray, ProgramArray, RingBuf}, programs::TracePointContext, EbpfContext as _, }; @@ -58,6 +58,50 @@ static mut EVENTS: RingBuf = RingBuf::with_byte_size(83886080, 0); #[map] static mut ENTER_MAP: HashMap = HashMap::with_max_entries(10240, 0); +// Per-exit context shared between the dispatching exit program and the +// tail-called handler that builds the event. Filled once per exit so the +// many submit_compact_payload() monomorphizations stay small -- inlining +// the map lookups and helper calls at every submit site pushed the larger +// programs past the BPF branch offset range. Tracepoint programs are not +// preempted, so the per-CPU slot is stable across the tail call. +#[repr(C)] +#[derive(Clone, Copy)] +pub struct ExitContext { + pub duration_ns: u64, + pub comm: [u8; pinchy_common::COMM_LEN], +} + +#[map] +static mut EXIT_CONTEXT: PerCpuArray = PerCpuArray::with_max_entries(1, 0); + +// Record the time spent in the syscall (when we saw its enter) and the +// process name for the handler this exit tail-calls into. +#[inline(always)] +pub fn fill_exit_context(ctx: &TracePointContext, syscall_nr: i64) { + let tid = ctx.pid(); + + let mut duration_ns = 0u64; + + if let Some(enter_data) = unsafe { ENTER_MAP.get(&tid) } { + if enter_data.syscall_nr == syscall_nr { + duration_ns = unsafe { aya_ebpf::helpers::bpf_ktime_get_ns() } + .saturating_sub(enter_data.enter_ns); + } + } + + if let Some(exit_ctx) = unsafe { EXIT_CONTEXT.get_ptr_mut(0) } { + unsafe { + (*exit_ctx).duration_ns = duration_ns; + + let comm_ptr = core::ptr::addr_of_mut!((*exit_ctx).comm); + aya_ebpf::helpers::gen::bpf_get_current_comm( + comm_ptr as *mut core::ffi::c_void, + pinchy_common::COMM_LEN as u32, + ); + } + } +} + #[map(name = "SYSCALL_TAILCALLS")] static mut SYSCALL_TAILCALLS: ProgramArray = ProgramArray::pinned(512, 0); @@ -66,6 +110,7 @@ pub struct SyscallEnterData { pub tgid: u32, pub syscall_nr: i64, pub args: [usize; SYSCALL_ARGS_COUNT], + pub enter_ns: u64, } #[inline(always)] @@ -130,6 +175,7 @@ pub fn pinchy(ctx: TracePointContext) -> u32 { tgid, syscall_nr, args, + enter_ns: unsafe { aya_ebpf::helpers::bpf_ktime_get_ns() }, }; if let Err(err) = unsafe { ENTER_MAP.insert(&tid, &data, 0) } { @@ -182,6 +228,8 @@ pub fn pinchy_exit(ctx: TracePointContext) -> u32 { return Ok(0); } + fill_exit_context(&ctx, syscall_nr); + unsafe { let Err(_) = SYSCALL_TAILCALLS.tail_call(&ctx, syscall_nr as u32); error!(&ctx, "failed tailcall for syscall {}", syscall_nr); @@ -798,6 +846,32 @@ pub fn syscall_exit_trivial(ctx: TracePointContext) -> u32 { }, )?; } + _ => { + trace!(&ctx, "unknown syscall {}", syscall_nr); + } + } + + Ok(()) + } + match inner(ctx) { + Ok(_) => 0, + Err(ret) => ret, + } +} + +// Split from syscall_exit_trivial: with every submit site inlined, a single +// program holding all trivial syscalls exceeds the BPF branch offset range +// on x86_64 ("Branch target out of insn range" at link time). Each syscall +// is registered against exactly one of the two programs in the tailcall +// array. +#[tracepoint] +pub fn syscall_exit_trivial2(ctx: TracePointContext) -> u32 { + fn inner(ctx: TracePointContext) -> Result<(), u32> { + let syscall_nr = get_syscall_nr(&ctx)?; + let args = get_args(&ctx, syscall_nr)?; + let return_value = util::get_return_value(&ctx)?; + + match syscall_nr { syscalls::SYS_pidfd_getfd => { crate::util::submit_compact_payload::( &ctx, diff --git a/pinchy-ebpf/src/util.rs b/pinchy-ebpf/src/util.rs index 22af40c..a031f73 100644 --- a/pinchy-ebpf/src/util.rs +++ b/pinchy-ebpf/src/util.rs @@ -11,7 +11,7 @@ use pinchy_common::{ WireEventHeader, WIRE_VERSION, }; -use crate::{ENTER_MAP, EVENTS, SYSCALL_OFFSET, SYSCALL_RETURN_OFFSET}; +use crate::{ENTER_MAP, EVENTS, EXIT_CONTEXT, SYSCALL_OFFSET, SYSCALL_RETURN_OFFSET}; mod efficiency { #[cfg(feature = "efficiency-metrics")] @@ -147,7 +147,8 @@ pub fn get_args(ctx: &TracePointContext, expected_syscall_nr: i64) -> Result<[us // Copy the part of the enter data we care about... let args = enter_data.args; - // Then remove the item from the map. + // Then remove the item from the map. The syscall duration was already + // computed from the entry in fill_exit_context() before the tail call. let _ = unsafe { ENTER_MAP.remove(&tid) }; Ok(args) @@ -194,6 +195,14 @@ where let event = unsafe { event.assume_init_mut() as *mut WireCompactPayload as *mut u8 }; let header_ptr = event as *mut WireEventHeader; + // The duration and process name were captured once per exit in + // fill_exit_context(); a per-CPU read here keeps this (heavily + // monomorphized) function small. + let (duration_ns, comm) = match unsafe { EXIT_CONTEXT.get(0) } { + Some(exit_ctx) => (exit_ctx.duration_ns, exit_ctx.comm), + None => (0, [0u8; pinchy_common::COMM_LEN]), + }; + unsafe { header_ptr.write(WireEventHeader { version: WIRE_VERSION, @@ -202,6 +211,8 @@ where pid: ctx.tgid(), tid: ctx.pid(), return_value, + duration_ns, + comm, }); } diff --git a/pinchy/src/client.rs b/pinchy/src/client.rs index 8b9e3cd..81c83b1 100644 --- a/pinchy/src/client.rs +++ b/pinchy/src/client.rs @@ -150,6 +150,10 @@ struct Args { #[arg(short = 'f', long = "follow-forks")] follow_forks: bool, + /// Show the time spent in each syscall, strace-style + #[arg(short = 'T', long = "syscall-times")] + syscall_times: bool, + /// Write trace output to FILE instead of stderr #[arg(short = 'o', long = "output")] output: Option, @@ -193,13 +197,27 @@ async fn main() -> Result<()> { // Read everything there is to read, the server will close the write end // of the pipe - relay_to_sink(fd, style, args.output).await?; + relay_to_sink( + fd, + style, + args.output, + args.syscall_times, + args.follow_forks, + ) + .await?; pinchy_client::cleanup_and_quit(pid); } else if let Some(pid) = args.pid { let fd = pinchy_client::attach(pid, syscalls, args.follow_forks).await; - relay_to_sink(fd, style, args.output).await?; + relay_to_sink( + fd, + style, + args.output, + args.syscall_times, + args.follow_forks, + ) + .await?; } else { // Print clap's usage message to stderr and exit eprintln!("{}", Args::command().render_help()); @@ -215,15 +233,25 @@ async fn relay_to_sink( fd: OwnedFd, style: FormattingStyle, output: Option, + syscall_times: bool, + annotate_comm: bool, ) -> Result<()> { match output { Some(path) => { let file = tokio::fs::File::create(&path).await?; - relay_trace(fd, style, file, false).await + relay_trace(fd, style, file, false, syscall_times, annotate_comm).await } None => { let is_terminal = std::io::stderr().is_terminal(); - relay_trace(fd, style, tokio::io::stderr(), is_terminal).await + relay_trace( + fd, + style, + tokio::io::stderr(), + is_terminal, + syscall_times, + annotate_comm, + ) + .await } } } @@ -233,6 +261,8 @@ async fn relay_trace( formatting_style: FormattingStyle, sink: W, sink_is_terminal: bool, + syscall_times: bool, + annotate_comm: bool, ) -> Result<()> { let mut reader = tokio::io::BufReader::with_capacity( 64 * 1024, @@ -298,7 +328,15 @@ async fn relay_trace( // Safety: we own the output vector and won't progress before handle_event() returns. let pin_output = unsafe { Pin::new_unchecked(&mut output) }; - let formatter = Formatter::new(pin_output, formatting_style); + let mut formatter = Formatter::new(pin_output, formatting_style); + + if syscall_times && header.duration_ns > 0 { + formatter = formatter.with_duration(header.duration_ns); + } + + if annotate_comm { + formatter = formatter.with_comm(header.comm); + } events::handle_event(&header, &payload, formatter).await?; diff --git a/pinchy/src/formatting.rs b/pinchy/src/formatting.rs index 3108f46..3bb43fe 100644 --- a/pinchy/src/formatting.rs +++ b/pinchy/src/formatting.rs @@ -70,21 +70,84 @@ pub enum FormattingStyle { pub struct Formatter<'f> { style: FormattingStyle, output: Pin<&'f mut dyn AsyncWrite>, + duration_ns: Option, + comm: Option<[u8; pinchy_common::COMM_LEN]>, } impl<'f> Formatter<'f> { pub fn new(output: Pin<&'f mut dyn AsyncWrite>, style: FormattingStyle) -> Self { - Formatter { style, output } + Formatter { + style, + output, + duration_ns: None, + comm: None, + } + } + + // When set, finish() appends the time spent in the syscall, strace-style: + // ` <0.000123>`. + pub fn with_duration(mut self, duration_ns: u64) -> Self { + self.duration_ns = Some(duration_ns); + self + } + + // When set, push_syscall() annotates the PID with the process name. + // Used when following forks, where lines from several processes + // interleave. + pub fn with_comm(mut self, comm: [u8; pinchy_common::COMM_LEN]) -> Self { + self.comm = Some(comm); + self } pub async fn push_syscall(mut self, pid: u32, syscall_nr: i64) -> Result> { let syscall_name = syscall_name_from_nr(syscall_nr) .ok_or_else(|| anyhow!(format!("Unknown syscall: {syscall_nr}")))?; + // Width of the one-line `[comm]` field; longer names are truncated + // to keep the columns compact. + const COMM_DISPLAY_WIDTH: usize = pinchy_common::COMM_LEN / 2; + + // Bound for the process name on multi-line headers; comm is much + // shorter today, but richer process information (full command line, + // terminal-width awareness) may use the room later. + const MULTILINE_COMM_MAX: usize = 65; + + let comm = self.comm; let output = &mut self.output; output.write_all(pid.to_string().as_bytes()).await?; + if let Some(mut comm) = comm { + let len = comm.iter().position(|&b| b == 0).unwrap_or(comm.len()); + + // comm is set by the traced process (prctl PR_SET_NAME) and can + // hold arbitrary bytes; neutralize anything that could carry a + // terminal escape sequence. + for byte in comm[..len].iter_mut() { + if !byte.is_ascii_graphic() && *byte != b' ' { + *byte = b'?'; + } + } + + match self.style { + FormattingStyle::OneLine => { + let len = len.min(COMM_DISPLAY_WIDTH); + let mut field = [b' '; COMM_DISPLAY_WIDTH]; + field[..len].copy_from_slice(&comm[..len]); + + output.write_all(b" [").await?; + output.write_all(&field).await?; + output.write_all(b"]").await?; + } + FormattingStyle::MultiLine => { + output.write_all(b" ").await?; + output + .write_all(&comm[..len.min(MULTILINE_COMM_MAX)]) + .await?; + } + } + } + match self.style { FormattingStyle::OneLine => output.write_all(b" ").await?, FormattingStyle::MultiLine => output.write_all(b"\n\t").await?, @@ -224,6 +287,14 @@ impl<'f> SyscallFormatter<'f> { output.write_all(suffix).await?; } + if let Some(duration_ns) = self.formatter.duration_ns { + let seconds = duration_ns as f64 / 1_000_000_000.0; + + output + .write_all(format!(" <{seconds:.6}>").as_bytes()) + .await?; + } + output.write_all(b"\n").await?; Ok(self.formatter) diff --git a/pinchy/src/server.rs b/pinchy/src/server.rs index 24455c8..0536c52 100644 --- a/pinchy/src/server.rs +++ b/pinchy/src/server.rs @@ -396,14 +396,6 @@ fn load_tailcalls(ebpf: &mut Ebpf) -> anyhow::Result<()> { const TRIVIAL_SYSCALLS: &[i64] = &[ syscalls::SYS_dup3, syscalls::SYS_fcntl, - syscalls::SYS_sched_yield, - syscalls::SYS_getpid, - syscalls::SYS_gettid, - syscalls::SYS_getuid, - syscalls::SYS_geteuid, - syscalls::SYS_getgid, - syscalls::SYS_getegid, - syscalls::SYS_getppid, syscalls::SYS_brk, syscalls::SYS_mprotect, syscalls::SYS_getrandom, @@ -414,12 +406,9 @@ fn load_tailcalls(ebpf: &mut Ebpf) -> anyhow::Result<()> { syscalls::SYS_rt_tgsigqueueinfo, syscalls::SYS_fchdir, syscalls::SYS_exit_group, - syscalls::SYS_rt_sigreturn, syscalls::SYS_dup, #[cfg(target_arch = "x86_64")] syscalls::SYS_dup2, - syscalls::SYS_sync, - syscalls::SYS_setsid, syscalls::SYS_setuid, syscalls::SYS_setgid, syscalls::SYS_close_range, @@ -427,7 +416,6 @@ fn load_tailcalls(ebpf: &mut Ebpf) -> anyhow::Result<()> { syscalls::SYS_getsid, syscalls::SYS_setpgid, syscalls::SYS_umask, - syscalls::SYS_vhangup, syscalls::SYS_ioprio_get, syscalls::SYS_ioprio_set, syscalls::SYS_setregid, @@ -436,10 +424,6 @@ fn load_tailcalls(ebpf: &mut Ebpf) -> anyhow::Result<()> { syscalls::SYS_setreuid, #[cfg(target_arch = "x86_64")] syscalls::SYS_alarm, - #[cfg(target_arch = "x86_64")] - syscalls::SYS_pause, - #[cfg(target_arch = "x86_64")] - syscalls::SYS_getpgrp, syscalls::SYS_personality, syscalls::SYS_getpriority, syscalls::SYS_setpriority, @@ -462,6 +446,28 @@ fn load_tailcalls(ebpf: &mut Ebpf) -> anyhow::Result<()> { syscalls::SYS_fchmod, syscalls::SYS_fchown, syscalls::SYS_flock, + syscalls::SYS_pidfd_open, + syscalls::SYS_process_mrelease, + syscalls::SYS_inotify_init1, + syscalls::SYS_inotify_rm_watch, + ]; + const TRIVIAL2_SYSCALLS: &[i64] = &[ + syscalls::SYS_sched_yield, + syscalls::SYS_getpid, + syscalls::SYS_gettid, + syscalls::SYS_getuid, + syscalls::SYS_geteuid, + syscalls::SYS_getgid, + syscalls::SYS_getegid, + syscalls::SYS_getppid, + syscalls::SYS_rt_sigreturn, + syscalls::SYS_sync, + syscalls::SYS_setsid, + syscalls::SYS_vhangup, + #[cfg(target_arch = "x86_64")] + syscalls::SYS_pause, + #[cfg(target_arch = "x86_64")] + syscalls::SYS_getpgrp, syscalls::SYS_futex_wake, syscalls::SYS_map_shadow_stack, syscalls::SYS_lsm_set_self_attr, @@ -469,9 +475,7 @@ fn load_tailcalls(ebpf: &mut Ebpf) -> anyhow::Result<()> { #[cfg(target_arch = "x86_64")] syscalls::SYS_epoll_create, syscalls::SYS_epoll_create1, - syscalls::SYS_pidfd_open, syscalls::SYS_pidfd_getfd, - syscalls::SYS_process_mrelease, syscalls::SYS_mlock, syscalls::SYS_mlock2, syscalls::SYS_mlockall, @@ -492,9 +496,7 @@ fn load_tailcalls(ebpf: &mut Ebpf) -> anyhow::Result<()> { syscalls::SYS_eventfd2, #[cfg(target_arch = "x86_64")] syscalls::SYS_inotify_init, - syscalls::SYS_inotify_init1, syscalls::SYS_set_mempolicy_home_node, - syscalls::SYS_inotify_rm_watch, syscalls::SYS_timer_delete, syscalls::SYS_timer_getoverrun, #[cfg(target_arch = "x86_64")] @@ -553,6 +555,19 @@ fn load_tailcalls(ebpf: &mut Ebpf) -> anyhow::Result<()> { explicitly_supported.insert(syscall_nr); } + // Second half of the trivial syscalls; see the eBPF side for why the + // handler is split in two. + let prog: &mut aya::programs::TracePoint = ebpf + .program_mut("syscall_exit_trivial2") + .context("missing syscall_exit_trivial2 tailcall")? + .try_into()?; + prog.load() + .context("trying to load syscall_exit_trivial2 into eBPF")?; + for &syscall_nr in TRIVIAL2_SYSCALLS { + prog_array.set(syscall_nr as u32, prog.fd()?, 0)?; + explicitly_supported.insert(syscall_nr); + } + // execve and execveat share the same handler, see eBPF side comments for details. let prog: &mut aya::programs::TracePoint = ebpf .program_mut("syscall_exit_execve") diff --git a/pinchy/src/tests/mod.rs b/pinchy/src/tests/mod.rs index 72a9994..52c6a27 100644 --- a/pinchy/src/tests/mod.rs +++ b/pinchy/src/tests/mod.rs @@ -29,6 +29,8 @@ pub(crate) fn make_compact_test_data( pid, tid: pid, return_value, + duration_ns: 0, + comm: [0u8; pinchy_common::COMM_LEN], }; let payload = unsafe { @@ -64,6 +66,80 @@ macro_rules! syscall_test { }; } +#[::tokio::test] +async fn syscall_duration_suffix() { + let data = pinchy_common::CloseData { fd: 3 }; + let (header, payload) = + make_compact_test_data(pinchy_common::syscalls::SYS_close, 100, 0, &data); + + let mut output: Vec = vec![]; + let pin_output = unsafe { std::pin::Pin::new_unchecked(&mut output) }; + let formatter = + crate::formatting::Formatter::new(pin_output, crate::formatting::FormattingStyle::OneLine) + .with_duration(123_456); + + crate::events::handle_event(&header, &payload, formatter) + .await + .unwrap(); + + assert_eq!( + String::from_utf8_lossy(&output), + "100 close(fd: 3) = 0 (success) <0.000123>\n" + ); +} + +#[::tokio::test] +async fn syscall_comm_annotation() { + let data = pinchy_common::CloseData { fd: 3 }; + let (mut header, payload) = + make_compact_test_data(pinchy_common::syscalls::SYS_close, 100, 0, &data); + + let comm = b"test-helper"; + header.comm[..comm.len()].copy_from_slice(comm); + + let mut output: Vec = vec![]; + let pin_output = unsafe { std::pin::Pin::new_unchecked(&mut output) }; + let formatter = + crate::formatting::Formatter::new(pin_output, crate::formatting::FormattingStyle::OneLine) + .with_comm(header.comm); + + crate::events::handle_event(&header, &payload, formatter) + .await + .unwrap(); + + assert_eq!( + String::from_utf8_lossy(&output), + "100 [test-hel] close(fd: 3) = 0 (success)\n" + ); +} + +#[::tokio::test] +async fn syscall_comm_annotation_multiline() { + let data = pinchy_common::CloseData { fd: 3 }; + let (mut header, payload) = + make_compact_test_data(pinchy_common::syscalls::SYS_close, 100, 0, &data); + + let comm = b"test-helper"; + header.comm[..comm.len()].copy_from_slice(comm); + + let mut output: Vec = vec![]; + let pin_output = unsafe { std::pin::Pin::new_unchecked(&mut output) }; + let formatter = crate::formatting::Formatter::new( + pin_output, + crate::formatting::FormattingStyle::MultiLine, + ) + .with_comm(header.comm); + + crate::events::handle_event(&header, &payload, formatter) + .await + .unwrap(); + + assert_eq!( + String::from_utf8_lossy(&output), + "100 test-helper\n\tclose(\n\t fd: 3\n\t) = 0 (success)\n" + ); +} + syscall_test!( parse_generic_syscall, { diff --git a/pinchy/tests/integration.rs b/pinchy/tests/integration.rs index 710b454..1117706 100644 --- a/pinchy/tests/integration.rs +++ b/pinchy/tests/integration.rs @@ -362,8 +362,9 @@ fn follow_forks() { // follow-forks trace sees the child's openat. let handle = run_workload_with_args(&pinchy, &["openat"], "fork_test", &["-f"]); + // Lines are annotated with the process name when following forks. let expected_output = escaped_regex(indoc! {r#" - @PID@ openat(dfd: AT_FDCWD, pathname: "/dev/null", flags: 0x0 (O_RDONLY), mode: 0) = @NUMBER@ (fd) + @PID@ [test-hel] openat(dfd: AT_FDCWD, pathname: "/dev/null", flags: 0x0 (O_RDONLY), mode: 0) = @NUMBER@ (fd) "#}); let output = handle.join().unwrap(); @@ -381,6 +382,29 @@ fn follow_forks() { )); } +#[test] +fn syscall_times() { + let pinchy = PinchyTest::new(); + + let handle = run_workload_with_args(&pinchy, &["fcntl"], "fcntl_test", &["-T"]); + + // Every trace line should carry a duration suffix. + let expected_output = escaped_regex(indoc! {r#" + @PID@ fcntl(fd: 3, cmd: F_GETFD, arg: 0x0) = 0 (success) <@NUMBER@.@NUMBER@> + "#}); + + let output = handle.join().unwrap(); + Assert::new(output) + .success() + .stderr(predicate::str::is_match(&expected_output).unwrap()); + + // Server output - has to be at the end, since we kill the server for waiting. + let output = pinchy.wait(); + Assert::new(output) + .success() + .stdout(predicate::str::ends_with("Exiting...\n")); +} + #[test] fn pipe_operations_syscalls() { let pinchy = PinchyTest::new();