Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,14 @@ Follow child processes created by the tracee (like `strace -f`):
```shell
pinchy -f -- sh -c 'ls | wc -l'
```
The daemon learns about new children from the parent's fork/clone exit, so a
child's very first syscalls may be missed.
Trace lines are annotated with a fixed-width process name field
(`1234 [sh ]`) so interleaved output stays readable. The daemon learns about new children from the parent's
fork/clone exit, so a child's very first syscalls may be missed.

Show the time spent in each syscall (like `strace -T`):
```shell
pinchy -T -p <PID>
```

Other knobs:

Expand Down
9 changes: 8 additions & 1 deletion pinchy-common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ pub const Q_XQUOTASYNC: i32 = 0x5807;
pub const Q_XGETQSTATV: i32 = 0x5808;
pub const Q_XGETNEXTQUOTA: i32 = 0x5809;

pub const WIRE_VERSION: u16 = 4;
pub const WIRE_VERSION: u16 = 5;

pub const EFF_STAT_EVENTS_SUBMITTED: u32 = 0;
pub const EFF_STAT_BYTES_SUBMITTED: u32 = 1;
Expand Down Expand Up @@ -582,6 +582,9 @@ pub fn max_compact_payload_size() -> usize {
max_payload_size
}

// Same as the kernel's TASK_COMM_LEN.
pub const COMM_LEN: usize = 16;

#[repr(C)]
#[derive(Clone, Copy, Debug, Default)]
pub struct WireEventHeader {
Expand All @@ -591,6 +594,10 @@ pub struct WireEventHeader {
pub pid: u32,
pub tid: u32,
pub return_value: i64,
// Time spent in the syscall; 0 when no matching enter was seen.
pub duration_ns: u64,
// NUL-padded name of the process, as the kernel sees it.
pub comm: [u8; COMM_LEN],
}

#[repr(C)]
Expand Down
76 changes: 75 additions & 1 deletion pinchy-ebpf/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

use aya_ebpf::{
macros::{map, tracepoint},
maps::{Array, HashMap, ProgramArray, RingBuf},
maps::{Array, HashMap, PerCpuArray, ProgramArray, RingBuf},
programs::TracePointContext,
EbpfContext as _,
};
Expand Down Expand Up @@ -58,6 +58,50 @@ static mut EVENTS: RingBuf = RingBuf::with_byte_size(83886080, 0);
#[map]
static mut ENTER_MAP: HashMap<u32, SyscallEnterData> = HashMap::with_max_entries(10240, 0);

// Per-exit context shared between the dispatching exit program and the
// tail-called handler that builds the event. Filled once per exit so the
// many submit_compact_payload() monomorphizations stay small -- inlining
// the map lookups and helper calls at every submit site pushed the larger
// programs past the BPF branch offset range. Tracepoint programs are not
// preempted, so the per-CPU slot is stable across the tail call.
#[repr(C)]
#[derive(Clone, Copy)]
pub struct ExitContext {
pub duration_ns: u64,
pub comm: [u8; pinchy_common::COMM_LEN],
}

#[map]
static mut EXIT_CONTEXT: PerCpuArray<ExitContext> = PerCpuArray::with_max_entries(1, 0);

// Record the time spent in the syscall (when we saw its enter) and the
// process name for the handler this exit tail-calls into.
#[inline(always)]
pub fn fill_exit_context(ctx: &TracePointContext, syscall_nr: i64) {
let tid = ctx.pid();

let mut duration_ns = 0u64;

if let Some(enter_data) = unsafe { ENTER_MAP.get(&tid) } {
if enter_data.syscall_nr == syscall_nr {
duration_ns = unsafe { aya_ebpf::helpers::bpf_ktime_get_ns() }
.saturating_sub(enter_data.enter_ns);
}
}

if let Some(exit_ctx) = unsafe { EXIT_CONTEXT.get_ptr_mut(0) } {
unsafe {
(*exit_ctx).duration_ns = duration_ns;

let comm_ptr = core::ptr::addr_of_mut!((*exit_ctx).comm);
aya_ebpf::helpers::gen::bpf_get_current_comm(
comm_ptr as *mut core::ffi::c_void,
pinchy_common::COMM_LEN as u32,
);
}
}
}

#[map(name = "SYSCALL_TAILCALLS")]
static mut SYSCALL_TAILCALLS: ProgramArray = ProgramArray::pinned(512, 0);

Expand All @@ -66,6 +110,7 @@ pub struct SyscallEnterData {
pub tgid: u32,
pub syscall_nr: i64,
pub args: [usize; SYSCALL_ARGS_COUNT],
pub enter_ns: u64,
}

#[inline(always)]
Expand Down Expand Up @@ -130,6 +175,7 @@ pub fn pinchy(ctx: TracePointContext) -> u32 {
tgid,
syscall_nr,
args,
enter_ns: unsafe { aya_ebpf::helpers::bpf_ktime_get_ns() },
};

if let Err(err) = unsafe { ENTER_MAP.insert(&tid, &data, 0) } {
Expand Down Expand Up @@ -182,6 +228,8 @@ pub fn pinchy_exit(ctx: TracePointContext) -> u32 {
return Ok(0);
}

fill_exit_context(&ctx, syscall_nr);

unsafe {
let Err(_) = SYSCALL_TAILCALLS.tail_call(&ctx, syscall_nr as u32);
error!(&ctx, "failed tailcall for syscall {}", syscall_nr);
Expand Down Expand Up @@ -798,6 +846,32 @@ pub fn syscall_exit_trivial(ctx: TracePointContext) -> u32 {
},
)?;
}
_ => {
trace!(&ctx, "unknown syscall {}", syscall_nr);
}
}

Ok(())
}
match inner(ctx) {
Ok(_) => 0,
Err(ret) => ret,
}
}

// Split from syscall_exit_trivial: with every submit site inlined, a single
// program holding all trivial syscalls exceeds the BPF branch offset range
// on x86_64 ("Branch target out of insn range" at link time). Each syscall
// is registered against exactly one of the two programs in the tailcall
// array.
#[tracepoint]
pub fn syscall_exit_trivial2(ctx: TracePointContext) -> u32 {
fn inner(ctx: TracePointContext) -> Result<(), u32> {
let syscall_nr = get_syscall_nr(&ctx)?;
let args = get_args(&ctx, syscall_nr)?;
let return_value = util::get_return_value(&ctx)?;

match syscall_nr {
syscalls::SYS_pidfd_getfd => {
crate::util::submit_compact_payload::<pinchy_common::PidfdGetfdData, _>(
&ctx,
Expand Down
15 changes: 13 additions & 2 deletions pinchy-ebpf/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use pinchy_common::{
WireEventHeader, WIRE_VERSION,
};

use crate::{ENTER_MAP, EVENTS, SYSCALL_OFFSET, SYSCALL_RETURN_OFFSET};
use crate::{ENTER_MAP, EVENTS, EXIT_CONTEXT, SYSCALL_OFFSET, SYSCALL_RETURN_OFFSET};

mod efficiency {
#[cfg(feature = "efficiency-metrics")]
Expand Down Expand Up @@ -147,7 +147,8 @@ pub fn get_args(ctx: &TracePointContext, expected_syscall_nr: i64) -> Result<[us
// Copy the part of the enter data we care about...
let args = enter_data.args;

// Then remove the item from the map.
// Then remove the item from the map. The syscall duration was already
// computed from the entry in fill_exit_context() before the tail call.
let _ = unsafe { ENTER_MAP.remove(&tid) };

Ok(args)
Expand Down Expand Up @@ -194,6 +195,14 @@ where
let event = unsafe { event.assume_init_mut() as *mut WireCompactPayload<T> as *mut u8 };
let header_ptr = event as *mut WireEventHeader;

// The duration and process name were captured once per exit in
// fill_exit_context(); a per-CPU read here keeps this (heavily
// monomorphized) function small.
let (duration_ns, comm) = match unsafe { EXIT_CONTEXT.get(0) } {
Some(exit_ctx) => (exit_ctx.duration_ns, exit_ctx.comm),
None => (0, [0u8; pinchy_common::COMM_LEN]),
};

unsafe {
header_ptr.write(WireEventHeader {
version: WIRE_VERSION,
Expand All @@ -202,6 +211,8 @@ where
pid: ctx.tgid(),
tid: ctx.pid(),
return_value,
duration_ns,
comm,
});
}

Expand Down
48 changes: 43 additions & 5 deletions pinchy/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,10 @@ struct Args {
#[arg(short = 'f', long = "follow-forks")]
follow_forks: bool,

/// Show the time spent in each syscall, strace-style
#[arg(short = 'T', long = "syscall-times")]
syscall_times: bool,

/// Write trace output to FILE instead of stderr
#[arg(short = 'o', long = "output")]
output: Option<std::path::PathBuf>,
Expand Down Expand Up @@ -193,13 +197,27 @@ async fn main() -> Result<()> {

// Read everything there is to read, the server will close the write end
// of the pipe
relay_to_sink(fd, style, args.output).await?;
relay_to_sink(
fd,
style,
args.output,
args.syscall_times,
args.follow_forks,
)
.await?;

pinchy_client::cleanup_and_quit(pid);
} else if let Some(pid) = args.pid {
let fd = pinchy_client::attach(pid, syscalls, args.follow_forks).await;

relay_to_sink(fd, style, args.output).await?;
relay_to_sink(
fd,
style,
args.output,
args.syscall_times,
args.follow_forks,
)
.await?;
} else {
// Print clap's usage message to stderr and exit
eprintln!("{}", Args::command().render_help());
Expand All @@ -215,15 +233,25 @@ async fn relay_to_sink(
fd: OwnedFd,
style: FormattingStyle,
output: Option<std::path::PathBuf>,
syscall_times: bool,
annotate_comm: bool,
) -> Result<()> {
match output {
Some(path) => {
let file = tokio::fs::File::create(&path).await?;
relay_trace(fd, style, file, false).await
relay_trace(fd, style, file, false, syscall_times, annotate_comm).await
}
None => {
let is_terminal = std::io::stderr().is_terminal();
relay_trace(fd, style, tokio::io::stderr(), is_terminal).await
relay_trace(
fd,
style,
tokio::io::stderr(),
is_terminal,
syscall_times,
annotate_comm,
)
.await
}
}
}
Expand All @@ -233,6 +261,8 @@ async fn relay_trace<W: tokio::io::AsyncWrite + Unpin>(
formatting_style: FormattingStyle,
sink: W,
sink_is_terminal: bool,
syscall_times: bool,
annotate_comm: bool,
) -> Result<()> {
let mut reader = tokio::io::BufReader::with_capacity(
64 * 1024,
Expand Down Expand Up @@ -298,7 +328,15 @@ async fn relay_trace<W: tokio::io::AsyncWrite + Unpin>(

// Safety: we own the output vector and won't progress before handle_event() returns.
let pin_output = unsafe { Pin::new_unchecked(&mut output) };
let formatter = Formatter::new(pin_output, formatting_style);
let mut formatter = Formatter::new(pin_output, formatting_style);

if syscall_times && header.duration_ns > 0 {
formatter = formatter.with_duration(header.duration_ns);
}

if annotate_comm {
formatter = formatter.with_comm(header.comm);
}

events::handle_event(&header, &payload, formatter).await?;

Expand Down
Loading
Loading