From 92d6ab64bec74f9a252c28aa4043bd2901280660 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Mon, 11 May 2026 09:26:01 +0000 Subject: [PATCH] fix(coreutils-port): accept let-bound Command chain in uu_app validator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream uutils refactored truncate's uu_app() to `let cmd = Command::new(...); configure_localized_command(cmd).arg(...)...`. The Rewriter folds the wrapper call to `cmd`, leaving a two-statement `let = ; .(...)` body — which the strict single-expression validator was rejecting, breaking the weekly Coreutils Args Drift workflow on every regen. Decompose `validate_uu_app_body` into `validate_command_chain_expr` and a new `validate_let_bound_command_body` that requires a simple `let ` pattern, a Command::new-rooted initializer, and a tail method-chain whose innermost receiver is exactly the let-bound identifier. Both branches share the existing `UuAppExprValidator` (disallowed methods, closures, macros, blocks) so TM-INF-025's trust boundary is unchanged. New regression tests cover the happy path (with the real `configure_localized_command` wrapper the Rewriter strips), a non-command initializer, a tail not chained off the binding, and a 3-statement body. Verified by regenerating all 11 ported utils against the pinned rev (byte-identical to repo after cargo fmt) and against upstream HEAD (truncate now regenerates cleanly). Specs: updated coreutils-args-port.md and threat-model.md TM-INF-025 to document the broadened accepted shapes and new regression tests. --- crates/bashkit-coreutils-port/src/args.rs | 220 +++++++++++++++++++++- specs/coreutils-args-port.md | 16 +- specs/threat-model.md | 2 +- 3 files changed, 224 insertions(+), 14 deletions(-) diff --git a/crates/bashkit-coreutils-port/src/args.rs b/crates/bashkit-coreutils-port/src/args.rs index 165bf358..5f4ad657 100644 --- a/crates/bashkit-coreutils-port/src/args.rs +++ b/crates/bashkit-coreutils-port/src/args.rs @@ -643,23 +643,38 @@ fn collect_option_constants(file: &syn::File) -> Vec { } // THREAT[TM-INF-025]: args-mode consumes third-party Rust from uutils. -// Only a single clap builder expression may become executable bashkit -// code; prefix statements would run during CI/parser construction. +// Only clap builder expressions may become executable bashkit code; +// any other statement shape would run during CI/parser construction. +// +// Two accepted body shapes: +// 1. `` (single tail expression) +// 2. `let = ; +// .(...)...` +// — equivalent to a single chain split across a binding. The +// tail's innermost receiver must be the let-bound identifier; +// both halves run through the same chain validator. fn validate_uu_app_body(uu_app: &ItemFn) -> Result<()> { - let command_expr = match uu_app.block.stmts.as_slice() { - [Stmt::Expr(expr, None)] => expr, + match uu_app.block.stmts.as_slice() { + [Stmt::Expr(expr, None)] => validate_command_chain_expr(expr), + [Stmt::Local(local), Stmt::Expr(tail_expr, None)] => { + validate_let_bound_command_body(local, tail_expr) + } stmts => bail!( - "unsafe uu_app body: expected a single clap Command builder expression, found {} statements", + "unsafe uu_app body: expected a single clap Command builder expression \ + or `let = Command::new(...)...; .method(...)...`, \ + found {} statements", stmts.len() ), - }; + } +} +fn validate_command_chain_expr(expr: &Expr) -> Result<()> { let mut validator = UuAppExprValidator { errors: vec![] }; - syn::visit::Visit::visit_expr(&mut validator, command_expr); + syn::visit::Visit::visit_expr(&mut validator, expr); if !validator.errors.is_empty() { bail!("unsafe uu_app body: {}", validator.errors.join("; ")); } - if !chain_roots_at_command_new(command_expr) { + if !chain_roots_at_command_new(expr) { bail!( "unsafe uu_app body: tail expression must be a clap::Command::new(...) builder chain" ); @@ -667,6 +682,83 @@ fn validate_uu_app_body(uu_app: &ItemFn) -> Result<()> { Ok(()) } +fn validate_let_bound_command_body(local: &syn::Local, tail_expr: &Expr) -> Result<()> { + let binding_ident = simple_let_ident(local).ok_or_else(|| { + anyhow!( + "unsafe uu_app body: let binding must be `let = ...` \ + (no destructuring, no let-else)" + ) + })?; + let init = local + .init + .as_ref() + .ok_or_else(|| anyhow!("unsafe uu_app body: let binding must have an initializer"))?; + if init.diverge.is_some() { + bail!("unsafe uu_app body: let-else is not allowed"); + } + // The bound expression must itself be a Command::new(...) builder + // chain so the only thing the binding can hold is a clap Command. + validate_command_chain_expr(&init.expr)?; + + // The tail must be a method chain whose innermost receiver is the + // let-bound identifier — i.e. structurally a continuation of the + // builder chain. + let mut validator = UuAppExprValidator { errors: vec![] }; + syn::visit::Visit::visit_expr(&mut validator, tail_expr); + if !validator.errors.is_empty() { + bail!("unsafe uu_app body: {}", validator.errors.join("; ")); + } + if !tail_chains_back_to_ident(tail_expr, binding_ident) { + bail!( + "unsafe uu_app body: tail expression must be a `{}.method(...)...` \ + chain on the let binding", + binding_ident + ); + } + Ok(()) +} + +/// Returns the bound identifier of a plain `let [: T] = ...;`. +/// Rejects destructuring, `ref`, subpatterns, or attributes — anything +/// that could hide an active expression inside the binding pattern. +fn simple_let_ident(local: &syn::Local) -> Option<&syn::Ident> { + let pat = match &local.pat { + syn::Pat::Type(pt) => &*pt.pat, + other => other, + }; + match pat { + syn::Pat::Ident(pi) + if pi.attrs.is_empty() && pi.by_ref.is_none() && pi.subpat.is_none() => + { + Some(&pi.ident) + } + _ => None, + } +} + +/// Walk down method-call receivers and require the innermost expression +/// to be a bare reference to `target` — i.e. the tail is shaped like +/// `target.foo(...).bar(...)`. Anything else (a fresh `Command::new`, +/// a different binding, a function call) is rejected. +fn tail_chains_back_to_ident(expr: &Expr, target: &syn::Ident) -> bool { + let mut cur = expr; + loop { + match cur { + Expr::MethodCall(mc) => cur = &mc.receiver, + Expr::Path(p) + if p.qself.is_none() + && p.path.leading_colon.is_none() + && p.path.segments.len() == 1 + && p.path.segments[0].ident == *target + && matches!(p.path.segments[0].arguments, syn::PathArguments::None) => + { + return true; + } + _ => return false, + } + } +} + struct UuAppExprValidator { errors: Vec, } @@ -1203,4 +1295,116 @@ pub fn uu_app() -> clap::Command { let msg = format!("{err:#}"); assert!(msg.contains("Command::new"), "got: {msg}"); } + + #[test] + fn accepts_let_bound_command_with_tail_chain() { + // Upstream uutils truncate now splits uu_app() into + // `let cmd = Command::new(...)...; + // configure_localized_command(cmd).arg(...)...`. + // The Rewriter folds `configure_localized_command(cmd)` to `cmd`, + // leaving exactly the `let`-then-tail shape the validator must accept. + let (_tmp, uutils) = fixture(&[ + ( + "src/uu/cat/src/cat.rs", + r#" +mod options { + pub static FILE: &str = "file"; +} + +pub fn uu_app() -> clap::Command { + let cmd = Command::new("cat") + .version(uucore::crate_version!()) + .infer_long_args(true); + uucore::clap_localization::configure_localized_command(cmd) + .arg(Arg::new(options::FILE)) +} +"#, + ), + ("src/uu/cat/locales/en-US.ftl", ""), + ]); + + let body = run(&uutils, "cat", "poc").unwrap(); + assert!(body.contains("pub fn cat_command() -> clap::Command")); + assert!(body.contains("Command::new(\"cat\")")); + // The configure_localized_command(cmd) wrapper is folded away; + // only `cmd.arg(...)` remains. + assert!( + !body.contains("configure_localized_command"), + "wrapper should be stripped: {body}" + ); + assert!(body.contains("cmd.arg("), "got: {body}"); + } + + #[test] + fn rejects_let_with_non_command_initializer() { + let (_tmp, uutils) = fixture(&[ + ( + "src/uu/cat/src/cat.rs", + r#" +mod options { + pub static FILE: &str = "file"; +} + +pub fn uu_app() -> clap::Command { + let cmd = std::process::Command::new("sh").arg("-c").arg("nope").status().unwrap(); + Command::new("cat").arg(Arg::new(options::FILE)) +} +"#, + ), + ("src/uu/cat/locales/en-US.ftl", ""), + ]); + + let err = run(&uutils, "cat", "poc").unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("unsafe uu_app"), "got: {msg}"); + } + + #[test] + fn rejects_tail_not_chained_off_let_binding() { + let (_tmp, uutils) = fixture(&[ + ( + "src/uu/cat/src/cat.rs", + r#" +mod options { + pub static FILE: &str = "file"; +} + +pub fn uu_app() -> clap::Command { + let cmd = Command::new("cat"); + Command::new("cat").arg(Arg::new(options::FILE)) +} +"#, + ), + ("src/uu/cat/locales/en-US.ftl", ""), + ]); + + let err = run(&uutils, "cat", "poc").unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("chain on the let binding"), "got: {msg}"); + } + + #[test] + fn rejects_three_statement_body() { + let (_tmp, uutils) = fixture(&[ + ( + "src/uu/cat/src/cat.rs", + r#" +mod options { + pub static FILE: &str = "file"; +} + +pub fn uu_app() -> clap::Command { + let cmd = Command::new("cat"); + std::process::abort(); + cmd.arg(Arg::new(options::FILE)) +} +"#, + ), + ("src/uu/cat/locales/en-US.ftl", ""), + ]); + + let err = run(&uutils, "cat", "poc").unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("found 3 statements"), "got: {msg}"); + } } diff --git a/specs/coreutils-args-port.md b/specs/coreutils-args-port.md index bf6bb293..db59b614 100644 --- a/specs/coreutils-args-port.md +++ b/specs/coreutils-args-port.md @@ -45,11 +45,17 @@ codegen**, not by depending on `uu_*` crates at runtime. through the shim — if it does, every uutils env-default auto-lights as that option's bashkit support lands. 5. Validates the rewritten `uu_app()` before emission: args mode accepts - only a single tail clap `Command` builder expression. Prefix - statements, block expressions, loops, matches, async blocks, and - unsafe blocks are rejected before any generated Rust is written. - This keeps third-party uutils source from smuggling arbitrary - executable statements into `_command()` (TM-INF-025). + either a single tail clap `Command` builder expression, or the + two-statement shape `let = Command::new(...); + .(...)` where the `let` initializer is itself + a Command::new chain and the tail's innermost receiver is the + let-bound identifier (the same shape that emerges after the Rewriter + folds `uucore::clap_localization::configure_localized_command(cmd)` + to `cmd`). Anything else — additional prefix statements, block + expressions, loops, matches, async blocks, unsafe blocks, or a + destructuring/let-else binding — is rejected before any generated + Rust is written. This keeps third-party uutils source from smuggling + arbitrary executable statements into `_command()` (TM-INF-025). 6. Emits a generated file under `crates/bashkit/src/builtins/generated/_args.rs` with a clean `pub fn _command() -> clap::Command`. diff --git a/specs/threat-model.md b/specs/threat-model.md index a7a87828..b0f3f4aa 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -440,7 +440,7 @@ All execution stays within the virtual interpreter — no OS subprocess is spawn | TM-INF-022 | Library Debug shapes leak via stderr | Builtins wrapping external libs (jaq, regex, serde_json, semver, chrono, …) format errors with `format!("{:?}", e)` and dump internal struct shapes into stderr — e.g. jq printed `(File { code: "<800 chars of prepended compat-defs source>", path: () }, [("@tsv", Filter(0))])` to LLM agents | Three-layer enforcement, all run by `cargo test`: (1) static — `builtins::tests::no_debug_fmt_in_builtin_source` walks every `crates/bashkit/src/builtins/*.rs` and forbids `{:?}` / `{:#?}` / `{name:?}` (per-line opt-out: `// debug-ok: `); (2) dynamic per-tool — each tool's `mod tests` calls `bashkit::testing::assert_no_leak` with malformed inputs; (3) fuzz — every cargo-fuzz target plus 5 proptest cases call `bashkit::testing::assert_fuzz_invariants` against random input and check the same invariants on stderr/stdout | **FIXED** | | TM-INF-023 | jq `halt` / `halt_error` terminate the host process | jaq-std 3.0's `halt(N)` native calls `std::process::exit(exit_code)`. `defs.jq` wraps it with `def halt: halt(0);` and `def halt_error(...): ..., halt(...);`. Untrusted jq filters can invoke `halt` / `halt_error` to exit the entire embedding process, escaping the `ExecResult`-based sandbox boundary and causing process-wide DoS for any caller hosting bashkit. | Strip the upstream `halt` native from `jaq_std::funs::()` and chain in a safe replacement that pops the variable argument and returns `Error::str("halt is disabled in the bashkit sandbox")`. The wrapper defs (`halt`, `halt_error`) still resolve, so callers see a normal jq runtime error (exit 5) instead of the host being killed. Regression tests in `builtins::jq::tests`: `halt_does_not_terminate_host_process`, `halt_with_arg_does_not_terminate_host_process`, `halt_error_does_not_terminate_host_process`. | **FIXED** | | TM-INF-024 | Host env side-channel via clap `Arg::env(...)` | Clap-based builtins ported from uutils (currently `ls`, with `.env("TABSIZE")` and `.env("TIME_STYLE")` on `uu_app()`) resolve those defaults from `std::env`, not bashkit's `ctx.env`. This breaks the sandbox boundary two ways: (1) presence-probe — a script can tell whether the host process has `TIME_STYLE`/`TABSIZE` set by observing `ls`'s behaviour; (2) availability — a host- or container-set `TIME_STYLE` materialises as a clap value source for an option bashkit hasn't implemented yet, so the unsupported-option gate fires on every plain `ls` and breaks the builtin for unrelated tenants | Four-layer fix: (1) **codegen strips runtime `.env(...)` and harvests it** — `bashkit-coreutils-port` rewrites `.env("FOO")` chained method calls out of every Arg builder it ports AND emits a sidecar `_ENV_DEFAULTS: &[EnvDefault]` table recording each stripped annotation; (2) **virtual-env shim** — `crate::builtins::clap_env::apply_env_defaults` rewrites argv from `ctx.env` (never `std::env`) before `try_get_matches_from`, emulating clap's "argv > env > default" precedence so uutils' env-default UX is preserved without re-opening the host channel; (3) **static guards** — `no_clap_env_in_generated_parsers` forbids runtime `.env(` calls in `generated/*.rs`, and `every_generated_parser_emits_env_defaults_table` enforces every util emits the sidecar (uniform surface, no per-util conditionals); (4) **defence-in-depth** — workspace `clap` dep drops the `env` cargo feature, so a re-introduced `.env(...)` fails to compile rather than silently re-opening the channel. Coverage: ls runtime tests pin both directions — `ls_ignores_host_time_style_and_tabsize` (host `std::env` ignored) and `ls_honors_virtual_env_time_style` (virtual `ctx.env` honoured) | **FIXED** | -| TM-INF-025 | Untrusted generated Rust runs in drift CI with repository write token | The coreutils argument-drift workflow consumes third-party `uutils/coreutils` source, regenerates Rust from `uu_app()`, then builds/tests bashkit. If the generator preserved arbitrary upstream statements and the same job held `contents: write`/`pull-requests: write`, malicious upstream code could execute during parser construction with repository write impact. | Two-layer fix: (1) `bashkit-coreutils-port` validates args-mode `uu_app()` before emission and rejects anything other than a single tail clap `Command` builder expression; regression test `rejects_executable_statements_in_uu_app` proves write+abort payloads fail codegen. (2) `.github/workflows/coreutils-args-drift.yml` separates privilege: regeneration/build/test has only `contents: read` and `persist-credentials: false` on both checkouts; the later `open-pr` job gets write permission but does not build or execute generated Rust, only commits the tested artifact. | **FIXED** | +| TM-INF-025 | Untrusted generated Rust runs in drift CI with repository write token | The coreutils argument-drift workflow consumes third-party `uutils/coreutils` source, regenerates Rust from `uu_app()`, then builds/tests bashkit. If the generator preserved arbitrary upstream statements and the same job held `contents: write`/`pull-requests: write`, malicious upstream code could execute during parser construction with repository write impact. | Two-layer fix: (1) `bashkit-coreutils-port` validates args-mode `uu_app()` before emission and accepts only (a) a single tail clap `Command` builder expression or (b) `let = Command::new(...); .(...)` where the initializer is itself a Command::new chain and the tail's innermost receiver is the let-bound ident; both shapes run through the same disallowed-method/closure/macro/block visitor. Regression tests `rejects_executable_statements_in_uu_app`, `rejects_let_with_non_command_initializer`, `rejects_tail_not_chained_off_let_binding`, and `rejects_three_statement_body` prove write+abort, hijacked initializers, mismatched tails, and inserted statements all fail codegen. (2) `.github/workflows/coreutils-args-drift.yml` separates privilege: regeneration/build/test has only `contents: read` and `persist-credentials: false` on both checkouts; the later `open-pr` job gets write permission but does not build or execute generated Rust, only commits the tested artifact. | **FIXED** | **TM-INF-022**: Generalizes TM-INF-016 to cover the whole builtin surface. The originating bug was `builtins/jq/` formatting jaq's compile/parse