From 0616dab86ae18fb006e119ef11e48ad7439fd7cd Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Fri, 8 May 2026 15:06:39 -0500 Subject: [PATCH] fix(shuf): cap range and repeat output allocation --- crates/bashkit/src/builtins/shuf.rs | 276 ++++++++++++++++---- crates/bashkit/tests/shuf_resource_tests.rs | 30 +++ specs/threat-model.md | 2 + 3 files changed, 264 insertions(+), 44 deletions(-) create mode 100644 crates/bashkit/tests/shuf_resource_tests.rs diff --git a/crates/bashkit/src/builtins/shuf.rs b/crates/bashkit/src/builtins/shuf.rs index 34050dc3f..ad3959d10 100644 --- a/crates/bashkit/src/builtins/shuf.rs +++ b/crates/bashkit/src/builtins/shuf.rs @@ -4,8 +4,13 @@ //! `bashkit-coreutils-port` codegen tool — see `generated/shuf_args.rs` //! and `crates/bashkit-coreutils-port/`. Behaviour is implemented locally //! against the bashkit VFS. +//! +//! Resource decision: shuf builds an ExecResult/String in-process, so it must +//! reject outputs that exceed ExecutionLimits before allocation and must never +//! materialize numeric ranges only to apply `-n` afterward. use async_trait::async_trait; +use std::collections::HashMap; use std::ffi::OsString; use std::ops::RangeInclusive; use std::path::Path; @@ -14,6 +19,7 @@ use super::generated::shuf_args::shuf_command; use super::{Builtin, Context, read_text_file}; use crate::error::Result; use crate::interpreter::ExecResult; +use crate::limits::ExecutionLimits; pub struct Shuf; @@ -71,10 +77,10 @@ impl Builtin for Shuf { let separator = if zero_terminated { '\0' } else { '\n' }; - let items: Vec = if echo { - positionals + let input = if echo { + ShufInput::Items(positionals) } else if let Some(range) = input_range { - range.map(|n| n.to_string()).collect() + ShufInput::Range(range) } else { // Reading lines: positional file path, "-", or absent (stdin). let raw = match positionals.first() { @@ -92,53 +98,22 @@ impl Builtin for Shuf { } } }; - split_separated(&raw, separator) + ShufInput::Items(split_separated(&raw, separator)) }; + // THREAT[TM-DOS-090]: Bind shuf's in-process output construction to + // ExecutionLimits before range/repeat loops can allocate. + let output_limit = shuf_output_limit(&ctx, output_path.is_some()); let mut rng = SmallRng::seed_from_now(); - - let output_lines: Vec = if repeat { - // -r samples *with* replacement: each pick is independent. - // GNU shuf -r without -n loops forever; bashkit caps at 1 - // and requires -n, mirroring the safe behavior an embedded - // VFS shell needs. - let count = match head_count { - Some(n) => n, - None => { - return Ok(ExecResult::err( - "shuf: --repeat requires --head-count to be finite\n".to_string(), - 1, - )); - } - }; - if items.is_empty() { - return Ok(ExecResult::err( - "shuf: no lines to repeat from\n".to_string(), - 1, - )); - } - (0..count) - .map(|_| items[rng.next_usize_lt(items.len())].clone()) - .collect() + let out = match if repeat { + render_repeat(input, head_count, separator, output_limit, &mut rng) } else { - // Without -r: Fisher-Yates shuffle, then truncate to -n. - let mut v = items; - for i in (1..v.len()).rev() { - let j = rng.next_usize_lt(i + 1); - v.swap(i, j); - } - if let Some(n) = head_count { - v.truncate(n as usize); - } - v + render_non_repeat(input, head_count, separator, output_limit, &mut rng) + } { + Ok(out) => out, + Err(stderr) => return Ok(ExecResult::err(stderr, 1)), }; - let mut out = String::with_capacity(output_lines.iter().map(String::len).sum::()); - for line in &output_lines { - out.push_str(line); - out.push(separator); - } - if let Some(path) = output_path { let resolved = if path.is_absolute() { path.clone() @@ -171,6 +146,208 @@ fn split_separated(raw: &str, sep: char) -> Vec { out } +enum ShufInput { + Items(Vec), + Range(RangeInclusive), +} + +fn shuf_output_limit(ctx: &Context<'_>, output_to_file: bool) -> usize { + let exec_limit = ctx + .execution_extension::() + .map(|limits| limits.max_stdout_bytes) + .unwrap_or_else(|| ExecutionLimits::default().max_stdout_bytes); + + if !output_to_file { + return exec_limit; + } + + let fs_limits = ctx.fs.limits(); + exec_limit + .min(u64_to_usize_saturating(fs_limits.max_file_size)) + .min(u64_to_usize_saturating(fs_limits.max_total_bytes)) +} + +fn render_repeat( + input: ShufInput, + head_count: Option, + separator: char, + output_limit: usize, + rng: &mut SmallRng, +) -> std::result::Result { + // -r samples *with* replacement: each pick is independent. + // GNU shuf -r without -n loops forever; bashkit requires -n because + // an embedded VFS shell needs a finite output contract. + let count = match head_count { + Some(n) => n, + None => { + return Err("shuf: --repeat requires --head-count to be finite\n".to_string()); + } + }; + + match input { + ShufInput::Items(items) => { + if items.is_empty() { + return Err("shuf: no lines to repeat from\n".to_string()); + } + let max_line_len = items + .iter() + .map(String::len) + .max() + .unwrap_or(0) + .saturating_add(separator.len_utf8()); + ensure_output_fits(count as u128, max_line_len as u128, output_limit)?; + + let mut out = + String::with_capacity(repeat_capacity(count, max_line_len, output_limit)?); + for _ in 0..count { + out.push_str(&items[rng.next_usize_lt(items.len())]); + out.push(separator); + } + Ok(out) + } + ShufInput::Range(range) => { + let Some((start, end, len)) = range_parts(&range) else { + return Err("shuf: no lines to repeat from\n".to_string()); + }; + let max_line_len = max_range_value_len(start, end).saturating_add(separator.len_utf8()); + ensure_output_fits(count as u128, max_line_len as u128, output_limit)?; + + let mut out = + String::with_capacity(repeat_capacity(count, max_line_len, output_limit)?); + for _ in 0..count { + let offset = rng.next_u128_lt(len); + out.push_str(&(start as u128 + offset).to_string()); + out.push(separator); + } + Ok(out) + } + } +} + +fn render_non_repeat( + input: ShufInput, + head_count: Option, + separator: char, + output_limit: usize, + rng: &mut SmallRng, +) -> std::result::Result { + match input { + ShufInput::Items(mut items) => { + for i in (1..items.len()).rev() { + let j = rng.next_usize_lt(i + 1); + items.swap(i, j); + } + if let Some(n) = head_count { + items.truncate(u64_to_usize_saturating(n)); + } + render_items(items, separator, output_limit) + } + ShufInput::Range(range) => { + let Some((start, end, len)) = range_parts(&range) else { + return Ok(String::new()); + }; + let output_count = head_count.map(u128::from).unwrap_or(len).min(len); + let max_line_len = max_range_value_len(start, end).saturating_add(separator.len_utf8()); + ensure_output_fits(output_count, max_line_len as u128, output_limit)?; + + let count = + usize::try_from(output_count).map_err(|_| output_too_large(output_limit))?; + let values = sample_range_without_replacement(start, len, count, rng); + render_items(values, separator, output_limit) + } + } +} + +fn render_items( + items: Vec, + separator: char, + output_limit: usize, +) -> std::result::Result { + let output_len = + items_output_len(&items, separator).ok_or_else(|| output_too_large(output_limit))?; + if output_len > output_limit { + return Err(output_too_large(output_limit)); + } + + let mut out = String::with_capacity(output_len); + for line in &items { + out.push_str(line); + out.push(separator); + } + Ok(out) +} + +fn sample_range_without_replacement( + start: u64, + len: u128, + count: usize, + rng: &mut SmallRng, +) -> Vec { + let mut swaps: HashMap = HashMap::with_capacity(count.saturating_mul(2)); + let mut out = Vec::with_capacity(count); + for i in 0..count as u128 { + let j = i + rng.next_u128_lt(len - i); + let selected = *swaps.get(&j).unwrap_or(&j); + let replacement = *swaps.get(&i).unwrap_or(&i); + swaps.insert(j, replacement); + out.push((start as u128 + selected).to_string()); + } + out +} + +fn range_parts(range: &RangeInclusive) -> Option<(u64, u64, u128)> { + let start = *range.start(); + let end = *range.end(); + if start > end { + return None; + } + Some((start, end, u128::from(end - start) + 1)) +} + +fn items_output_len(items: &[String], separator: char) -> Option { + let sep_len = separator.len_utf8(); + items.iter().try_fold(0usize, |sum, item| { + sum.checked_add(item.len())?.checked_add(sep_len) + }) +} + +fn ensure_output_fits( + count: u128, + max_line_len: u128, + output_limit: usize, +) -> std::result::Result<(), String> { + let Some(max_output_len) = count.checked_mul(max_line_len) else { + return Err(output_too_large(output_limit)); + }; + if max_output_len > output_limit as u128 { + return Err(output_too_large(output_limit)); + } + Ok(()) +} + +fn repeat_capacity( + count: u64, + max_line_len: usize, + output_limit: usize, +) -> std::result::Result { + let capacity = (count as u128) + .checked_mul(max_line_len as u128) + .ok_or_else(|| output_too_large(output_limit))?; + usize::try_from(capacity).map_err(|_| output_too_large(output_limit)) +} + +fn max_range_value_len(start: u64, end: u64) -> usize { + start.to_string().len().max(end.to_string().len()) +} + +fn u64_to_usize_saturating(n: u64) -> usize { + usize::try_from(n).unwrap_or(usize::MAX) +} + +fn output_too_large(output_limit: usize) -> String { + format!("shuf: output too large (limit {output_limit} bytes)\n") +} + // Note: range parsing is handled by `parse_range` inlined into the // generated `shuf_args.rs` (codegen copies it from uutils' source). // We don't need a second parser here; clap returns the parsed @@ -228,6 +405,17 @@ impl SmallRng { } } } + + fn next_u128_lt(&mut self, bound: u128) -> u128 { + debug_assert!(bound > 0); + loop { + let value = (u128::from(self.next_u64()) << 64) | u128::from(self.next_u64()); + let zone = u128::MAX - (u128::MAX % bound); + if value < zone { + return value % bound; + } + } + } } #[cfg(test)] diff --git a/crates/bashkit/tests/shuf_resource_tests.rs b/crates/bashkit/tests/shuf_resource_tests.rs new file mode 100644 index 000000000..f54a6c9fd --- /dev/null +++ b/crates/bashkit/tests/shuf_resource_tests.rs @@ -0,0 +1,30 @@ +use bashkit::{Bash, ExecutionLimits}; + +#[tokio::test] +async fn shuf_range_head_count_does_not_materialize_full_range() { + let limits = ExecutionLimits::new().max_stdout_bytes(64); + let mut bash = Bash::builder().limits(limits).build(); + + let result = bash + .exec("shuf -i 1-18446744073709551615 -n 1") + .await + .unwrap(); + + assert_eq!(result.exit_code, 0); + assert!(!result.stdout_truncated); + let value = result.stdout.trim().parse::().unwrap(); + assert!(value >= 1); +} + +#[tokio::test] +async fn shuf_repeat_head_count_is_checked_before_output_allocation() { + let limits = ExecutionLimits::new().max_stdout_bytes(16); + let mut bash = Bash::builder().limits(limits).build(); + + let result = bash.exec("shuf -r -e x -n 50").await.unwrap(); + + assert_eq!(result.exit_code, 1); + assert!(!result.stdout_truncated); + assert!(result.stdout.is_empty()); + assert!(result.stderr.contains("output too large")); +} diff --git a/specs/threat-model.md b/specs/threat-model.md index 406018d44..74a55b089 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -1338,6 +1338,7 @@ This section maps former vulnerability IDs to the new threat ID scheme and track | ~~TM-DOS-044~~ | ~~Nested `$()` stack overflow (regression)~~ | ~~Process crash (SIGABRT) at depth ~50 despite #492 fix~~ | ~~Interpreter execution path may need separate depth tracking from lexer fix~~ — depth-50 nested-subst test (`finding_nested_cmd_subst_stack_overflow::depth_50_is_bounded`) passes (**FIXED**) | | TM-DOS-088 | Command substitution OOM via state cloning | OOM at depth N (memory ≈ N × state_size) | Dedicated `max_subst_depth` limit (default 32), separate from `max_function_depth` — **FIXED** via #1088 | | TM-DOS-089 | Command substitution stack overflow via inlined futures | SIGABRT at ~20-30 nested $() levels | Box::pin `expand_word` and `execute_cmd_subst` to cap per-level stack — **FIXED** via #1089 | +| ~~TM-DOS-090~~ | ~~`shuf` unbounded range/repeat materialization~~ | ~~OOM/CPU exhaustion via huge `--input-range` or `--head-count` before stdout truncation~~ | ~~Sample numeric ranges without full collection and reject output that exceeds `ExecutionLimits` before allocation~~ — `shuf_resource_tests` cover huge range `-n 1` and repeat output caps (**FIXED**) | ### Accepted (Low Priority) @@ -1369,6 +1370,7 @@ This section maps former vulnerability IDs to the new threat ID scheme and track | Arithmetic depth limit (50) | TM-DOS-026 | `interpreter/mod.rs` | Yes | | Builtin parser depth limit (100) | TM-DOS-027 | `builtins/awk.rs`, `builtins/jq/` | Yes | | Execution timeout (30s) | TM-DOS-023 | `limits.rs` | Yes | +| Builtin output pre-allocation caps | TM-DOS-058, TM-DOS-090 | `limits.rs`, `builtins/shuf.rs` | Yes | | Virtual filesystem | TM-ESC-001, TM-ESC-003 | `fs/memory.rs` | Yes | | Filesystem limits | TM-DOS-005 to TM-DOS-010, TM-DOS-014 | `fs/limits.rs` | Yes | | Path depth limit (100) | TM-DOS-012 | `fs/limits.rs` | Yes |