From 2000da4696a6a2f2456f610bd7a189565ac7c96c Mon Sep 17 00:00:00 2001 From: Naohiro CHIKAMATSU Date: Wed, 20 May 2026 23:01:16 +0900 Subject: [PATCH] fix(content_disposition): enforce RFC 7230 quoted-pair grammar strictly (#50) --- CHANGELOG.md | 4 + src/multipartkit/content_disposition.gleam | 19 +++- src/multipartkit/error.gleam | 11 +++ src/multipartkit/internal/text.gleam | 110 +++++++++++++++++++++ test/regression_cd_quoted_pair_test.gleam | 67 +++++++++++++ 5 files changed, 206 insertions(+), 5 deletions(-) create mode 100644 test/regression_cd_quoted_pair_test.gleam diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d3acc4..17db6ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/). ## [Unreleased] +### Changed + +- `multipartkit/content_disposition.parse` now enforces RFC 7230 §3.2.6 `quoted-pair` strictly: a `\X` whose `X` is outside `HTAB / SP / VCHAR / obs-text` (for example `NUL`, `CR`, `LF`, or any other ASCII control byte) is rejected with the new `Error(InvalidQuotedPair(_))` instead of silently dropping the backslash, blocking `NUL`-smuggling into the decoded `name` / `filename`. (#50) + ## [0.12.0] - 2026-05-11 ### Changed diff --git a/src/multipartkit/content_disposition.gleam b/src/multipartkit/content_disposition.gleam index 1c0c693..6065513 100644 --- a/src/multipartkit/content_disposition.gleam +++ b/src/multipartkit/content_disposition.gleam @@ -3,8 +3,10 @@ import gleam/list import gleam/option.{type Option, None, Some} import gleam/result import gleam/string -import multipartkit/error.{type MultipartError, InvalidContentDisposition} -import multipartkit/internal/text +import multipartkit/error.{ + type MultipartError, InvalidContentDisposition, InvalidQuotedPair, +} +import multipartkit/internal/text.{QuotedPairInvalid, QuotedSyntax} /// A parsed `Content-Disposition` header value. /// @@ -19,7 +21,13 @@ import multipartkit/internal/text /// RFC 5987 / RFC 8187 `*`-form when present (with the `*`-form taking /// precedence over the plain form), otherwise from the plain parameter /// value with surrounding quotes removed and backslash-escapes resolved -/// per RFC 7230 quoted-string rules. +/// per RFC 7230 §3.2.6 quoted-string / quoted-pair rules. A `\X` +/// escape whose `X` is outside the `quoted-pair` grammar +/// (`HTAB / SP / VCHAR / obs-text`) — for instance `NUL`, `CR`, `LF`, +/// or any other ASCII control byte — causes `parse` to return +/// `Error(InvalidQuotedPair(original_value))` instead of silently +/// dropping the backslash. This blocks `NUL`-smuggling into the +/// decoded `name` / `filename`. /// - `params/1` contains every parameter as it appeared in the input /// including `filename` and `name`, in order. Duplicate parameter names /// are preserved left-to-right; only the first occurrence wins for the @@ -116,8 +124,9 @@ fn parse_one_param( _ -> case string.pop_grapheme(after_key) { Ok(#("=", value_rest)) -> - case text.read_token_or_quoted(value_rest) { - Error(Nil) -> Error(InvalidContentDisposition(original)) + case text.read_token_or_quoted_strict(value_rest) { + Error(QuotedPairInvalid) -> Error(InvalidQuotedPair(original)) + Error(QuotedSyntax) -> Error(InvalidContentDisposition(original)) Ok(#(raw_value, tail)) -> parse_params(tail, [#(key, raw_value), ..acc], original) } diff --git a/src/multipartkit/error.gleam b/src/multipartkit/error.gleam index 54f756b..0659072 100644 --- a/src/multipartkit/error.gleam +++ b/src/multipartkit/error.gleam @@ -51,4 +51,15 @@ pub type MultipartError { /// inject a header break or split into a different `name: value` pair /// at parse time. Carries the offending name. InvalidHeaderName(name: String) + /// A `Content-Disposition` quoted-string parameter contained a + /// `\X` escape whose `X` is outside the RFC 7230 §3.2.6 + /// `quoted-pair` grammar (`HTAB / SP / VCHAR / obs-text`). The + /// offending second character is not `HTAB`, `SP`, a `VCHAR` + /// (`%x21-7E`), nor `obs-text` (`%x80-FF`) — for example + /// `NUL`, `CR`, `LF`, or any other ASCII control byte. Allowing + /// these would let an attacker smuggle `NUL` into the decoded + /// `name` / `filename` (e.g. for C-string truncation attacks). + /// The carried value is the original Content-Disposition header + /// text so callers can render diagnostics. + InvalidQuotedPair(String) } diff --git a/src/multipartkit/internal/text.gleam b/src/multipartkit/internal/text.gleam index 0d88aac..9aa5cc5 100644 --- a/src/multipartkit/internal/text.gleam +++ b/src/multipartkit/internal/text.gleam @@ -100,6 +100,116 @@ pub fn read_token_or_quoted(input: String) -> Result(#(String, String), Nil) { } } +/// Internal: Fault categories returned by `read_token_or_quoted_strict`. +/// +/// - `QuotedSyntax`: the quoted-string is malformed in a way that is +/// not a quoted-pair grammar violation (e.g. unterminated, missing +/// opening quote, or an empty token where one was expected). +/// - `QuotedPairInvalid`: the input was otherwise a well-formed +/// quoted-string but contained a `\X` escape whose `X` falls outside +/// the RFC 7230 §3.2.6 `quoted-pair` grammar +/// (`HTAB / SP / VCHAR / obs-text`). +pub type QuotedFault { + QuotedSyntax + QuotedPairInvalid +} + +/// Internal: Like `read_token_or_quoted`, but enforces RFC 7230 §3.2.6 +/// strictly inside quoted-strings: the second character of a +/// `quoted-pair` (`\X`) MUST be `HTAB`, `SP`, a `VCHAR` (`%x21-7E`), +/// or `obs-text` (`%x80-FF`). Any other byte (NUL, CR, LF, other ASCII +/// control bytes, or DEL) results in `Error(QuotedPairInvalid)`. +pub fn read_token_or_quoted_strict( + input: String, +) -> Result(#(String, String), QuotedFault) { + case string.starts_with(input, "\"") { + True -> read_quoted_string_strict(input) + False -> { + let #(token, rest) = read_token(input) + case token { + "" -> Error(QuotedSyntax) + _ -> Ok(#(token, rest)) + } + } + } +} + +fn read_quoted_string_strict( + input: String, +) -> Result(#(String, String), QuotedFault) { + case string.pop_grapheme(input) { + Ok(#("\"", rest)) -> read_quoted_loop_strict(rest, "") + _ -> Error(QuotedSyntax) + } +} + +fn read_quoted_loop_strict( + input: String, + acc: String, +) -> Result(#(String, String), QuotedFault) { + case string.pop_grapheme(input) { + Error(_) -> Error(QuotedSyntax) + Ok(#("\"", rest)) -> Ok(#(acc, rest)) + Ok(#("\\", rest)) -> + case string.pop_grapheme(rest) { + Error(_) -> Error(QuotedSyntax) + Ok(#(escaped, after)) -> + case is_valid_quoted_pair_second_char(escaped) { + True -> read_quoted_loop_strict(after, acc <> escaped) + False -> Error(QuotedPairInvalid) + } + } + Ok(#(grapheme, rest)) -> read_quoted_loop_strict(rest, acc <> grapheme) + } +} + +/// Internal: RFC 7230 §3.2.6 — the second character of a quoted-pair +/// must be `HTAB / SP / VCHAR / obs-text`. `HTAB` is `%x09`, `SP` is +/// `%x20`, `VCHAR` is `%x21-7E`, `obs-text` is `%x80-FF`. Excluded +/// here are `NUL` (`%x00`), other `%x01-08`, `LF` (`%x0A`), `CR` +/// (`%x0D`), the remaining `%x0B-1F` control bytes, and `DEL` +/// (`%x7F`). Non-ASCII graphemes (any code point ≥ U+0080) are +/// permitted as `obs-text`. +fn is_valid_quoted_pair_second_char(grapheme: String) -> Bool { + case grapheme { + "\t" -> True + " " -> True + "\u{0000}" -> False + "\u{0001}" -> False + "\u{0002}" -> False + "\u{0003}" -> False + "\u{0004}" -> False + "\u{0005}" -> False + "\u{0006}" -> False + "\u{0007}" -> False + "\u{0008}" -> False + "\n" -> False + "\u{000B}" -> False + "\u{000C}" -> False + "\r" -> False + "\u{000E}" -> False + "\u{000F}" -> False + "\u{0010}" -> False + "\u{0011}" -> False + "\u{0012}" -> False + "\u{0013}" -> False + "\u{0014}" -> False + "\u{0015}" -> False + "\u{0016}" -> False + "\u{0017}" -> False + "\u{0018}" -> False + "\u{0019}" -> False + "\u{001A}" -> False + "\u{001B}" -> False + "\u{001C}" -> False + "\u{001D}" -> False + "\u{001E}" -> False + "\u{001F}" -> False + "\u{007F}" -> False + _ -> True + } +} + /// Internal: Lowercase ASCII A-Z; preserve all other code points. Used in /// places where we must avoid Unicode case-folding (e.g., header names per /// RFC 7230). diff --git a/test/regression_cd_quoted_pair_test.gleam b/test/regression_cd_quoted_pair_test.gleam new file mode 100644 index 0000000..59f82b5 --- /dev/null +++ b/test/regression_cd_quoted_pair_test.gleam @@ -0,0 +1,67 @@ +//// Regression tests for Issue #50: `multipartkit/content_disposition.parse` +//// quoted-pair handling must align with RFC 7230 §3.2.6 +//// +//// quoted-pair = "\" ( HTAB / SP / VCHAR / obs-text ) +//// +//// where `VCHAR = %x21-7E` and `obs-text = %x80-FF`. A `\X` whose `X` +//// is outside that set (NUL, CR, LF, other control bytes, or DEL) +//// must be rejected with `Error(InvalidQuotedPair(...))` rather than +//// silently dropping the backslash — otherwise an attacker can smuggle +//// a NUL byte into a decoded `name` / `filename`. + +import gleam/option.{Some} +import gleeunit/should +import multipartkit/content_disposition as cd + +// === RFC 7230 §3.2.6 strict compliance === + +pub fn cd_canonical_escape_backslash_test() { + let assert Ok(p) = cd.parse("form-data; name=\"\\\\backslash\"") + cd.name(p) |> should.equal(Some("\\backslash")) +} + +pub fn cd_canonical_escape_quote_test() { + let assert Ok(p) = cd.parse("form-data; name=\"\\\"quoted\\\"\"") + cd.name(p) |> should.equal(Some("\"quoted\"")) +} + +pub fn cd_vchar_escape_drops_backslash_test() { + // \X where X is VCHAR ('n' = U+006E ∈ [%x21-7E]) + // RFC 7230: \X 展開 = X (backslash drop) + let assert Ok(p) = cd.parse("form-data; name=\"\\nnewline\"") + cd.name(p) |> should.equal(Some("nnewline")) +} + +pub fn cd_nul_in_quoted_pair_rejected_test() { + // \X where X is NUL (U+0000): not VCHAR, not obs-text → invalid quoted-pair + case cd.parse("form-data; name=\"\\\u{0000}nul\"") { + Error(_) -> Nil + Ok(_) -> should.fail() + } +} + +pub fn cd_cr_in_quoted_pair_rejected_test() { + case cd.parse("form-data; name=\"\\\rcr\"") { + Error(_) -> Nil + Ok(_) -> should.fail() + } +} + +pub fn cd_lf_in_quoted_pair_rejected_test() { + case cd.parse("form-data; name=\"\\\nlf\"") { + Error(_) -> Nil + Ok(_) -> should.fail() + } +} + +// === Behavioural regression baseline === + +pub fn cd_no_escape_test() { + let assert Ok(p) = cd.parse("form-data; name=\"plain\"") + cd.name(p) |> should.equal(Some("plain")) +} + +pub fn cd_utf8_name_test() { + let assert Ok(p) = cd.parse("form-data; name=\"日本語\"") + cd.name(p) |> should.equal(Some("日本語")) +}