diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..417a09b --- /dev/null +++ b/Makefile @@ -0,0 +1,28 @@ +# Overridable: `make bench LUAJIT=/path/to/luajit LUA_CPATH='...'` +LUAJIT ?= $(shell command -v luajit 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit) +LUA_CPATH ?= ./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/openresty/luajit/lib/lua/5.1/?.so + +LIB_DIR := $(CURDIR)/target/release +LUA_ENV := LD_LIBRARY_PATH=$(LIB_DIR) LUA_CPATH='$(LUA_CPATH)' + +.PHONY: help build test lint bench clean + +help: ## Show this help + @awk 'BEGIN {FS = ":.*## "} /^[a-zA-Z_-]+:.*## / {printf " \033[36m%-10s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST) + +build: ## Build the release cdylib (target/release/libquickdecode.so) + cargo build --release + +test: build ## Run cargo tests + busted Lua tests + cargo test --release + $(LUA_ENV) busted --lua=$(LUAJIT) tests/lua --lpath='./lua/?.lua' + +lint: ## Run clippy (deny warnings) and rustfmt --check + cargo clippy --release --all-targets -- -D warnings + cargo fmt --check + +bench: build ## Run the LuaJIT vs cjson benchmark + $(LUA_ENV) $(LUAJIT) benches/lua_bench.lua + +clean: ## Remove build artifacts + cargo clean diff --git a/README.md b/README.md index 4bd9044..82309b5 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ cargo build --release # Output: target/release/libquickdecode.so ``` +A `Makefile` wraps the common workflows; run `make help` to see `build`, `test`, `lint`, `bench`, and `clean` targets. Override `LUAJIT` / `LUA_CPATH` per invocation if your environment differs from the defaults. + ## Testing ```sh @@ -67,5 +69,8 @@ Items intentionally pushed out of the first implementation. Each will be picked - **Lossless 64-bit integer mode** — return cdata `int64_t` to LuaJIT to preserve precision > 2⁵³. - **Skip-cache LRU eviction** — only if memory pressure on huge documents proves problematic in practice. - **Path-position info on Phase 1 errors** — currently only an opaque `QJD_PARSE_ERROR`. -- **AVX2 tail-bypass optimization** — current implementation falls back to whole-buffer scalar when a tail exists; could be optimized by emitting tail structural offsets directly. - **Large bench fixtures** — spec §9.3 lists `large_dump.json` (~20 MB) and `deep_nest.json` (depth stress test); not yet committed. Only `small_api.json` and `medium_resp.json` ship today. +- **`structural_mask_chunk` via shuffle-based set check** — the current AVX2 scanner does 7 `_mm256_cmpeq_epi8` + `_mm256_movemask_epi8` per chunk half (one per structural char in `{}[]:,"`). A single `_mm256_shuffle_epi8` against a 16-byte LUT plus one cmpeq can do the same set membership in 2-3 ops per half. Estimated 15-25% scanner speedup on dense-structural workloads. Not on the hot path for string-heavy payloads (those already short-circuit via the fast path). +- **Adaptive `out.reserve` in scanners** — `out.reserve(buf.len() / 6)` is calibrated for object-heavy JSON. On string-heavy multimodal payloads (one big content array, mostly base64) the actual emit rate is <1 structural per 1 KB, so we over-reserve by 100x+. Mainly a memory hygiene concern (mmap'd pages stay lazily faulted), <5% throughput effect. +- **AVX-512 scanner backend** — 64-byte → 128-byte chunks doubles scan throughput on CPUs with `avx512bw` + `vpclmulqdq` (Sapphire Rapids, Zen 4+). Estimated 2× on supporting silicon; no effect elsewhere. +- **`# Safety` docs on unsafe FFI exports** — `make lint` currently fails on 22 `missing_safety_doc` clippy warnings from the public `qjd_*` C-ABI functions. Tracked separately so the Makefile can ship with `-D warnings` already wired up. diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 17d6c7e..e75579c 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -11,6 +11,32 @@ local function read_file(p) return s end +-- Shape: a multimodal chat-completion request with one ~1.5K text question +-- and N base64-encoded image parts (each 50-500 KB) until the payload reaches +-- target_bytes. Mirrors the production case the bench is meant to reflect. +local function make_payload(target_bytes) + math.randomseed(42) + local text = string.rep("Q", 1500) + local text_part = '{"type":"text","text":"' .. text .. '"}' + local parts = { text_part } + local current = 200 + #text_part -- approx outer envelope overhead + + while current < target_bytes do + local remaining = target_bytes - current + local upper = math.min(500 * 1024, math.max(50 * 1024, remaining + 50 * 1024)) + local lower = math.min(50 * 1024, upper) + local img_size = math.random(lower, upper) + local b64 = string.rep("A", img_size) + local img_part = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,' + .. b64 .. '"}}' + parts[#parts + 1] = img_part + current = current + #img_part + 1 -- +1 for comma + end + + return '{"model":"gpt-4-vision","temperature":0.7,"messages":' + .. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}' +end + local function bench(name, iters, fn) collectgarbage("collect") local mem_before = collectgarbage("count") @@ -18,31 +44,36 @@ local function bench(name, iters, fn) for _ = 1, iters do fn() end local t1 = os.clock() local mem_after = collectgarbage("count") - print(string.format("%-44s %7.2fms total %6.2fus/op %+8.1fKB", - name, (t1 - t0) * 1000, (t1 - t0) * 1e6 / iters, + local elapsed = t1 - t0 + print(string.format("%-44s %7.2fms total %10.0f ops/s %+8.1fKB", + name, elapsed * 1000, iters / elapsed, mem_after - mem_before)) end -local fixtures = { - small = read_file("benches/fixtures/small_api.json"), - medium = read_file("benches/fixtures/medium_resp.json"), +local scenarios = { + {name = "small", iters = 5000, payload = read_file("benches/fixtures/small_api.json")}, + {name = "medium", iters = 500, payload = read_file("benches/fixtures/medium_resp.json")}, + {name = "100k", iters = 100, payload = make_payload(100 * 1024)}, + {name = "200k", iters = 50, payload = make_payload(200 * 1024)}, + {name = "500k", iters = 20, payload = make_payload(500 * 1024)}, + {name = "1m", iters = 15, payload = make_payload(1024 * 1024)}, + {name = "2m", iters = 10, payload = make_payload(2 * 1024 * 1024)}, + {name = "5m", iters = 10, payload = make_payload(5 * 1024 * 1024)}, + {name = "10m", iters = 10, payload = make_payload(10 * 1024 * 1024)}, } -local iters_for = { small = 5000, medium = 500 } - -for _, size in ipairs({"small", "medium"}) do - local payload = fixtures[size] - print(string.format("=== %s (%d bytes) ===", size, #payload)) +for _, s in ipairs(scenarios) do + print(string.format("=== %s (%d bytes) ===", s.name, #s.payload)) - bench("cjson.decode + access 3 fields", iters_for[size], function() - local obj = cjson.decode(payload) + bench("cjson.decode + access 3 fields", s.iters, function() + local obj = cjson.decode(s.payload) local _ = obj.model local _ = obj.temperature local _ = obj.messages and obj.messages[1] and obj.messages[1].role end) - bench("quickdecode.parse + access 3 fields", iters_for[size], function() - local d = qd.parse(payload) + bench("quickdecode.parse + access 3 fields", s.iters, function() + local d = qd.parse(s.payload) local _ = d:get_str("model") local _ = d:get_f64("temperature") local _ = d:get_str("messages[0].role") diff --git a/docs/superpowers/specs/2026-05-15-makefile-design.md b/docs/superpowers/specs/2026-05-15-makefile-design.md new file mode 100644 index 0000000..c0c01a5 --- /dev/null +++ b/docs/superpowers/specs/2026-05-15-makefile-design.md @@ -0,0 +1,48 @@ +# Makefile for lua-quick-decode + +Add a root-level `Makefile` that wraps the common Rust + LuaJIT workflows so contributors don't have to remember the env-var dance for bench/test. + +## Targets + +| Target | Action | +|---|---| +| `help` (default) | Print each target and its `## ` doc-comment via awk | +| `build` | `cargo build --release` (produces `target/release/libquickdecode.so`) | +| `test` | Depends on `build`. Runs `cargo test --release`, then `busted tests/lua --lua=$(LUAJIT) --lpath='./lua/?.lua'` with `LD_LIBRARY_PATH=$(CURDIR)/target/release` and `LUA_CPATH=$(LUA_CPATH)` exported | +| `lint` | `cargo clippy --release --all-targets -- -D warnings` then `cargo fmt --check` | +| `bench` | Depends on `build`. Runs `$(LUAJIT) benches/lua_bench.lua` with the same env exports as `test` | +| `clean` | `cargo clean` | + +All targets are `.PHONY`. + +## Overridable variables + +```make +LUAJIT ?= $(shell command -v luajit 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit) +LUA_CPATH ?= ./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/openresty/luajit/lib/lua/5.1/?.so +``` + +- `LUAJIT` autodetects: prefers `luajit` on `PATH` (apt/CI install), falls back to OpenResty's path (local dev box). +- `LUA_CPATH` includes OpenResty's `lualib` (where `cjson.so` lives on the local box) plus the standard LuaJIT 5.1 search paths. The default value is intentionally absolute, not appended to LuaJIT's built-in default, so the Makefile is reproducible regardless of which LuaJIT build is invoked. +- Users override per invocation: `make bench LUAJIT=/path/to/luajit`. + +## Help format + +Each target line carries a `## description` trailing comment. The `help` target greps targets with `## ` and pretty-prints `target — description` via awk. This pattern lets `help` stay in sync automatically when a new target is added. + +## Out of scope + +- **`luacheck` lint for Lua sources.** Neither the local box nor CI has it installed; adding it now would be dead code. Track in README if/when desired. +- **Separate `release` / `debug` build targets.** The repo only ships release artifacts (bench and Lua FFI tests both require release). Add later if a debug workflow appears. +- **Cross-target dependency on `test` from `bench`.** Bench depends only on `build`; running tests as part of bench would slow down iterative perf work. + +## Non-goals + +- Replacing CI. The Makefile mirrors CI commands but is not invoked by CI (CI keeps its explicit steps for cache-key clarity). +- Cross-platform. macOS/Windows are not supported; the OpenResty path defaults are Linux-specific. PRs welcome but not required for v1. + +## Failure modes (intentional, loud) + +- `busted` not installed → `test` fails with a clear `command not found`. Fix: `sudo luarocks install busted`. +- `luajit` not on PATH and OpenResty fallback missing → `bench` and `test` fail at the luajit invocation. Fix: install LuaJIT or pass `LUAJIT=...`. +- `target/release/libquickdecode.so` missing → impossible by construction; `bench` and `test` depend on `build`. diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs index 2467d08..0b9f09e 100644 --- a/src/scan/avx2.rs +++ b/src/scan/avx2.rs @@ -31,6 +31,17 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { let escaped = find_escape_mask_with_carry(backslash, &mut bs_carry); let real_quote = quote & !escaped; + // String-skip fast path: when the previous chunk left us inside a + // string and this chunk contains no unescaped quote, the entire + // chunk is string interior. No structural chars to emit and + // in_string stays 1; bs_carry was already updated above. Skip the + // 14 cmpeq / movemask ops in structural_mask_chunk plus the PCLMUL + // prefix-XOR — the dominant cost on string-heavy payloads. + if in_string != 0 && real_quote == 0 { + i += 64; + continue; + } + let (inside, new_in_string) = inside_string_mask(real_quote, in_string); in_string = new_in_string; @@ -43,17 +54,30 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { i += 64; } - // Whenever there's a tail, fall back to scalar for the whole buffer. - // This is necessary because ScalarScanner validates bracket matching against - // its own stack; a tail containing `]` or `}` that closes a bracket opened - // in the AVX2-processed prefix would cause ScalarScanner::scan on the tail - // slice to return Err, silently dropping those structural chars. - // The common case (input length is a multiple of 64) is unaffected. + // Tail (<64 bytes): continue emit-only via scalar, carrying the + // in_string / bs_carry state from the last AVX2 chunk. Bracket pairing + // is checked once at the end on the merged indices. + // + // If bs_carry == 1 the byte at position `i` is escape-targeted by the + // trailing backslash run of the prior chunk; inside a string we must + // skip it (treat as an escaped data byte, not a structural). Outside + // a string backslashes are plain characters and bs_carry has no effect. if i < buf.len() { - out.clear(); - return super::ScalarScanner::scan(buf, out); + let scalar_start = if in_string != 0 && bs_carry != 0 { + i + 1 + } else { + i + }; + if scalar_start <= buf.len() { + super::scalar::scan_emit_resume(buf, scalar_start, in_string != 0, out)?; + } else if in_string != 0 { + return Err(buf.len()); + } + } else if in_string != 0 { + return Err(buf.len()); } - Ok(()) + + super::validate_brackets(buf, out) } #[inline(always)] @@ -239,6 +263,96 @@ mod tests { parity(&buf); } + /// Exercises the string-skip fast path: a string spanning many AVX2 + /// chunks with no internal quotes. The fast-path branch must produce + /// the same emitted offsets as the slow path (which the parity check + /// against scalar implicitly verifies). + #[test] + fn long_string_engages_skip_fastpath() { + if !host_supports_avx2() { return; } + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"k\":\""); + // ~10 KB of string interior — many chunks fully inside the string. + for _ in 0..10_000 { buf.push(b'a'); } + buf.push(b'"'); + buf.push(b'}'); + // Pad to 64-aligned to also exercise the no-tail branch. + while buf.len() % 64 != 0 { buf.push(b' '); } + parity(&buf); + } + + /// String contains escaped quotes — the fast path must NOT fire when + /// `real_quote != 0` even though we may still be inside a string at + /// the chunk boundary. + #[test] + fn escaped_quotes_do_not_trip_fastpath() { + if !host_supports_avx2() { return; } + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"k\":\""); + // Embed real escape sequences periodically so quotes appear in the + // raw bytes but are escaped; real_quote should still be 0 over + // these chunks, fast path should fire. + for _ in 0..50 { + buf.extend_from_slice(b"aaaaaaaa\\\""); // 8 a's + \" + } + buf.push(b'"'); + buf.push(b'}'); + while buf.len() % 64 != 0 { buf.push(b' '); } + parity(&buf); + } + + /// AVX2 main loop + scalar tail: input length not a multiple of 64. + /// Exercises the path that used to bypass AVX2 entirely. + #[test] + fn unaligned_tail_parity() { + if !host_supports_avx2() { return; } + // Valid JSON of various non-64-aligned total lengths. + for tail_len in [1usize, 5, 17, 33, 63] { + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"key\":\""); + while buf.len() < 60 { buf.push(b'x'); } + buf.extend_from_slice(b"abc\"}"); + // buf now well-formed; pad with spaces after the closing `}` + // to land at 64 + tail_len total bytes. + let target = 64 + tail_len; + while buf.len() < target { buf.push(b' '); } + assert_eq!(buf.len(), target, "test setup"); + parity(&buf); + } + } + + /// String spans the 64-byte chunk boundary; the closing quote lives + /// in the scalar tail. Requires in_string state to carry correctly. + #[test] + fn string_crosses_avx2_boundary() { + if !host_supports_avx2() { return; } + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"k\":\""); // 6 bytes, in_string from byte 5 + while buf.len() < 80 { buf.push(b'a'); } // long string content past byte 64 + buf.push(b'"'); + buf.push(b'}'); + parity(&buf); + } + + /// Backslash at the LAST byte of the AVX2 chunk; the escaped target + /// is the FIRST byte of the scalar tail. Exercises bs_carry. + #[test] + fn backslash_at_chunk_boundary() { + if !host_supports_avx2() { return; } + // Bytes 0..63: `{"key":"` followed by 'x' padding ending with `\`. + // Byte 64 (first tail byte): an escaped `"` — must NOT close the string. + // Then real closing `"` and `}` follow. + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"key\":\""); // 8 bytes + while buf.len() < 63 { buf.push(b'x'); } // pad to 63 + buf.push(b'\\'); // byte 63: backslash + buf.push(b'"'); // byte 64: escaped quote (tail) + buf.push(b'y'); // byte 65 + buf.push(b'"'); // byte 66: real string close + buf.push(b'}'); // byte 67 + parity(&buf); + } + /// Verifies PCLMUL prefix-XOR produces correct inside-string mask /// for multiple strings in a single 64-byte chunk. #[test] diff --git a/src/scan/mod.rs b/src/scan/mod.rs index 910e539..85f9874 100644 --- a/src/scan/mod.rs +++ b/src/scan/mod.rs @@ -33,3 +33,49 @@ pub(crate) fn scan(buf: &[u8], out: &mut Vec) -> Result<(), usize> { }); f(buf, out) } + +/// Walk a sequence of already-emitted structural offsets and verify that +/// `{`/`}` and `[`/`]` are properly paired. String quotes toggle an +/// `in_string` flag and are otherwise skipped — well-formed emit paths +/// never push structural chars from inside strings, but the check is +/// defensive. +/// +/// On the first mismatch, returns `Err(offset_in_buf)`. On unmatched +/// openers at end of input, returns `Err(buf.len())`. +pub(crate) fn validate_brackets(buf: &[u8], indices: &[u32]) -> Result<(), usize> { + let mut stack: Vec = Vec::with_capacity(32); + let mut in_string = false; + + for &idx in indices { + let pos = idx as usize; + let b = buf[pos]; + + if b == b'"' { + in_string = !in_string; + continue; + } + if in_string { + continue; + } + + match b { + b'{' | b'[' => stack.push(b), + b'}' => { + if stack.pop() != Some(b'{') { + return Err(pos); + } + } + b']' => { + if stack.pop() != Some(b'[') { + return Err(pos); + } + } + _ => {} + } + } + + if !stack.is_empty() { + return Err(buf.len()); + } + Ok(()) +} diff --git a/src/scan/scalar.rs b/src/scan/scalar.rs index 73bc33b..d693ad1 100644 --- a/src/scan/scalar.rs +++ b/src/scan/scalar.rs @@ -5,66 +5,58 @@ pub struct ScalarScanner; impl Scanner for ScalarScanner { fn scan(buf: &[u8], out: &mut Vec) -> Result<(), usize> { out.reserve(buf.len() / 6); + scan_emit_resume(buf, 0, false, out)?; + super::validate_brackets(buf, out) + } +} + +/// Emit structural-character offsets for `buf[start..]`, continuing from a +/// given in-string state. Does NOT validate bracket pairing; the caller is +/// responsible for running `validate_brackets` over the emitted offsets. +/// +/// Used by `ScalarScanner::scan` (with start=0, in_str_init=false) and as +/// the unaligned-tail handler by `Avx2Scanner::scan` (with the carried +/// in-string state from the last AVX2 chunk). +pub(crate) fn scan_emit_resume( + buf: &[u8], + start: usize, + in_str_init: bool, + out: &mut Vec, +) -> Result<(), usize> { + let mut i = start; + let mut in_str = in_str_init; + + while i < buf.len() { + let b = buf[i]; - let mut i = 0usize; - let mut in_str = false; - let mut stack: Vec = Vec::with_capacity(32); - - while i < buf.len() { - let b = buf[i]; - - if in_str { - if b == b'\\' { - // Skip the escape and the next byte unconditionally. - // Anything in a string cannot be a structural char. - i += 2; - continue; - } - if b == b'"' { - in_str = false; - out.push(i as u32); - } - i += 1; + if in_str { + if b == b'\\' { + i += 2; continue; } - - match b { - b'"' => { - in_str = true; - out.push(i as u32); - } - b'{' | b'[' => { - stack.push(b); - out.push(i as u32); - } - b'}' => { - match stack.pop() { - Some(b'{') => {} - _ => return Err(i), - } - out.push(i as u32); - } - b']' => { - match stack.pop() { - Some(b'[') => {} - _ => return Err(i), - } - out.push(i as u32); - } - b',' | b':' => out.push(i as u32), - _ => {} + if b == b'"' { + in_str = false; + out.push(i as u32); } i += 1; + continue; } - if in_str { - return Err(buf.len()); - } - if !stack.is_empty() { - return Err(buf.len()); + match b { + b'"' => { + in_str = true; + out.push(i as u32); + } + b'{' | b'}' | b'[' | b']' | b',' | b':' => out.push(i as u32), + _ => {} } - Ok(()) + i += 1; + } + + if in_str { + return Err(buf.len()); } + Ok(()) } #[cfg(test)]