api7 · membphis · May 23, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/Makefile b/Makefile
@@ -34,8 +34,12 @@ test: build ## Run cargo tests + busted Lua tests
 lint: ## Run clippy with -D warnings
 	cargo clippy --release --all-targets -- -D warnings
 
-bench: build vendor/lua-cjson/cjson.so ## Run the OpenResty LuaJIT benchmark
-	$(LUA_ENV) $(RESTY) benches/lua_bench.lua
+BENCH_SCENARIOS := small medium github-100k 100k 200k 500k 1m 2m 5m 10m interleaved
+
+bench: build vendor/lua-cjson/cjson.so ## Run each scenario in a fresh LuaJIT process (stops on first failure)
+	@for s in $(BENCH_SCENARIOS); do \
+		$(LUA_ENV) $(RESTY) benches/lua_bench.lua $$s || exit $$?; \
+	done
 
 vendor/lua-cjson/cjson.so: | vendor/lua-cjson/Makefile
 ifeq ($(shell uname),Darwin)

diff --git a/README.md b/README.md
@@ -99,29 +99,36 @@ LD_LIBRARY_PATH="$PWD/target/release" \
 ## Benchmarks
 
 `qjson` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal
-chat-completion payloads, "parse + access model, temperature, and all
-messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1,
-AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload):
+chat-completion payloads (median ops/s under OpenResty LuaJIT 2.1,
+AMD EPYC Rome, Zen 2, 4 vCPUs; 5 rounds, deterministic payload).
 
-| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson |
+### Parse + access (read-only)
+
+| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access` | speedup vs. cjson (parse / decode) |
 |---:|---:|---:|---:|---:|---:|
-|   2 KB |  94,075 | 108,108 | 127,214 | 120,398 |  1.4× /  1.3× |
-|  60 KB |   9,041 |  83,043 | 123,487 | 214,500 | 13.7× / 23.7× |
-| 100 KB |   5,302 |  32,248 | 109,649 | 102,564 | 20.7× / 19.3× |
-|   1 MB |     517 |   3,538 |  16,520 |  16,988 | 32.0× / 32.9× |
-|  10 MB |      50 |     402 |   1,899 |   1,918 | 38.0× / 38.4× |
-
-`qjson.parse` wins because it skips building a Lua table for the parts you
-never read; `qjson.decode + t.field` adds a cjson-shaped table proxy on top
-with similar throughput. Memory retention for `qjson` is essentially
-flat in payload size (a few KB for the reusable buffers), while `cjson`
-and `simdjson` retain more Lua heap because they materialize the table tree.
-
-See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
-memory numbers, an "encode round-trip" row (passthrough emit via
-`memcpy`), exact environment, and the reproduction command. `make bench`
-uses `lua-resty-simdjson` when `resty.simdjson` is available in the
-OpenResty environment; otherwise it skips the simdjson rows.
+|   2 KB |  92,716 | 102,602 | 128,005 | 125,815 |  1.4× /  1.4× |
+|  60 KB |   9,007 |  82,699 | 116,198 | 219,491 | 12.9× / 24.4× |
+| 100 KB |   2,769 |  40,437 |  84,034 | 121,803 | 30.3× / 44.0× |
+|   1 MB |     512 |   4,020 |  16,056 |  15,400 | 31.4× / 30.1× |
+|  10 MB |      51 |     363 |   1,830 |   1,783 | 35.9× / 35.0× |
+
+### Encode (unmodified) + modify-then-re-encode
+
+| Size | encode (unmodified) | modify top (cjson / qjson) | modify nested (cjson / qjson) | speedup vs. cjson (top / nested) |
+|---:|---:|---:|---:|---:|
+|   2 KB | 219,925 | 59,761 /  56,909 | 61,685 /  49,798 |  1.0× /  0.8× |
+|  60 KB | 143,843 |  4,590 / **44,370** |  4,616 / **196,386** |  9.7× / 42.5× |
+| 100 KB | 119,617 |  2,645 / **32,712** |  5,263 /  **59,809** | 12.4× / 11.4× |
+|   1 MB |  16,269 |    241 /  **3,108** |    516 /  **14,134** | 12.9× / 27.4× |
+
+> **qjson.encode(unmodified)** re-emits the original byte range via `memcpy` —
+> no fields touched means zero serializer work.
+> **qjson modify+encode** materializes only the mutated subtree; unmodified
+> siblings stay on the fast path. cjson always does a full materialize +
+> re-serialize on every encode. At 60 KB+, qjson modify+encode is roughly **10–43×**
+> faster than the cjson equivalent.
+> See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
+> memory numbers, and environment.
 
 ```sh
 make bench       # qjson vs cjson and lua-resty-simdjson

diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
@@ -145,7 +145,10 @@ local ROUNDS = 5
 local function bench(name, iters, fn)
     -- Warmup pass: lets JIT compile hot traces and any one-time pools fill
     -- before measurement starts. Excluded from timing and memory delta.
-    local warmup = math.max(3, math.floor(iters / 5))
+    -- Floor at 50: LuaJIT's default hotloop threshold is 56, so a very short warmup
+    -- (e.g. 500k has iters=100, iters/5=20) can leave the first measured iterations in
+    -- interpreter mode. NOTE(review): 50 is still below 56 — confirm the floor is high enough.
+    local warmup = math.max(50, math.floor(iters / 5))
     for _ = 1, warmup do fn() end
 
     collectgarbage("collect")
@@ -220,6 +223,21 @@ local function default_table_access(t)
     end
 end
 
+local function default_table_modify_top(t)  -- "modify top" case: assign two top-level fields of a decoded document
+    t.model = "new-model"  -- applied to both cjson tables and qjson.decode results by the bench loop
+    t.temperature = 0.0
+end
+
+local function default_table_modify_add(t)  -- "add field" case: set top-level `stream`
+    t.stream = true  -- presumably absent from the fixtures, making this an insert — TODO confirm
+end
+
+local function default_table_modify_nested(t)  -- "modify nested" case: replace the first message's content, if any
+    if t.messages and qjson.len(t.messages) > 0 then  -- NOTE(review): also called on plain cjson tables; confirm qjson.len accepts them
+        t.messages[1].content = "modified"
+    end
+end
+
 -- GitHub issues accessors: array of issues, access first issue's fields
 local function github_cjson_access(obj)
     local _ = obj[1] and obj[1].id
@@ -239,15 +257,32 @@ local function github_table_access(t)
     local _ = t[1] and t[1].user and t[1].user.login
 end
 
+local function github_table_modify_top(t)  -- "modify top" case for the issues array: overwrite the first issue's title
+    if t[1] then t[1].title = "modified title" end  -- guarded like the add/nested variants: an empty array is a no-op
+end
+
+local function github_table_modify_add(t)  -- "add field" case for the issues array
+    if t[1] then  -- skip silently when there is no first issue
+        t[1].extra_field = true  -- sets `extra_field` on the first issue only
+    end
+end
+
+local function github_table_modify_nested(t)  -- "modify nested" case for the issues array
+    if t[1] and t[1].user then  -- requires both a first issue and its `user` object
+        t[1].user.login = "modified-user"  -- write two levels below the array element
+    end
+end
+
 local scenarios = {
     {name = "small",  iters = 5000, payload = read_file("benches/fixtures/small_api.json")},
     {name = "medium", iters = 500,  payload = read_file("benches/fixtures/medium_resp.json")},
     {name = "github-100k", iters = 100, payload = make_github_issues_payload(100 * 1024),
-     cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access},
+     cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access,
+     modify_top = github_table_modify_top, modify_add = github_table_modify_add, modify_nested = github_table_modify_nested},
     {name = "100k",   iters = 100,  payload = make_payload(100 * 1024)},
     {name = "200k",   iters = 50,   payload = make_payload(200 * 1024)},
-    {name = "500k",   iters = 20,   payload = make_payload(500 * 1024)},
-    {name = "1m",     iters = 15,   payload = make_payload(1024 * 1024)},
+    {name = "500k",   iters = 100,  payload = make_payload(500 * 1024)},
+    {name = "1m",     iters = 60,   payload = make_payload(1024 * 1024)},
     {name = "2m",     iters = 20,   payload = make_payload(2 * 1024 * 1024)},
     {name = "5m",     iters = 20,   payload = make_payload(5 * 1024 * 1024)},
     {name = "10m",    iters = 20,   payload = make_payload(10 * 1024 * 1024)},
@@ -258,23 +293,56 @@ local scenarios = {
 local has_pooled_api = type(qjson.new_decoder) == "function"
 local pooled_decoder = has_pooled_api and qjson.new_decoder() or nil
 
+-- Optional scenario filter: arg[1] = scenario name (e.g. "small" or "interleaved").
+-- When set, only that scenario is benchmarked; a name matching no scenario benchmarks nothing.
+-- `make bench` passes one name per process so GC/JIT state does not carry across scenarios.
+local filter = arg[1]
+
 if not simdjson then
     print("lua-resty-simdjson unavailable; skipping simdjson rows: "
         .. tostring(simdjson_or_err))
 end
 
 for _, s in ipairs(scenarios) do
+    if filter and s.name ~= filter then goto continue_scenario end
     print(string.format("=== %s (%d bytes) ===", s.name, #s.payload))
 
     local cjson_access = s.cjson_access or default_cjson_access
     local qjson_access = s.qjson_access or default_qjson_access
     local table_access = s.table_access or default_table_access
+    local modify_top = s.modify_top or default_table_modify_top
+    local modify_add = s.modify_add or default_table_modify_add
+    local modify_nested = s.modify_nested or default_table_modify_nested
 
     bench("cjson.decode + access fields", s.iters, function()
         local obj = cjson.decode(s.payload)
         cjson_access(obj)
     end)
 
+    -- cjson always fully materializes on decode, so modify+encode is the
+    -- same cost as a full re-encode — useful as a realistic baseline for
+    -- modify workloads.
+    bench("cjson.decode + modify top + encode", s.iters, function()
+        local obj = cjson.decode(s.payload)
+        modify_top(obj)
+        local _enc = cjson.encode(obj)
+        if #_enc < 2 then error("cjson.encode produced too-short result") end
+    end)
+
+    bench("cjson.decode + add field + encode", s.iters, function()
+        local obj = cjson.decode(s.payload)
+        modify_add(obj)
+        local _enc = cjson.encode(obj)
+        if #_enc < 2 then error("cjson.encode produced too-short result") end
+    end)
+
+    bench("cjson.decode + modify nested + encode", s.iters, function()
+        local obj = cjson.decode(s.payload)
+        modify_nested(obj)
+        local _enc = cjson.encode(obj)
+        if #_enc < 2 then error("cjson.encode produced too-short result") end
+    end)
+
     if simdjson then
         bench("simdjson.decode + access fields", s.iters, function()
             local obj = simdjson:decode(s.payload)
@@ -307,8 +375,31 @@ for _, s in ipairs(scenarios) do
 
     bench("qjson.decode + qjson.encode (unmodified)", s.iters, function()
         local t = qjson.decode(s.payload)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
+
+    bench("qjson.decode + modify top + encode", s.iters, function()
+        local t = qjson.decode(s.payload)
+        modify_top(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
+    end)
+
+    bench("qjson.decode + add field + encode", s.iters, function()
+        local t = qjson.decode(s.payload)
+        modify_add(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
+    end)
+
+    bench("qjson.decode + modify nested + encode", s.iters, function()
+        local t = qjson.decode(s.payload)
+        modify_nested(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
+    end)
+    ::continue_scenario::
 end
 
 -- Interleaved scenario: cycle through several payloads of different sizes
@@ -338,6 +429,8 @@ local function make_cycler(items)
     end
 end
 
+if not filter or filter == "interleaved" then
+
 print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ",")))
 
 do
@@ -384,6 +477,36 @@ do
     bench("qjson.decode + qjson.encode (unmodified)", 400, function()
         local p = next_p()
         local t = qjson.decode(p)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
+    end)
+
+    next_p = make_cycler(interleaved)
+    bench("qjson.decode + modify top + encode", 400, function()
+        local p = next_p()
+        local t = qjson.decode(p)
+        default_table_modify_top(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
+    end)
+
+    next_p = make_cycler(interleaved)
+    bench("qjson.decode + add field + encode", 400, function()
+        local p = next_p()
+        local t = qjson.decode(p)
+        default_table_modify_add(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
+    end)
+
+    next_p = make_cycler(interleaved)
+    bench("qjson.decode + modify nested + encode", 400, function()
+        local p = next_p()
+        local t = qjson.decode(p)
+        default_table_modify_nested(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 end
+
+end  -- filter == "interleaved"