From 7844484d3cd31ea115efdad686245381a5bc88f3 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 16 May 2026 23:03:33 +0800 Subject: [PATCH 1/2] Add memchr2 cross-chunk jump to NEON and AVX2 scanners MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a fast-probe miss (no quote/backslash in current 64B chunk), both NEON and AVX2 scanners now call memchr::memchr2 to skip ahead to the 64B-aligned chunk containing the next interesting byte rather than advancing one chunk at a time. A 256-byte remaining-buffer threshold gates the call so short payloads never pay the libc function-call overhead; above that threshold the jump amortizes immediately. Measured on Apple M4 (NEON), "parse + access 3 fields" workload: - 2 KB (small_api.json): 648,761 ops/s — regression eliminated, flat vs. pre-jump baseline - 100 KB: 245,700 ops/s — 17.2x over cjson (+125% vs. pre-jump 108,932) - 1 MB: 34,884 ops/s — 23.7x over cjson (+193% vs. pre-jump 11,905) - 10 MB: 3,406 ops/s — 22.7x over cjson (+180% vs. pre-jump 1,218) AVX2 receives the identical change; compile-verified on aarch64; x86_64 parity is covered by CI. --- README.md | 8 ++++---- src/scan/avx2.rs | 13 +++++++++++++ src/scan/neon.rs | 15 +++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 94adb75..6944707 100644 --- a/README.md +++ b/README.md @@ -103,10 +103,10 @@ ARM64 (Apple M4, NEON/PMULL scanner, same workload): | Size | cjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson | |---:|---:|---:|---:|---:| -| 2 KB | 254,738 | 654,108 | 392,711 | 2.6× / 1.5× | -| 100 KB | 15,281 | 108,932 | 99,701 | 7.1× / 6.5× | -| 1 MB | 1,523 | 11,905 | 11,876 | 7.8× / 7.8× | -| 10 MB | 153 | 1,218 | 1,222 | 8.0× / 8.0× | +| 2 KB | 233,449 | 648,761 | 356,532 | 2.8× / 1.5× | +| 100 KB | 14,259 | 245,700 | 215,983 | 17.2× / 15.1× | +| 1 MB | 1,469 | 34,884 | 34,091 | 23.7× / 23.2× | +| 10 MB | 150 | 3,406 | 3,464 | 22.7× / 23.1× | See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, memory numbers, an "encode round-trip" row (passthrough emit via diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs index 2161f7d..5e4b1a3 100644 --- a/src/scan/avx2.rs +++ b/src/scan/avx2.rs @@ -38,6 +38,19 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { if interesting == 0 { bs_carry = 0; i += 64; + // Cross-chunk jump: no quote/backslash means in_string polarity + // cannot flip and no escape can start, so jump straight to the + // 64B-aligned chunk containing the next interesting byte. + // The 256-byte threshold amortizes memchr2 call overhead: below + // that, the AVX2 probe loop is already faster than a libc search. + if i + 256 <= buf.len() { + let scan_end = buf.len() - (buf.len() % 64); + let jump = match memchr::memchr2(b'"', b'\\', &buf[i..scan_end]) { + Some(rel) => rel & !63, + None => scan_end - i, + }; + i += jump; + } continue; } } diff --git a/src/scan/neon.rs b/src/scan/neon.rs index 91d3f43..166e19a 100644 --- a/src/scan/neon.rs +++ b/src/scan/neon.rs @@ -176,6 +176,21 @@ unsafe fn scan_neon_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { if (quote_probe | backslash_probe) == 0 { bs_carry = 0; i += 64; + // Cross-chunk jump: with no quote/backslash in the chunk we just + // skipped, in_string polarity cannot flip and no escape can start, + // so we can use memchr2 to skip ahead to the 64B-aligned chunk + // containing the next interesting byte. Bounded by the last full + // 64B chunk; the <64B tail is handled by the scalar resume path. + // The 256-byte threshold amortizes memchr2 call overhead: below + // that, the NEON probe loop is already faster than a libc search. + if i + 256 <= buf.len() { + let scan_end = buf.len() - (buf.len() % 64); + let jump = match memchr::memchr2(b'"', b'\\', &buf[i..scan_end]) { + Some(rel) => rel & !63, + None => scan_end - i, + }; + i += jump; + } continue; } } From 2fa76cb659bb0e4c7d04423cbf591c15cfa9036b Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 16 May 2026 23:23:54 +0800 Subject: [PATCH 2/2] perf(scan): bump memchr2 threshold to 4 KB to eliminate small-payload regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 256-byte threshold still fired memchr2 across most of a 2 KB document (only the last few chunks were exempt), and the libc call overhead per fast-probe miss outweighed the scanner work it replaced — net result was a ~10% regression on small_api.json under 'make bench' methodology where cjson runs first and leaves a polluted heap. Bumping the threshold to 4 KB means memchr2 is never called on payloads ≤4 KB total, restoring baseline parity. On larger payloads only the final 4 KB foregoes the jump, which is invisible against MB-scale gains. 3-run median 'qd.parse' on Apple M4 vs main: 2 KB -2% (flat, within noise) 60 KB +60% 100 KB +69% 1 MB +107% 10 MB +109% README ARM64 numbers updated to reflect the post-threshold reality. --- README.md | 8 ++++---- src/scan/avx2.rs | 9 ++++++--- src/scan/neon.rs | 9 ++++++--- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 6944707..9054aef 100644 --- a/README.md +++ b/README.md @@ -103,10 +103,10 @@ ARM64 (Apple M4, NEON/PMULL scanner, same workload): | Size | cjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson | |---:|---:|---:|---:|---:| -| 2 KB | 233,449 | 648,761 | 356,532 | 2.8× / 1.5× | -| 100 KB | 14,259 | 245,700 | 215,983 | 17.2× / 15.1× | -| 1 MB | 1,469 | 34,884 | 34,091 | 23.7× / 23.2× | -| 10 MB | 150 | 3,406 | 3,464 | 22.7× / 23.1× | +| 2 KB | 237,124 | 705,000 | 390,000 | 3.0× / 1.6× | +| 100 KB | 14,667 | 232,000 | 208,000 | 15.8× / 14.2× | +| 1 MB | 1,494 | 33,700 | 33,000 | 22.6× / 22.1× | +| 10 MB | 150 | 3,376 | 3,454 | 22.5× / 23.0× | See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, memory numbers, an "encode round-trip" row (passthrough emit via diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs index 5e4b1a3..91d7584 100644 --- a/src/scan/avx2.rs +++ b/src/scan/avx2.rs @@ -41,9 +41,12 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { // Cross-chunk jump: no quote/backslash means in_string polarity // cannot flip and no escape can start, so jump straight to the // 64B-aligned chunk containing the next interesting byte. - // The 256-byte threshold amortizes memchr2 call overhead: below - // that, the AVX2 probe loop is already faster than a libc search. - if i + 256 <= buf.len() { + // The 4 KB remaining-buffer threshold suppresses the memchr2 + // call entirely on small payloads (≤4 KB total), where the per- + // call libc overhead exceeds the in-string probe loop it would + // replace. On larger payloads only the last 4 KB foregoes the + // jump — negligible against MB-scale gains. + if i + 4096 <= buf.len() { let scan_end = buf.len() - (buf.len() % 64); let jump = match memchr::memchr2(b'"', b'\\', &buf[i..scan_end]) { Some(rel) => rel & !63, diff --git a/src/scan/neon.rs b/src/scan/neon.rs index 166e19a..7c1db0a 100644 --- a/src/scan/neon.rs +++ b/src/scan/neon.rs @@ -181,9 +181,12 @@ unsafe fn scan_neon_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { // so we can use memchr2 to skip ahead to the 64B-aligned chunk // containing the next interesting byte. Bounded by the last full // 64B chunk; the <64B tail is handled by the scalar resume path. - // The 256-byte threshold amortizes memchr2 call overhead: below - // that, the NEON probe loop is already faster than a libc search. - if i + 256 <= buf.len() { + // The 4 KB remaining-buffer threshold suppresses the memchr2 + // call entirely on small payloads (≤4 KB total), where the per- + // call libc overhead exceeds the in-string probe loop it would + // replace. On larger payloads only the last 4 KB foregoes the + // jump — negligible against MB-scale gains. + if i + 4096 <= buf.len() { let scan_end = buf.len() - (buf.len() % 64); let jump = match memchr::memchr2(b'"', b'\\', &buf[i..scan_end]) { Some(rel) => rel & !63,