From e1d25766281a22abd6485633aba1289e0f98f288 Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Wed, 29 Apr 2026 14:27:44 -0700
Subject: [PATCH 1/4] Clean up Oink benchmarks and SM103 docs
---
oink/README.md | 136 +-
oink/benchmarks/README.md | 155 +-
oink/benchmarks/benchmark/bench_utils.py | 26 +-
.../benchmark_fused_add_rmsnorm_sm100.py | 13 +-
.../benchmark/benchmark_layernorm_sm100.py | 8 +
.../benchmark/benchmark_paulius_rmsnorm.py | 242 --
.../benchmark/benchmark_rmsnorm_all.py | 325 --
.../benchmark/benchmark_rmsnorm_bwd_sm100.py | 22 +-
.../benchmark/benchmark_rmsnorm_sm100.py | 8 +
...m103_bf16_oink_vs_quack_with_layernorm.svg | 2627 +++++++++++++++++
.../benchmarks/readme/plot_quack_style_svg.py | 32 +-
oink/benchmarks/readme/run_sm100_suite.py | 117 +-
oink/benchmarks/readme/summarize_results.py | 2 +-
oink/pyproject.toml | 6 +-
oink/src/kernelagent_oink/__init__.py | 4 +-
.../blackwell/_cutedsl_cache.py | 49 +
.../blackwell/_rmsnorm_impl.py | 46 +-
.../blackwell/_rmsnorm_simple_weightonly.py | 32 +-
.../blackwell/cross_entropy.py | 20 +-
.../kernelagent_oink/blackwell/layernorm.py | 21 +-
.../blackwell/oink_custom_ops.py | 12 +-
.../src/kernelagent_oink/blackwell/softmax.py | 21 +-
22 files changed, 3111 insertions(+), 813 deletions(-)
delete mode 100644 oink/benchmarks/benchmark/benchmark_paulius_rmsnorm.py
delete mode 100644 oink/benchmarks/benchmark/benchmark_rmsnorm_all.py
create mode 100644 oink/benchmarks/media/sm103_bf16_oink_vs_quack_with_layernorm.svg
create mode 100644 oink/src/kernelagent_oink/blackwell/_cutedsl_cache.py
diff --git a/oink/README.md b/oink/README.md
index d21720a3..bcb10328 100644
--- a/oink/README.md
+++ b/oink/README.md
@@ -1,65 +1,61 @@
# KernelAgent-Oink
-KernelAgent-Oink is a small **CuTeDSL (CUTLASS DSL) kernel library** for
-**NVIDIA Blackwell (SM10x / GB200 / GB300 / B200-class)**, bundled as a lightweight
-Python package that can be used standalone or as a **vLLM general plugin**.
+KernelAgent-Oink is a lightweight **CuTeDSL (CUTLASS DSL) kernel package** for
+NVIDIA Blackwell **SM10x** GPUs. It can be used standalone or loaded as a
+**vLLM general plugin**.
-At the moment, the vLLM integration exposes the following `torch.library.custom_op`
-entrypoints under the `oink::` namespace:
+Current custom ops:
- `torch.ops.oink.rmsnorm(x, weight, eps) -> Tensor`
- `torch.ops.oink.fused_add_rms_norm(x, residual, weight, eps) -> None` (in-place)
-The package also includes additional SM100 kernels used by the benchmark suite:
-LayerNorm, Softmax (fwd+bwd), and CrossEntropy (fwd+bwd).
+The repo also contains benchmark-facing Blackwell kernels for LayerNorm, Softmax,
+and CrossEntropy.
## Requirements
-- GPU: **SM10x (Blackwell)** for the fast CuTeDSL paths. On other GPUs, Oink falls back to
- reference PyTorch implementations for correctness.
-- Python dependencies:
- - `nvidia-cutlass-dsl` (CuTeDSL)
- - `cuda-python`
- - `torch` (provided by your environment / vLLM)
+- Blackwell GPU for optimized CuTeDSL paths; other GPUs use correctness-first
+ PyTorch fallbacks.
+- `nvidia-cutlass-dsl>=4.4.2`
+- `cuda-python`
+- `torch` from the surrounding environment / vLLM
Recommended env vars:
```bash
-export CUTE_DSL_ARCH=sm_100a
export PYTORCH_ALLOC_CONF=expandable_segments:True
+export CUTE_DSL_ARCH=sm_103a # GB300 / SM103
+# export CUTE_DSL_ARCH=sm_100a # GB200/B200 / SM100
```
-On **GB300 / SM103**, prefer:
-
-```bash
-export CUTE_DSL_ARCH=sm_103a
-```
-
-## Install (editable)
+## Install
From the `KernelAgent` repo root:
```bash
pip install -e ./oink
+pip install -e "./oink[bench]" # optional benchmark/plot deps
```
-For running the in-repo benchmark suite / plots:
+A reproducible GB300 benchmark environment used for the results below:
```bash
-pip install -e "./oink[bench]"
+conda create -y -n cute python=3.12
+conda run -n cute python -m pip install --upgrade pip setuptools wheel packaging ninja
+conda run -n cute python -m pip install --upgrade --index-url https://download.pytorch.org/whl/cu130 torch
+conda run -n cute python -m pip install 'nvidia-cutlass-dsl==4.4.2' cuda-python triton matplotlib
+conda run -n cute python -m pip install -e './oink[bench]'
```
## Usage
-### vLLM (general plugin)
-
-1) Enable the plugin:
+### vLLM plugin
```bash
export VLLM_USE_OINK_RMSNORM=1
```
-2) Ensure vLLM keeps `rms_norm` as a custom op when using `torch.compile` / CUDA graphs:
+When using `torch.compile` / CUDA graphs, keep vLLM RMSNorm as a custom op:
```python
from vllm import LLM
@@ -72,12 +68,7 @@ llm = LLM(
)
```
-Without `+rms_norm`, Inductor may fuse RMSNorm into larger kernels and neither
-vLLM’s CUDA RMSNorm nor Oink will run.
-
-### Direct PyTorch usage (manual op registration)
-
-For standalone use (outside vLLM), register the custom ops once:
+### Direct PyTorch
```python
import kernelagent_oink
@@ -92,73 +83,40 @@ y = torch.ops.oink.rmsnorm(x, w, 1e-6)
## Benchmarks
-### GB200 / B200 (SM100) benchmark suite
-
-The repo includes a Quack-style benchmark suite (tables + SVG plots) to compare
-Oink against Quack and to reproduce the reported speedups. The pre-generated
-plots below were measured on **GB200 / B200-class SM100** systems.
-
-In short, Oink’s edge comes from lower pointer-path launch overhead plus Blackwell-tuned shape routing for both hot small-`M` and larger RMSNorm rows.
-
-On the current B200 forward sweep, Oink holds `1.12x` / `1.06x` geomean over Quack for same-dtype weights on the Quack-suite / DSv3 sets, and `1.18x` / `1.06x` for fp32 weights, with worst output rel-L2 `1.45e-5` (Quack `2.01e-5`).
-
-- How to run + methodology: `oink/benchmarks/README.md`
-- Pre-generated plots: `oink/benchmarks/media/`
-
-
-
-
-
-
-
-
-
-### GB300 (SM103) Q/K-norm results
+Benchmark details and commands are in [`benchmarks/README.md`](benchmarks/README.md).
+Reported numbers are correctness-gated against PyTorch references before timing.
-We also benchmarked the real Llama4x-style Q/K-norm workload on **GB300
-(SM103)** using non-contiguous `q` / `k` views produced by `qkv.split()`. This
-benchmark reports both the direct CuTeDSL/CUTLASS baseline and the optimized
-Oink path for the production strided `[M, N]` views. The CuTeDSL/CUTLASS
-baseline here is a **Q/K-norm adaptation** derived from the
-[CUTLASS CuTeDSL Blackwell RMSNorm example](https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/blackwell/rmsnorm.py),
-not the example kernel used unchanged.
+Current GB300 / SM103 setup:
-For roofline context, we also plot the same workload using a dedicated
-useful-bandwidth harness: median CUDA-event timing plus a logical IO model of
-one read + one write of the fused `[M, N]` tensor. This is the physically
-meaningful view for comparing against the measured practical GB300 BF16 stream
-roof, whereas the steady-state CUDA-graph replay medians below are better read
-as a latency view.
+- NVIDIA GB300, capability `(10, 3)`, `CUTE_DSL_ARCH=sm_103a`
+- `torch==2.11.0+cu130`, CUDA `13.0`
+- `nvidia-cutlass-dsl==4.4.2`, `cuda-python==13.2.0`
+- measured BF16 STREAM-like roof: **7.140 TB/s**
-
+
-Representative steady-state CUDA-graph replay medians from one GB300 run are
-shown below (absolute microseconds may vary slightly run to run, but the
-ranking and trend were stable).
+Quack-suite BF16 summary (`N=4096`):
-- Q path: Oink is roughly **2.4–3.1x faster** than the CuTeDSL baseline on
- representative multi-row workloads.
-- K path: Oink is roughly **2.0–3.6x faster** on the same sweep.
+| op | shapes | geomean vs Quack | large-row roofline note |
+|---|---:|---:|---|
+| RMSNorm fwd, weight=same | 19 | 1.019x | near measured roof on large rows |
+| RMSNorm fwd, weight=fp32 | 19 | 1.100x | near measured roof on large rows |
+| LayerNorm fwd | 19 | 1.241x | near measured roof on large rows |
+| Softmax fwd+bwd | 19 | 1.673x | near measured roof on large rows |
+| CrossEntropy fwd+bwd | 19 | 1.635x | mixed memory/SFU behavior |
-Takeaways from the GB300 Q/K-norm sweep:
+Historical plots remain under `benchmarks/media/`:
-- For the user-relevant multi-row workloads, Oink beats the CuTeDSL/CUTLASS
- baseline by comfortably more than 20%.
-- In the roofline view, Oink gets close to the practical GB300 BF16 streaming
- ceiling on the large-row Q/K shapes, while the CuTeDSL baseline stays much
- farther from the roof.
-- The only cases below 20% are the tiny single-row latency-floor microcases:
- Q `M=1` is ~12% faster and K `M=1` is ~6% faster.
-- Correctness spot-check from the same harness:
- - Q max diff vs eager: `0.03125`
- - K max diff vs eager: `0.007812`
+- `sm100_*`: historical SM100 / B200 runs.
+- `gb300_bf16_qk_norm_oink_vs_cutedsl_roofline.svg`: historical GB300 Q/K-norm
+ harness, separate from the Quack-suite table above.
## Links
| What | Link |
|---|---|
-| Quack (expert baseline) | https://github.com/Dao-AILab/quack |
-| KernelAgent (agentic framework) | https://github.com/meta-pytorch/KernelAgent |
-| vLLM PR (Oink RMSNorm integration) | https://github.com/vllm-project/vllm/pull/31828 |
+| Quack baseline | https://github.com/Dao-AILab/quack |
+| KernelAgent | https://github.com/meta-pytorch/KernelAgent |
+| vLLM Oink RMSNorm PR | https://github.com/vllm-project/vllm/pull/31828 |
diff --git a/oink/benchmarks/README.md b/oink/benchmarks/README.md
index 26f3c07a..c99966e1 100644
--- a/oink/benchmarks/README.md
+++ b/oink/benchmarks/README.md
@@ -1,30 +1,39 @@
-# SM100 Benchmarks (KernelAgent-Oink vs Quack)
+# Blackwell SM10x Benchmarks (KernelAgent-Oink vs Quack)
-This folder contains SM10x (GB200 / GB300 / Blackwell) microbenchmarks for the Oink
-CuTeDSL kernels vendored into KernelAgent, comparing against Quack’s SM100
-kernels where Quack provides an equivalent API.
+This folder contains SM10x (GB200 / GB300 / Blackwell) microbenchmarks for the
+Oink CuTeDSL kernels, comparing against Quack’s SM100 kernels where Quack
+provides an equivalent API.
## Prereqs
- GPU: **SM10x / Blackwell** (`torch.cuda.get_device_capability()[0] == 10`).
- Python deps in your environment:
- `torch`
- - `nvidia-cutlass-dsl` (CuTeDSL)
+ - `nvidia-cutlass-dsl>=4.4.2` (CuTeDSL)
- `cuda-python`
- `triton` (only for `triton.testing.do_bench`)
- - `quack` (optional; only needed for Oink-vs-Quack comparisons)
+ - `quack` / `quack-kernels` (optional; only needed for Oink-vs-Quack comparisons)
Recommended env vars:
```bash
export PYTORCH_ALLOC_CONF=expandable_segments:True
-export CUTE_DSL_ARCH=sm_100a
+# GB300 / SM103:
+export CUTE_DSL_ARCH=sm_103a
+# GB200/B200 / SM100 historical runs:
+# export CUTE_DSL_ARCH=sm_100a
```
-On **GB300 / SM103**, prefer:
+For the pinned GB300 / SM103 benchmark environment used to produce the current
+README numbers:
```bash
-export CUTE_DSL_ARCH=sm_103a
+conda create -y -n cute python=3.12
+conda run -n cute python -m pip install --upgrade pip setuptools wheel packaging ninja
+conda run -n cute python -m pip install --upgrade --index-url https://download.pytorch.org/whl/cu130 torch
+conda run -n cute python -m pip install 'nvidia-cutlass-dsl==4.4.2' cuda-python triton matplotlib pytest pytest-cov
+conda run -n cute python -m pip install -e '.[bench]'
+conda run -n cute python -m pip install 'git+https://github.com/Dao-AILab/quack.git' # optional comparison baseline
```
## Shape suites
@@ -34,21 +43,47 @@ export CUTE_DSL_ARCH=sm_103a
- **DeepSeek-V3-like (DSv3)**
- RMSNorm / LayerNorm / Softmax: `M ∈ {4096, 16384, 65536}`, `N ∈ {6144, 7168, 8192}`
- Cross-entropy: `M ∈ {4096, 16384, 65536}`, `N ∈ {3072, 6144, 8192, 12288}`
+- **DeepSeek-V4-Flash norm shapes (DSv4)** from `deepseek-ai/DeepSeek-V4-Flash/inference/model.py`
+ - hidden-state RMSNorm / LayerNorm: `M ∈ {4096, 16384, 65536}`, `N = 7168`
+ - q_lora RMSNorm: `M ∈ {4096, 16384, 65536}`, `N = 1536`
+ - kv latent / per-head RMSNorm: `M ∈ {4096, 16384, 65536}`, `N = 512`
## Correctness gates
-By default, each script runs a per-shape `torch.testing.assert_close` check
-vs a **pure-PyTorch reference** **before** emitting timing numbers. When Quack
-is available for that op/path, the script also validates Quack vs the *same*
+By default, each script runs a per-shape `torch.testing.assert_close` check vs a
+**pure-PyTorch reference** **before** emitting timing numbers. When Quack is
+available for that op/path, the script also validates Quack vs the *same*
reference (so speedups can’t come from looser numerics).
-Disable with `--skip-verify` only for quick smoke tests.
+Disable with `--skip-verify` only for quick smoke tests. Do not use
+`--skip-verify` for README or release performance numbers.
+
+## Roofline reporting
+
+Most benchmark JSONs include `*_hbm_frac` using `bench_utils.detect_hbm_peak_gbps()`.
+That helper is a coarse fallback (`8000 GB/s` for SM10x) so old JSONs can be
+compared consistently. For GB300/SM103 published results, use a measured roofline
+run instead.
+
+Current measured GB300 BF16 STREAM-like roof used in the README:
+
+- **7.140 TB/s** (triad, `BLOCK=2048`, `warps=8`)
+- 90% target: **6.426 TB/s**
+
+Regenerate on the current machine:
+
+```bash
+conda run -n cute bash -lc 'PYTHONNOUSERSITE=1 CUTE_DSL_ARCH=sm_103a \
+ python benchmarks/benchmark/benchmark_hbm_roofline_sm100.py --dtype bf16 --op both --gb 1 \
+ --json /tmp/oink_sm103_hbm_roofline_bf16_current.json'
+```
## Running benchmarks
-All scripts support:
+All primary scripts support:
-- `--quack-suite` or `--dsv3` (or `--configs MxN,...`)
+- `--quack-suite` or `--dsv3` (and `--dsv4` where applicable)
+- `--configs MxN,...`
- `--dtype {bf16,fp16,fp32}`
- `--iters ` and `--warmup-ms ` for kernel-only timing
- `--json ` and/or `--csv ` outputs (meta + rows)
@@ -59,100 +94,126 @@ Run the full Quack-suite + DSv3 set (Oink vs Quack) and write all JSON artifacts
to a timestamped directory:
```bash
-python oink/benchmarks/readme/run_sm100_suite.py --dtype bf16
+conda run -n cute bash -lc 'PYTHONNOUSERSITE=1 CUTE_DSL_ARCH=sm_103a \
+ python benchmarks/readme/run_sm100_suite.py --dtype bf16'
+
+# Include DeepSeek-V4-Flash norm workloads:
+conda run -n cute bash -lc 'PYTHONNOUSERSITE=1 CUTE_DSL_ARCH=sm_103a \
+ python benchmarks/readme/run_sm100_suite.py --dtype bf16 --include-dsv4 \
+ --out-dir /tmp/oink_sm103_suite_bf16_current'
```
-Turn the JSON artifacts into Markdown tables (with geomean speedups):
+Turn JSON artifacts into Markdown tables (with geomean speedups):
```bash
-python oink/benchmarks/readme/summarize_results.py --in-dir /tmp/kernelagent_oink_sm100_suite_ \
- --out /tmp/kernelagent_oink_sm100_suite_summary.md
+conda run -n cute bash -lc 'python benchmarks/readme/summarize_results.py \
+ --in-dir /tmp/oink_sm103_suite_bf16_current \
+ --out /tmp/oink_sm103_suite_bf16_current_summary.md'
```
-### Measured HBM roofline (STREAM-like)
-
-To contextualize the `*_tbps` numbers as a fraction of a *measured* bandwidth
-ceiling (rather than a theoretical spec), run:
+Generate SM103 SVGs from current JSONs and measured roofline:
```bash
-CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py --dtype bf16 --op both --gb 2 \
- --json /tmp/hbm_roofline_sm100_bf16.json
+conda run -n cute bash -lc 'python benchmarks/readme/plot_quack_style_svg.py \
+ --in-dir /tmp/oink_sm103_suite_bf16_current \
+ --suite quack_suite --include-layernorm \
+ --roofline-json /tmp/oink_sm103_hbm_roofline_bf16_current.json \
+ --arch-label "SM103 / GB300" \
+ --out benchmarks/media/sm103_bf16_oink_vs_quack_with_layernorm.svg'
+
+conda run -n cute bash -lc 'python benchmarks/readme/plot_quack_style_svg.py \
+ --in-dir /tmp/oink_sm103_suite_bf16_current \
+ --suite dsv3_all --shape-policy first \
+ --roofline-json /tmp/oink_sm103_hbm_roofline_bf16_current.json \
+ --arch-label "SM103 / GB300" \
+ --out benchmarks/media/sm103_bf16_oink_vs_quack_dsv3_all.svg'
```
+The existing `sm100_*` SVGs in `benchmarks/media/` are historical SM100/B200
+plots. Do not use them as GB300 evidence.
+
### RMSNorm forward
```bash
-python oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype fp32 --quack-suite --iters 200 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype fp32 --quack-suite --iters 200 --warmup-ms 25 \
--json /tmp/oink_rmsnorm_fwd_quack_suite.json
-python oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype fp32 --dsv3 --iters 200 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype fp32 --dsv3 --iters 200 --warmup-ms 25 \
--json /tmp/oink_rmsnorm_fwd_dsv3.json
# vLLM-style inference weights (weight dtype == activation dtype)
-python oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype same --quack-suite --iters 200 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype same --quack-suite --iters 200 --warmup-ms 25 \
--json /tmp/oink_rmsnorm_fwd_quack_suite_wsame.json
+
+# DeepSeek-V4-Flash norm grid
+python benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype same --dsv4 --iters 200 --warmup-ms 25 \
+ --json /tmp/oink_rmsnorm_fwd_dsv4_wsame.json
```
### Fused Add + RMSNorm (vLLM-style, in-place)
-This is a good "roofline case study" kernel (heavy read/write traffic, very little extra math):
+This is a good roofline case study kernel (heavy read/write traffic, very little
+extra math):
```bash
-CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py --dtype bf16 --M 65536 --N 4096 \
+CUDA_VISIBLE_DEVICES=0 python benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py --dtype bf16 --M 65536 --N 4096 \
--json /tmp/fused_add_rmsnorm_sm100_bf16.json
```
-Note on the Quack baseline: Oink exposes an **in-place** fused op (updates `x` and `residual`).
-Quack’s fused kernel produces `out` and `residual_out` out-of-place, so by default the benchmark
-times `quack::_rmsnorm_fwd` **plus** two explicit copies (`x.copy_(out)`, `residual.copy_(residual_out)`)
-to match the in-place semantics (integration-realistic). Use `--quack-baseline kernel` to time only
-the Quack fused kernel with preallocated outputs.
+Note on the Quack baseline: Oink exposes an **in-place** fused op (updates `x`
+and `residual`). Quack’s fused kernel produces `out` and `residual_out`
+out-of-place, so by default the benchmark times `quack::_rmsnorm_fwd` **plus**
+two explicit copies (`x.copy_(out)`, `residual.copy_(residual_out)`) to match the
+in-place semantics. Use `--quack-baseline kernel` to time only the Quack fused
+kernel with preallocated outputs.
### RMSNorm backward
```bash
-python oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py --dtype bf16 --weight-dtype fp32 --quack-suite --iters 100 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py --dtype bf16 --weight-dtype fp32 --quack-suite --iters 100 --warmup-ms 25 \
--csv /tmp/oink_rmsnorm_bwd_quack_suite.csv
-python oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py --dtype bf16 --weight-dtype fp32 --dsv3 --iters 100 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py --dtype bf16 --weight-dtype fp32 --dsv3 --iters 100 --warmup-ms 25 \
--csv /tmp/oink_rmsnorm_bwd_dsv3.csv
```
### Softmax (forward + backward)
```bash
-python oink/benchmarks/benchmark/benchmark_softmax_sm100.py --dtype bf16 --mode fwd_bwd --quack-suite --iters 50 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_softmax_sm100.py --dtype bf16 --mode fwd_bwd --quack-suite --iters 50 --warmup-ms 25 \
--json /tmp/oink_softmax_fwd_bwd_quack_suite.json
-python oink/benchmarks/benchmark/benchmark_softmax_sm100.py --dtype bf16 --mode fwd_bwd --dsv3 --iters 50 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_softmax_sm100.py --dtype bf16 --mode fwd_bwd --dsv3 --iters 50 --warmup-ms 25 \
--json /tmp/oink_softmax_fwd_bwd_dsv3.json
```
### Cross-entropy (forward + backward)
```bash
-python oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py --dtype bf16 --mode fwd_bwd --quack-suite --iters 50 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_cross_entropy_sm100.py --dtype bf16 --mode fwd_bwd --quack-suite --iters 50 --warmup-ms 25 \
--json /tmp/oink_cross_entropy_fwd_bwd_quack_suite.json
-python oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py --dtype bf16 --mode fwd_bwd --dsv3 --iters 50 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_cross_entropy_sm100.py --dtype bf16 --mode fwd_bwd --dsv3 --iters 50 --warmup-ms 25 \
--json /tmp/oink_cross_entropy_fwd_bwd_dsv3.json
```
### LayerNorm forward
```bash
-python oink/benchmarks/benchmark/benchmark_layernorm_sm100.py --dtype bf16 --quack-suite --iters 200 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_layernorm_sm100.py --dtype bf16 --quack-suite --iters 200 --warmup-ms 25 \
--json /tmp/oink_layernorm_fwd_quack_suite.json
-python oink/benchmarks/benchmark/benchmark_layernorm_sm100.py --dtype bf16 --dsv3 --iters 200 --warmup-ms 25 \
+python benchmarks/benchmark/benchmark_layernorm_sm100.py --dtype bf16 --dsv3 --iters 200 --warmup-ms 25 \
--json /tmp/oink_layernorm_fwd_dsv3.json
```
## Notes
- These scripts intentionally avoid importing any external Oink checkout so the
- results reflect the in-tree KernelAgent Oink kernels.
-- For RMSNorm, the `rmsnorm_with_stage2` implementation is a **fallback** that
- is only used when the pointer-based fast path cannot be used (e.g. when
- `weight.dtype != x.dtype`, or when layouts/alignments are incompatible). You
+ results reflect the in-tree KernelAgent-Oink kernels.
+- `src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py` is a compatibility
+ facade. The stage-2 scheduling policy lives in `_rmsnorm_impl.py`; keep the
+ facade for downstream imports.
+- For RMSNorm, the stage-2 path is a fallback used when the pointer-based fast
+ path cannot be used (for example when layouts/alignments are incompatible). You
can force it for A/B testing via `KERNELAGENT_OINK_FORCE_RMSNORM_STAGE2=1`.
diff --git a/oink/benchmarks/benchmark/bench_utils.py b/oink/benchmarks/benchmark/bench_utils.py
index dd1c2d6e..6ba7a3fa 100644
--- a/oink/benchmarks/benchmark/bench_utils.py
+++ b/oink/benchmarks/benchmark/bench_utils.py
@@ -93,7 +93,12 @@ def ensure_blackwell_arch_env(device: Optional[torch.device] = None) -> str:
def detect_hbm_peak_gbps(device: Optional[torch.device] = None) -> float:
- """Approximate HBM peak bandwidth in GB/s for roofline fractions."""
+ """Return a coarse fallback HBM peak in GB/s for benchmark JSON fields.
+
+ This helper is intentionally approximate. For published GB300/SM103
+ roofline reporting, prefer a measured roofline JSON from
+ ``benchmark_hbm_roofline_sm100.py`` and compute fractions against that run.
+ """
if device is None:
device = torch.device("cuda")
props = torch.cuda.get_device_properties(device)
@@ -144,6 +149,25 @@ def quack_suite_configs() -> List[Tuple[int, int, int]]:
return cfgs
+def dsv4_norm_configs() -> List[Tuple[int, int]]:
+ """Return DeepSeek-V4-Flash norm shapes from `inference/model.py`.
+
+ Source dimensions:
+ - hidden-state norm: N=7168
+ - q_lora norm: N=1536
+ - kv latent / per-head norm: N=512
+ """
+ Ms = [4096, 16384, 65536]
+ Ns = [7168, 1536, 512]
+ return [(m, n) for n in Ns for m in Ms]
+
+
+def dsv4_hidden_norm_configs() -> List[Tuple[int, int]]:
+ """Return DeepSeek-V4-Flash hidden-state norm shapes (N=7168)."""
+ Ms = [4096, 16384, 65536]
+ return [(m, 7168) for m in Ms]
+
+
def ensure_oink_src_on_path() -> None:
"""Make the in-repo KernelAgent Oink package importable without an editable install."""
here = os.path.dirname(os.path.abspath(__file__))
diff --git a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
index e9e5b22d..148c8e5d 100644
--- a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
@@ -58,6 +58,7 @@
collect_device_meta,
detect_hbm_peak_gbps,
do_bench_triton,
+ dsv4_hidden_norm_configs,
ensure_blackwell_arch_env,
error_stats_to_row,
ensure_oink_src_on_path,
@@ -310,6 +311,11 @@ def main() -> None:
action="store_true",
help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
)
+ p.add_argument(
+ "--dsv4",
+ action="store_true",
+ help="Run DSv4 hidden-state fused-add RMSNorm set: M in {4096,16384,65536}, N=7168",
+ )
p.add_argument("--warmup-ms", type=int, default=25)
p.add_argument(
"--iters", type=int, default=200, help="rep_ms for do_bench (default: 200)"
@@ -333,7 +339,12 @@ def main() -> None:
dtype = parse_dtype(args.dtype)
meta = collect_device_meta(torch.device("cuda"))
- cfgs = dsv3_configs() if bool(args.dsv3) else [(int(args.M), int(args.N))]
+ if bool(args.dsv3):
+ cfgs = dsv3_configs()
+ elif bool(args.dsv4):
+ cfgs = dsv4_hidden_norm_configs()
+ else:
+ cfgs = [(int(args.M), int(args.N))]
rows: List[Dict[str, Any]] = []
for M, N in cfgs:
print(
diff --git a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
index 7ad7f779..569d4c76 100644
--- a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
@@ -28,6 +28,7 @@
collect_device_meta,
detect_hbm_peak_gbps,
do_bench_triton,
+ dsv4_hidden_norm_configs,
ensure_blackwell_arch_env,
error_stats_to_row,
ensure_oink_src_on_path,
@@ -355,6 +356,11 @@ def main() -> None:
action="store_true",
help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
)
+ p.add_argument(
+ "--dsv4",
+ action="store_true",
+ help="Run DSv4 hidden-state LayerNorm set: M in {4096,16384,65536}, N=7168",
+ )
p.add_argument(
"--skip-verify",
action="store_true",
@@ -369,6 +375,8 @@ def main() -> None:
cfgs = [(bs * sl, hidden) for (bs, sl, hidden) in quack_suite_configs()]
elif args.dsv3:
cfgs = dsv3_configs()
+ elif args.dsv4:
+ cfgs = dsv4_hidden_norm_configs()
else:
cfgs = parse_configs(args.configs)
diff --git a/oink/benchmarks/benchmark/benchmark_paulius_rmsnorm.py b/oink/benchmarks/benchmark/benchmark_paulius_rmsnorm.py
deleted file mode 100644
index b2bf75cd..00000000
--- a/oink/benchmarks/benchmark/benchmark_paulius_rmsnorm.py
+++ /dev/null
@@ -1,242 +0,0 @@
-from __future__ import annotations
-
-import argparse
-import os
-import re
-import subprocess
-from pathlib import Path
-from typing import Any, Dict, List, Tuple
-
-import torch
-
-from bench_utils import collect_device_meta, detect_hbm_peak_gbps, write_csv, write_json
-
-
-def _bench_oink_smallm_noweight(M: int, N: int) -> float:
- import sys
-
- from triton.testing import do_bench_cudagraph
-
- repo_src = Path(__file__).resolve().parents[2] / "src"
- if str(repo_src) not in sys.path:
- sys.path.insert(0, str(repo_src))
- from kernelagent_oink.blackwell import _rmsnorm_impl as impl
-
- x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16)
- out = torch.empty_like(x)
- return float(
- do_bench_cudagraph(
- lambda: impl._rmsnorm_forward_ptr_into(
- x, None, None, None, out, None, None, 1e-6
- ),
- rep=100,
- return_mode="mean",
- )
- )
-
-
-def bytes_io_model_fwd(M: int, N: int, dtype: torch.dtype) -> int:
- elem = torch.tensor(0, dtype=dtype).element_size()
- return int(2 * M * N * elem)
-
-
-def _cuda_13_nvcc() -> Path:
- nvcc = Path("/usr/local/cuda-13.0/bin/nvcc")
- if not nvcc.is_file():
- raise FileNotFoundError(f"CUDA 13.0 nvcc not found at {nvcc}")
- return nvcc
-
-
-def _build_paulius_binary(src_dir: Path) -> Path:
- nvcc = _cuda_13_nvcc()
- out = src_dir / "r.out"
- cmd = [
- str(nvcc),
- "-arch=sm_100",
- "-Xptxas",
- "-v",
- "-O3",
- "RmsNorm.cu",
- "-I../../../",
- "-o",
- str(out),
- "-lnvidia-ml",
- ]
- env = os.environ.copy()
- env["CUDA_HOME"] = "/usr/local/cuda-13.0"
- env["PATH"] = f"/usr/local/cuda-13.0/bin:{env.get('PATH', '')}"
- subprocess.run(
- cmd,
- cwd=src_dir,
- env=env,
- check=True,
- stdout=subprocess.DEVNULL,
- stderr=subprocess.DEVNULL,
- )
- return out
-
-
-def _parse_paulius_output(text: str) -> List[Tuple[float, float]]:
- rows: List[Tuple[float, float]] = []
- pattern = re.compile(r"BF16\s+\d+:\s+([0-9.eE+-]+)\s+ms\s+([0-9.eE+-]+)\s+GB/s")
- for line in text.splitlines():
- match = pattern.search(line)
- if match is None:
- continue
- rows.append((float(match.group(1)), float(match.group(2))))
- return rows
-
-
-def _run_paulius(
- binary: Path,
- *,
- M: int,
- N: int,
- cta_dim_y: int,
- warmup_reps: int,
- timing_reps: int,
- gpu_id: int,
-) -> Tuple[float, float, Dict[str, Any]]:
- cmd = [
- str(binary),
- str(M),
- str(N),
- str(cta_dim_y),
- str(warmup_reps),
- str(timing_reps),
- str(gpu_id),
- "0",
- "5",
- "1",
- ]
- proc = subprocess.run(
- cmd,
- cwd=binary.parent,
- text=True,
- capture_output=True,
- check=True,
- )
- parsed = _parse_paulius_output(proc.stdout)
- if not parsed:
- raise RuntimeError(
- f"Failed to parse Paulius output:\n{proc.stdout}\n{proc.stderr}"
- )
- ms, gbps = min(parsed, key=lambda row: row[0])
- return ms, gbps, {"raw_stdout": proc.stdout, "raw_stderr": proc.stderr}
-
-
-def main() -> None:
- if not torch.cuda.is_available():
- raise SystemExit("CUDA not available")
-
- torch.cuda.set_device(0)
- device = torch.device("cuda")
- props = torch.cuda.get_device_properties(device)
- sm = props.major * 10 + props.minor
- print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
-
- p = argparse.ArgumentParser()
- p.add_argument(
- "--paulius-dir",
- type=str,
- default=os.path.expanduser("~/fbsource/fbcode/scripts/paulius/rmsnorm"),
- )
- p.add_argument("--gpu-id", type=int, default=0)
- p.add_argument("--warmup-reps", type=int, default=10)
- p.add_argument("--timing-reps", type=int, default=100)
- p.add_argument("--configs", type=str, default="4096x4096,65536x4096")
- p.add_argument("--csv", type=str, default=None)
- p.add_argument("--json", type=str, default=None)
- args = p.parse_args()
-
- src_dir = Path(args.paulius_dir)
- binary = _build_paulius_binary(src_dir)
-
- cfgs: List[Tuple[int, int]] = []
- for part in args.configs.split(","):
- m, n = part.lower().split("x")
- cfgs.append((int(m), int(n)))
-
- meta = collect_device_meta(device)
- hbm_peak = detect_hbm_peak_gbps(device)
- rows_out: List[Dict[str, Any]] = []
- for M, N in cfgs:
- if N != 4096:
- raise SystemExit("Paulius benchmark only supports N=4096")
- best_ms = float("inf")
- best_gbps = 0.0
- best_cta_dim_y = -1
- debug_runs: List[Dict[str, Any]] = []
- for cta_dim_y in (1, 2, 4, 8):
- ms, gbps, debug = _run_paulius(
- binary,
- M=M,
- N=N,
- cta_dim_y=cta_dim_y,
- warmup_reps=int(args.warmup_reps),
- timing_reps=int(args.timing_reps),
- gpu_id=int(args.gpu_id),
- )
- debug_runs.append({"cta_dim_y": cta_dim_y, "ms": ms, "gbps": gbps})
- if ms < best_ms:
- best_ms = ms
- best_gbps = gbps
- best_cta_dim_y = cta_dim_y
- row: Dict[str, Any] = {
- "M": M,
- "N": N,
- "dtype": "bf16",
- "paulius_ms": best_ms,
- "paulius_gbps": best_gbps,
- "paulius_tbps": best_gbps / 1000.0,
- "paulius_hbm_frac": best_gbps / hbm_peak,
- "best_cta_dim_y": best_cta_dim_y,
- "io_model_bytes": bytes_io_model_fwd(M, N, torch.bfloat16),
- "cta_dim_y_candidates": debug_runs,
- }
- if M == 4096:
- oink_ms = _bench_oink_smallm_noweight(M, N)
- oink_gbps = (
- bytes_io_model_fwd(M, N, torch.bfloat16) / (oink_ms * 1e-3) / 1e9
- )
- row.update(
- {
- "oink_kernel_ms": oink_ms,
- "oink_kernel_tbps": oink_gbps / 1000.0,
- "oink_speedup_vs_paulius": best_ms / oink_ms,
- }
- )
- rows_out.append(row)
-
- if args.csv is not None:
- write_csv(args.csv, rows_out)
- if args.json is not None:
- write_json(
- args.json,
- meta,
- rows_out,
- extra={
- "method": "Paulius CUDA benchmark binary",
- "warmup_reps": int(args.warmup_reps),
- "timing_reps": int(args.timing_reps),
- "paulius_dir": str(src_dir),
- },
- )
-
- print("\nSummary:")
- print(
- f"{'M':>14} {'N':>14} {'paulius_ms':>14} {'paulius_tbps':>14}"
- f" {'ctaDimY':>14} {'oink_ms':>14} {'oink/paulius':>14}"
- )
- for r in rows_out:
- oink_ms = float(r.get("oink_kernel_ms", float("nan")))
- speedup = float(r.get("oink_speedup_vs_paulius", float("nan")))
- print(
- f"{int(r['M']):>14} {int(r['N']):>14} {float(r['paulius_ms']):14.4f}"
- f" {float(r['paulius_tbps']):14.4f} {int(r['best_cta_dim_y']):>14}"
- f" {oink_ms:14.4f} {speedup:14.4f}"
- )
-
-
-if __name__ == "__main__":
- main()
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_all.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_all.py
deleted file mode 100644
index 6d6eb3df..00000000
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_all.py
+++ /dev/null
@@ -1,325 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Benchmark aten vs quack vs oink RMSNorm: normal dispatch + CUDA graph.
-
-All calls go through ``torch.ops.aten._fused_rms_norm``.
-Quack is registered via ``torch._native`` (quack PR pattern).
-Oink is registered via ``kernelagent_oink.register_all_kernels()``.
-
-Produces four tables:
- - Forward (normal dispatch)
- - Forward + Backward (normal dispatch)
- - Forward (CUDA graph)
- - Forward + Backward (CUDA graph)
-
-Usage::
-
- python oink/benchmarks/benchmark/benchmark_rmsnorm_all.py
-"""
-
-from __future__ import annotations
-
-import json
-import os
-import subprocess
-import sys
-import tempfile
-
-os.environ.setdefault("TORCH_NATIVE_SKIP_VERSION_CHECK", "1")
-
-
-# ---------------------------------------------------------------------------
-# Worker code: runs in a subprocess per mode to avoid cross-contamination.
-# ---------------------------------------------------------------------------
-
-WORKER_CODE = r"""
-import json, os, sys
-os.environ.setdefault("TORCH_NATIVE_SKIP_VERSION_CHECK", "1")
-
-import torch
-from triton.testing import do_bench
-
-DTYPE = torch.bfloat16
-
-def bench_normal(fn, warmup=50, rep=200):
- return do_bench(fn, warmup=warmup, rep=rep, return_mode="median")
-
-def bench_cudagraph(fn, warmup=50, rep=200):
- for _ in range(warmup):
- fn()
- torch.cuda.synchronize()
- g = torch.cuda.CUDAGraph()
- with torch.cuda.graph(g):
- fn()
- torch.cuda.synchronize()
- return do_bench(lambda: g.replay(), warmup=10, rep=rep, return_mode="median")
-
-mode = sys.argv[1]
-shapes_json = sys.argv[2]
-SHAPES = json.loads(shapes_json)
-
-if mode == "oink":
- import kernelagent_oink
- kernelagent_oink.register_all_kernels(force=True)
-
-# Warm up
-for M, N in SHAPES:
- x = torch.randn(M, N, dtype=DTYPE, device="cuda")
- w = torch.randn(N, dtype=DTYPE, device="cuda")
- torch.ops.aten._fused_rms_norm(x, [N], w, 1e-5)
-torch.cuda.synchronize()
-
-results = {}
-for M, N in SHAPES:
- x = torch.randn(M, N, dtype=DTYPE, device="cuda", requires_grad=True)
- w = torch.randn(N, dtype=DTYPE, device="cuda", requires_grad=True)
- grad = torch.randn(M, N, dtype=DTYPE, device="cuda")
-
- # Forward (normal)
- def fn_fwd(x=x, w=w, N=N):
- return torch.ops.aten._fused_rms_norm(x, [N], w, 1e-5)
- fwd_ms = bench_normal(fn_fwd)
-
- # Forward + Backward (normal)
- x_ = x.detach().requires_grad_(True)
- w_ = w.detach().requires_grad_(True)
- def fn_fwdbwd(x_=x_, w_=w_, N=N, grad=grad):
- y, _ = torch.ops.aten._fused_rms_norm(x_, [N], w_, 1e-5)
- y.backward(grad)
- fwdbwd_ms = bench_normal(fn_fwdbwd)
-
- # Forward (CUDA graph)
- x_g = torch.randn(M, N, dtype=DTYPE, device="cuda")
- w_g = torch.randn(N, dtype=DTYPE, device="cuda")
- def fn_fwd_g(x=x_g, w=w_g, N=N):
- return torch.ops.aten._fused_rms_norm(x, [N], w, 1e-5)
- try:
- fwd_graph_ms = bench_cudagraph(fn_fwd_g)
- except Exception:
- fwd_graph_ms = -1.0
-
- # Forward + Backward (CUDA graph)
- x_gb = torch.randn(M, N, dtype=DTYPE, device="cuda", requires_grad=True)
- w_gb = torch.randn(N, dtype=DTYPE, device="cuda", requires_grad=True)
- grad_gb = torch.randn(M, N, dtype=DTYPE, device="cuda")
- def fn_fwdbwd_g(x=x_gb, w=w_gb, N=N, grad=grad_gb):
- y, _ = torch.ops.aten._fused_rms_norm(x, [N], w, 1e-5)
- y.backward(grad)
- try:
- fwdbwd_graph_ms = bench_cudagraph(fn_fwdbwd_g)
- except Exception:
- fwdbwd_graph_ms = -1.0
-
- results[f"{M}x{N}"] = {
- "fwd": fwd_ms,
- "fwdbwd": fwdbwd_ms,
- "fwd_graph": fwd_graph_ms,
- "fwdbwd_graph": fwdbwd_graph_ms,
- }
-
-print(json.dumps({"mode": mode, "results": results}))
-"""
-
-
-# ---------------------------------------------------------------------------
-# Main: orchestrates subprocesses and prints tables.
-# ---------------------------------------------------------------------------
-
-SHAPES = [
- [1, 4096],
- [1, 8192],
- [32, 4096],
- [32, 8192],
- [256, 4096],
- [256, 8192],
- [1024, 4096],
- [1024, 8192],
- [4096, 4096],
- [4096, 8192],
- [16384, 4096],
- [16384, 8192],
- [65536, 4096],
- [65536, 8192],
-]
-
-COL_W = { # column widths
- "shape": 14,
- "ms": 10,
- "ratio": 8,
-}
-
-
-def find_norm_dir():
- import torch
- from pathlib import Path
-
- d = Path(torch.__file__).parent / "_native" / "ops" / "norm"
- return str(d) if d.is_dir() else None
-
-
-def run_mode(mode, norm_dir, shapes):
- init_file = os.path.join(norm_dir, "__init__.py")
-
- if mode in ("aten", "oink"):
- with open(init_file, "w") as f:
- f.write("")
- elif mode == "quack":
- with open(init_file, "w") as f:
- f.write("from . import rmsnorm_impl # noqa: F401\n")
-
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as tmp:
- tmp.write(WORKER_CODE)
- tmp_path = tmp.name
-
- try:
- result = subprocess.run(
- [sys.executable, tmp_path, mode, json.dumps(shapes)],
- capture_output=True,
- text=True,
- timeout=600,
- )
- if result.returncode != 0:
- print(f" [{mode}] FAILED: {result.stderr[-300:]}", file=sys.stderr)
- return None
- return json.loads(result.stdout.strip())["results"]
- finally:
- os.unlink(tmp_path)
-
-
-def _fmt_ms(v):
- return f"{v:>{COL_W['ms']}.4f}" if v > 0 else "FAIL".rjust(COL_W["ms"])
-
-
-def _fmt_ratio(n, d):
- if d <= 0 or n <= 0:
- return "N/A".rjust(COL_W["ratio"])
- return f"{f'{n / d:.2f}x':>{COL_W['ratio']}}"
-
-
-def print_table(title, subtitle, aten, quack, oink, key):
- sw, mw, rw = COL_W["shape"], COL_W["ms"], COL_W["ratio"]
- w = [sw, mw, mw, mw, rw, rw, rw]
-
- def hr(left, mid, right):
- return left + mid.join("─" * (c + 2) for c in w) + right
-
- hdr = (
- f"│ {'Shape (M,N)':^{sw}} "
- f"│ {'Aten (ms)':^{mw}} "
- f"│ {'Quack (ms)':^{mw}} "
- f"│ {'Oink (ms)':^{mw}} "
- f"│ {'Q/A':^{rw}} "
- f"│ {'O/A':^{rw}} "
- f"│ {'O/Q':^{rw}} │"
- )
-
- print()
- print(f" {title}")
- print(f" {subtitle}")
- print(hr("┌", "┬", "┐"))
- print(hdr)
- print(hr("├", "┼", "┤"))
-
- for shape in aten:
- M, N = shape.split("x")
- a, q, o = aten[shape][key], quack[shape][key], oink[shape][key]
- row = (
- f"│ {f'({M},{N})':>{sw}} "
- f"│ {_fmt_ms(a)} "
- f"│ {_fmt_ms(q)} "
- f"│ {_fmt_ms(o)} "
- f"│ {_fmt_ratio(a, q)} "
- f"│ {_fmt_ratio(a, o)} "
- f"│ {_fmt_ratio(q, o)} │"
- )
- print(row)
-
- print(hr("└", "┴", "┘"))
-
-
-def main():
- import torch
-
- print("=" * 72)
- print(" RMSNorm Kernel Benchmark: Aten vs Quack vs Oink")
- print("=" * 72)
- print(f" Device : {torch.cuda.get_device_name(0)}")
- print(f" Torch : {torch.__version__}")
- print(" Dtype : bfloat16")
- print(" Quack : registered via torch._native (quack PR)")
- print(" Oink : registered via kernelagent_oink.register_all_kernels()")
- print(" Bench : triton.testing.do_bench (median, 200 reps)")
-
- norm_dir = find_norm_dir()
- if norm_dir is None:
- print("ERROR: torch._native/ops/norm/ not found.", file=sys.stderr)
- sys.exit(1)
-
- print()
- print("Running aten...")
- aten = run_mode("aten", norm_dir, SHAPES)
- print("Running quack...")
- quack = run_mode("quack", norm_dir, SHAPES)
- print("Running oink...")
- oink = run_mode("oink", norm_dir, SHAPES)
-
- # Restore
- with open(os.path.join(norm_dir, "__init__.py"), "w") as f:
- f.write("from . import rmsnorm_impl # noqa: F401\n")
-
- if not all([aten, quack, oink]):
- print("ERROR: one or more modes failed.", file=sys.stderr)
- sys.exit(1)
-
- print_table(
- "Forward — Normal Dispatch",
- "Standard Python dispatch through torch.ops.aten._fused_rms_norm.",
- aten,
- quack,
- oink,
- "fwd",
- )
- print_table(
- "Forward + Backward — Normal Dispatch",
- "Fwd + autograd backward, standard Python dispatch.",
- aten,
- quack,
- oink,
- "fwdbwd",
- )
- print_table(
- "Forward — CUDA Graph (zero Python overhead)",
- "Kernel captured once, replayed without re-entering Python.",
- aten,
- quack,
- oink,
- "fwd_graph",
- )
- print_table(
- "Forward + Backward — CUDA Graph (zero Python overhead)",
- "Fwd + bwd captured once, replayed without re-entering Python.",
- aten,
- quack,
- oink,
- "fwdbwd_graph",
- )
-
- print()
- print("Done.")
-
-
-if __name__ == "__main__":
- main()
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
index e137de7c..10385972 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
@@ -18,7 +18,7 @@
import csv
import os
from dataclasses import dataclass
-from typing import List, Optional, Tuple
+from typing import List, Tuple
import torch
from triton.testing import do_bench as triton_do_bench
@@ -29,6 +29,8 @@
from bench_utils import ( # noqa: E402
ErrorStatsAccumulator,
collect_device_meta,
+ detect_hbm_peak_gbps,
+ dsv4_norm_configs,
ensure_blackwell_arch_env,
ensure_oink_src_on_path,
error_stats_to_row,
@@ -55,17 +57,6 @@
}
-def detect_hbm_peak_gbps(device: Optional[torch.device] = None) -> float:
- """Approximate HBM peak bandwidth in GB/s for roofline fractions."""
- if device is None:
- device = torch.device("cuda")
- props = torch.cuda.get_device_properties(device)
- sm = props.major * 10 + props.minor
- if sm >= 100:
- return 8000.0
- return 2000.0
-
-
@dataclass
class Result:
ms: float
@@ -360,6 +351,11 @@ def main() -> None:
action="store_true",
help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
)
+ p.add_argument(
+ "--dsv4",
+ action="store_true",
+ help="Run DSv4 norm set: M in {4096,16384,65536}, N in {7168,1536,512}",
+ )
p.add_argument(
"--skip-verify",
action="store_true",
@@ -378,6 +374,8 @@ def main() -> None:
cfgs = [(bs * sl, hidden) for (bs, sl, hidden) in quack_suite_configs()]
elif args.dsv3:
cfgs = dsv3_configs()
+ elif args.dsv4:
+ cfgs = dsv4_norm_configs()
else:
cfgs = parse_configs(args.configs)
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
index 809a4756..fccafedf 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
@@ -28,6 +28,7 @@
collect_device_meta,
detect_hbm_peak_gbps,
do_bench_triton,
+ dsv4_norm_configs,
ensure_blackwell_arch_env,
error_stats_to_row,
ensure_oink_src_on_path,
@@ -285,6 +286,11 @@ def main() -> None:
action="store_true",
help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
)
+ p.add_argument(
+ "--dsv4",
+ action="store_true",
+ help="Run DSv4 norm set: M in {4096,16384,65536}, N in {7168,1536,512}",
+ )
p.add_argument(
"--skip-verify",
action="store_true",
@@ -303,6 +309,8 @@ def main() -> None:
cfgs = [(bs * sl, hidden) for (bs, sl, hidden) in quack_suite_configs()]
elif args.dsv3:
cfgs = dsv3_configs()
+ elif args.dsv4:
+ cfgs = dsv4_norm_configs()
else:
cfgs = parse_configs(args.configs)
diff --git a/oink/benchmarks/media/sm103_bf16_oink_vs_quack_with_layernorm.svg b/oink/benchmarks/media/sm103_bf16_oink_vs_quack_with_layernorm.svg
new file mode 100644
index 00000000..0e161033
--- /dev/null
+++ b/oink/benchmarks/media/sm103_bf16_oink_vs_quack_with_layernorm.svg
@@ -0,0 +1,2627 @@
+
+
+
diff --git a/oink/benchmarks/readme/plot_quack_style_svg.py b/oink/benchmarks/readme/plot_quack_style_svg.py
index 88eebdf3..5e0224a8 100644
--- a/oink/benchmarks/readme/plot_quack_style_svg.py
+++ b/oink/benchmarks/readme/plot_quack_style_svg.py
@@ -13,8 +13,8 @@
# limitations under the License.
"""
-Generate Quack-style SVG performance plots (Oink vs Quack) from the SM100 suite
-JSON artifacts under `/tmp/kernelagent_oink_sm100_suite_{bf16,fp16}`.
+Generate Quack-style SVG performance plots (Oink vs Quack) from SM10x suite
+JSON artifacts under a suite output directory.
The intent is to match Quack's README visual style:
- 3 horizontal panels (suite-dependent):
@@ -240,12 +240,13 @@ def _plot(
label="Quack",
)
if roofline_gbps is not None:
+ roof_label = f"HBM peak (measured {roofline_gbps / 1000.0:.3f} TB/s)"
ax.axhline(
roofline_gbps,
color=COLOR_ROOF,
linewidth=3,
linestyle=(0, (4, 6)),
- label="HBM peak (measured)" if ax is axes[0] else None,
+ label=roof_label if ax is axes[0] else None,
)
max_y = max(max_y, float(roofline_gbps))
@@ -389,7 +390,19 @@ def main() -> None:
"--roofline-json",
type=str,
default=None,
- help="Optional /tmp/hbm_roofline_sm100_*.json path",
+ help="Optional measured roofline JSON path from benchmark_hbm_roofline_sm100.py",
+ )
+ p.add_argument(
+ "--roofline-gbps",
+ type=float,
+ default=None,
+ help="Optional measured roofline in GB/s (mutually exclusive with --roofline-json).",
+ )
+ p.add_argument(
+ "--arch-label",
+ type=str,
+ default="SM100",
+ help="Architecture label used in auto-generated titles, e.g. 'SM103 / GB300'.",
)
p.add_argument("--out", type=str, required=True, help="Output SVG path")
p.add_argument(
@@ -401,8 +414,12 @@ def main() -> None:
if not os.path.isdir(in_dir):
raise SystemExit(f"--in-dir is not a directory: {in_dir}")
+ if args.roofline_json is not None and args.roofline_gbps is not None:
+ raise SystemExit("Use only one of --roofline-json or --roofline-gbps.")
roofline_gbps = (
- _read_roofline_gbps(args.roofline_json) if args.roofline_json else None
+ float(args.roofline_gbps)
+ if args.roofline_gbps is not None
+ else (_read_roofline_gbps(args.roofline_json) if args.roofline_json else None)
)
panel_files = list(_panel_files_for_suite(str(args.suite)))
@@ -451,10 +468,11 @@ def main() -> None:
if (args.suite == "quack_suite" and args.include_layernorm)
else ""
)
+ arch_label = str(args.arch_label)
if args.suite == "dsv3_cross_entropy":
- title = f"SM100 {dtype.upper()} — {suite_name}{suffix}"
+ title = f"{arch_label} {dtype.upper()} — {suite_name}{suffix}"
else:
- title = f"SM100 {dtype.upper()} Kernel Benchmarks (Oink vs Quack) — {suite_name}{suffix}"
+ title = f"{arch_label} {dtype.upper()} Kernel Benchmarks (Oink vs Quack) — {suite_name}{suffix}"
_plot(
panels=panels,
diff --git a/oink/benchmarks/readme/run_sm100_suite.py b/oink/benchmarks/readme/run_sm100_suite.py
index 05bd2116..71920e38 100644
--- a/oink/benchmarks/readme/run_sm100_suite.py
+++ b/oink/benchmarks/readme/run_sm100_suite.py
@@ -56,6 +56,11 @@ def main() -> None:
action="store_true",
help="Skip correctness checks (Oink/Quack vs PyTorch / pure-PyTorch references)",
)
+ p.add_argument(
+ "--include-dsv4",
+ action="store_true",
+ help="Also run DeepSeek-V4-Flash norm workloads (RMSNorm N={7168,1536,512}; LayerNorm/fused-add N=7168).",
+ )
p.add_argument(
"--dry-run", action="store_true", help="Print commands without executing them"
)
@@ -83,7 +88,115 @@ def script(name: str) -> str:
if args.skip_verify:
common = [*common, "--skip-verify"]
- runs: List[Tuple[str, List[str]]] = [
+ runs: List[Tuple[str, List[str]]] = []
+
+ if args.include_dsv4:
+ runs.extend(
+ [
+ (
+ "rmsnorm_fwd_dsv4_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--dsv4",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_dsv4_wfp32.json"),
+ ],
+ ),
+ (
+ "rmsnorm_fwd_dsv4_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--dsv4",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_dsv4_wsame.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_dsv4_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--dsv4",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_dsv4_wfp32.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_dsv4_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--dsv4",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_dsv4_wsame.json"),
+ ],
+ ),
+ (
+ "fused_add_rmsnorm_dsv4",
+ [
+ py,
+ script("benchmark_fused_add_rmsnorm_sm100.py"),
+ *common,
+ "--dsv4",
+ "--quack-baseline",
+ "kernel_inplace",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "fused_add_rmsnorm_dsv4.json"),
+ ],
+ ),
+ (
+ "layernorm_fwd_dsv4",
+ [
+ py,
+ script("benchmark_layernorm_sm100.py"),
+ *common,
+ "--dsv4",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "layernorm_fwd_dsv4.json"),
+ ],
+ ),
+ ]
+ )
+
+ runs.extend([
(
"rmsnorm_fwd_quack_suite_wfp32",
[
@@ -336,7 +449,7 @@ def script(name: str) -> str:
os.path.join(out_dir, "layernorm_fwd_dsv3.json"),
],
),
- ]
+ ])
print(f"Writing results to: {out_dir}", flush=True)
for name, cmd in runs:
diff --git a/oink/benchmarks/readme/summarize_results.py b/oink/benchmarks/readme/summarize_results.py
index 684694d6..efc92e4e 100644
--- a/oink/benchmarks/readme/summarize_results.py
+++ b/oink/benchmarks/readme/summarize_results.py
@@ -232,7 +232,7 @@ def main() -> None:
raise SystemExit(f"No .json files found under: {in_dir}")
out_parts: List[str] = []
- out_parts.append("# KernelAgent-Oink SM100 Benchmark Summary")
+ out_parts.append("# KernelAgent-Oink SM10x Benchmark Summary")
out_parts.append("")
out_parts.append(f"Input directory: `{in_dir}`")
out_parts.append("")
diff --git a/oink/pyproject.toml b/oink/pyproject.toml
index e0f19270..9dedd92f 100644
--- a/oink/pyproject.toml
+++ b/oink/pyproject.toml
@@ -5,12 +5,12 @@ build-backend = "setuptools.build_meta"
[project]
name = "kernelagent-oink"
version = "0.1.0"
-description = "CuTeDSL kernels for Blackwell (SM100), shipped as a vLLM plugin"
+description = "CuTeDSL kernels for Blackwell SM10x (SM100-SM103), shipped as a vLLM plugin"
readme = "README.md"
requires-python = ">=3.10"
license = {text = "Apache-2.0"}
authors = [{name = "PyTorch Labs"}]
-keywords = ["cuda", "cutlass", "cute", "cutedsl", "blackwell", "sm100", "vllm"]
+keywords = ["cuda", "cutlass", "cute", "cutedsl", "blackwell", "sm100", "sm103", "gb300", "vllm"]
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
@@ -27,7 +27,7 @@ classifiers = [
# We intentionally do NOT depend on `torch` here because vLLM already pins and
# provides a compatible PyTorch build.
dependencies = [
- "nvidia-cutlass-dsl>=4.2.1",
+ "nvidia-cutlass-dsl>=4.4.2",
"cuda-python",
]
diff --git a/oink/src/kernelagent_oink/__init__.py b/oink/src/kernelagent_oink/__init__.py
index d61b60e4..4fe33b1c 100644
--- a/oink/src/kernelagent_oink/__init__.py
+++ b/oink/src/kernelagent_oink/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
"""
-KernelAgent-Oink: SM100 CuTeDSL kernels + optional vLLM plugin.
+KernelAgent-Oink: Blackwell SM10x CuTeDSL kernels + optional vLLM plugin.
This package can be loaded as a vLLM "general plugin" (entrypoint group
`vllm.general_plugins`). In that mode it registers Oink custom ops only when
@@ -135,7 +135,7 @@ def register(*, force: bool = False) -> None:
def register_all_kernels(*, force: bool = False) -> None:
"""Override aten ops with Oink's kernels.
- Checks CUDA/SM100/deps, sets up the CuTeDSL environment, then overrides
+ Checks CUDA/Blackwell SM10x/deps, sets up the CuTeDSL environment, then overrides
``aten::_fused_rms_norm`` and ``aten::_fused_rms_norm_backward`` on CUDA.
Does NOT register ``torch.ops.oink.*`` custom ops — use :func:`register`
diff --git a/oink/src/kernelagent_oink/blackwell/_cutedsl_cache.py b/oink/src/kernelagent_oink/blackwell/_cutedsl_cache.py
new file mode 100644
index 00000000..2e2536d7
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/_cutedsl_cache.py
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""CuTeDSL cache setup shared by Blackwell kernel modules.
+
+CuTeDSL cache bytecode is version-sensitive. The default global cache
+(``/tmp/$USER/cutlass_python_cache``) can be shared across environments with
+incompatible ``nvidia-cutlass-dsl`` versions, producing noisy warnings and
+lost cache reuse. Call this helper before importing ``cutlass`` in modules
+that compile CuTeDSL kernels.
+"""
+
+from __future__ import annotations
+
+import importlib.metadata
+import os
+import re
+
+
+def ensure_versioned_cutedsl_cache_dir() -> None:
+ """Set a version-scoped CuTeDSL cache directory when the user did not.
+
+ The path format intentionally matches the historical per-module logic:
+    ``$TMPDIR/$USER/cutlass_python_cache_<nvidia-cutlass-dsl version>``.
+ If ``CUTE_DSL_CACHE_DIR`` is already set, leave it untouched.
+ """
+ if "CUTE_DSL_CACHE_DIR" in os.environ:
+ return
+ try:
+ dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
+ except Exception:
+ dsl_ver = "unknown"
+ dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", dsl_ver)
+ user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
+ tmp = os.environ.get("TMPDIR") or "/tmp"
+ os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
+ tmp, user, f"cutlass_python_cache_{dsl_ver}"
+ )
diff --git a/oink/src/kernelagent_oink/blackwell/_rmsnorm_impl.py b/oink/src/kernelagent_oink/blackwell/_rmsnorm_impl.py
index cbea22d8..3af5ea3b 100644
--- a/oink/src/kernelagent_oink/blackwell/_rmsnorm_impl.py
+++ b/oink/src/kernelagent_oink/blackwell/_rmsnorm_impl.py
@@ -16,25 +16,14 @@
from __future__ import annotations
-import importlib.metadata
import os
-import re
from dataclasses import dataclass, replace
# Vendored/adapted from Quack's SM100 RMSNorm with Oink-specific B200 tuning.
-# CuTeDSL cache bytecode is version-sensitive, so isolate the default cache.
-if "CUTE_DSL_CACHE_DIR" not in os.environ:
- try:
- _dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
- except Exception:
- _dsl_ver = "unknown"
- _dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", _dsl_ver)
- _user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
- _tmp = os.environ.get("TMPDIR") or "/tmp"
- os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
- _tmp, _user, f"cutlass_python_cache_{_dsl_ver}"
- )
+from kernelagent_oink.blackwell._cutedsl_cache import ensure_versioned_cutedsl_cache_dir
+
+ensure_versioned_cutedsl_cache_dir()
try:
import cutlass # type: ignore # noqa: F401
@@ -182,7 +171,16 @@ def _resolve_forward_launch_config(
weight_dtype: type[cutlass.Numeric] | None,
aligned_tensors: tuple[Tensor | None, ...],
) -> _ForwardLaunchConfig:
- direct_gmem_default = bool(dtype.width == 16 and N in {128, 4096, 6144, 7168, 8192})
+ direct_gmem_default = bool(
+ dtype.width == 16 and N in {128, 512, 4096, 6144, 7168, 8192}
+ )
+ if (
+ dtype.width == 16
+ and N == 1536
+ and weight_dtype is not None
+ and weight_dtype.width == 16
+ ):
+ direct_gmem_default = True
if weight_dtype is not None and weight_dtype.width == 32 and N == 7168:
direct_gmem_default = False
direct_gmem = _direct_gmem_from_policy(default=direct_gmem_default)
@@ -197,7 +195,7 @@ def _resolve_forward_launch_config(
default_copy_bits = 256 if can_use_256 else 128
if dtype.width == 16 and N == 128:
default_copy_bits = 128
- if dtype.width == 16 and N == 4096:
+ if dtype.width == 16 and N in {512, 1536, 4096}:
default_copy_bits = 128
if dtype.width == 16 and weight_dtype is not None and weight_dtype.width == 32:
default_copy_bits = 128 if N == 4096 else 64
@@ -205,6 +203,12 @@ def _resolve_forward_launch_config(
copy_bits = _copy_bits_from_policy(
default=default_copy_bits, can_use_256=can_use_256
)
+ # cp.async supports at most 128 bits per instruction. The copy atom clamps
+ # async copies to 128b, so keep the TV layout's vector width in sync with the
+ # emitted copy width; otherwise shapes such as DSv4 N=1536 can leave half of
+ # each logical vector tile uninitialized.
+ if use_async and copy_bits > 128:
+ copy_bits = 128
if use_async and copy_bits < 128:
use_async = False
@@ -284,6 +288,16 @@ def _forward_launch_overrides(
nt_default: int | None = None
cluster_n_default: int | None = None
+ if (
+ dtype.width == 16
+ and weight_dtype is not None
+ and weight_dtype.width == 16
+ and N == 1536
+ and direct_gmem
+ and M >= 4096
+ ):
+ tpr_default = 32
+ nt_default = 32
if (
dtype.width == 16
and weight_dtype is not None
diff --git a/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py b/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
index 2a790f6f..d9d10734 100644
--- a/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
+++ b/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
@@ -1,8 +1,22 @@
+"""Measured same-dtype bf16 RMSNorm forward specializations.
+
+This module implements a narrow fast path for row-major bf16 tensors with a
+bf16 1D weight and no residual/bias/rstd outputs. The math is
+``y = x * rsqrt(mean(x * x) + eps) * weight`` with fp32 reduction/multiply and
+bf16 output. The shape table below is deliberately measured and narrow; shapes
+not listed fall back to the generic RMSNorm pointer path.
+"""
+
+from dataclasses import dataclass
+
import cuda.bindings.driver as cuda
import torch
-from dataclasses import dataclass
from torch import Tensor
+from kernelagent_oink.blackwell._cutedsl_cache import ensure_versioned_cutedsl_cache_dir
+
+ensure_versioned_cutedsl_cache_dir()
+
import cutlass
import cutlass.cute as cute
import cutlass.utils as utils
@@ -19,6 +33,10 @@
_COMPILED_CACHE: dict[tuple[object, int, int, int], object] = {}
_SIMPLE_WEIGHTONLY_SHAPES: dict[tuple[int, int], tuple[int, int]] = {
+ # DeepSeek-V4-Flash q_lora same-dtype RMSNorm shape. Larger M and the
+ # kv/per-head N=512 cases are faster through the generic pointer path on SM103.
+ (4096, 1536): (96, 96),
+ # DeepSeek-V3 hidden-state same-dtype RMSNorm shapes.
(4096, 6144): (192, 192),
(4096, 7168): (224, 224),
(4096, 8192): (256, 256),
@@ -66,7 +84,12 @@ def cache_key(self) -> tuple[object, int, int, int]:
)
@staticmethod
- def make_tv_layout(threads_per_row, rows_per_block, vec_size, num_vec_blocks):
+ def make_tv_layout(
+ threads_per_row: int,
+ rows_per_block: int,
+ vec_size: int,
+ num_vec_blocks: int,
+ ):
shape = ((threads_per_row, rows_per_block), (vec_size, num_vec_blocks))
stride = (
(vec_size * rows_per_block, 1),
@@ -74,7 +97,7 @@ def make_tv_layout(threads_per_row, rows_per_block, vec_size, num_vec_blocks):
)
return shape, stride
- def smem_bytes(self):
+ def smem_bytes(self) -> int:
return (
self.rows_per_block * self.cols_per_tile * (self.dtype.width // 8)
+ self.rows_per_block * self.warps_per_row * 4
@@ -210,9 +233,6 @@ def kernel(
cute.copy(copy_atom_store, tXrO, tXgO)
-def _can_use_simple_weightonly(x: Tensor, weight: Tensor, out: Tensor) -> bool:
- return _get_simple_weightonly_config(x, weight, out) is not None
-
def _get_simple_weightonly_config(
x: Tensor,
diff --git a/oink/src/kernelagent_oink/blackwell/cross_entropy.py b/oink/src/kernelagent_oink/blackwell/cross_entropy.py
index d8b37ea2..06f72084 100644
--- a/oink/src/kernelagent_oink/blackwell/cross_entropy.py
+++ b/oink/src/kernelagent_oink/blackwell/cross_entropy.py
@@ -39,10 +39,8 @@
from __future__ import annotations
-import importlib.metadata
import math
import os
-import re
from typing import Literal, Optional, Type
import torch
@@ -50,21 +48,9 @@
import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
-# CuTeDSL caches generated MLIR into a tempdir under a global default
-# (`/tmp/$USER/cutlass_python_cache`). The cache bytecode format can differ across
-# `nvidia-cutlass-dsl` versions, and cross-version cache sharing causes noisy
-# warnings (and disables cache reuse).
-if "CUTE_DSL_CACHE_DIR" not in os.environ:
- try:
- _dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
- except Exception:
- _dsl_ver = "unknown"
- _dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", _dsl_ver)
- _user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
- _tmp = os.environ.get("TMPDIR") or "/tmp"
- os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
- _tmp, _user, f"cutlass_python_cache_{_dsl_ver}"
- )
+from kernelagent_oink.blackwell._cutedsl_cache import ensure_versioned_cutedsl_cache_dir
+
+ensure_versioned_cutedsl_cache_dir()
try:
import cutlass # type: ignore # noqa: F401
diff --git a/oink/src/kernelagent_oink/blackwell/layernorm.py b/oink/src/kernelagent_oink/blackwell/layernorm.py
index ada51ecd..4e5190b7 100644
--- a/oink/src/kernelagent_oink/blackwell/layernorm.py
+++ b/oink/src/kernelagent_oink/blackwell/layernorm.py
@@ -30,10 +30,7 @@
from __future__ import annotations
-import importlib.metadata
import math
-import os
-import re
import operator
from typing import Optional, Tuple, Type
@@ -42,21 +39,9 @@
import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
-# CuTeDSL caches generated MLIR into a tempdir under a global default
-# (`/tmp/$USER/cutlass_python_cache`). The cache bytecode format can differ across
-# `nvidia-cutlass-dsl` versions, and cross-version cache sharing causes noisy
-# warnings (and disables cache reuse).
-if "CUTE_DSL_CACHE_DIR" not in os.environ:
- try:
- _dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
- except Exception:
- _dsl_ver = "unknown"
- _dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", _dsl_ver)
- _user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
- _tmp = os.environ.get("TMPDIR") or "/tmp"
- os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
- _tmp, _user, f"cutlass_python_cache_{_dsl_ver}"
- )
+from kernelagent_oink.blackwell._cutedsl_cache import ensure_versioned_cutedsl_cache_dir
+
+ensure_versioned_cutedsl_cache_dir()
try:
import cutlass # type: ignore # noqa: F401
diff --git a/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py b/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
index 282514d3..348b4f57 100644
--- a/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
+++ b/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
@@ -16,7 +16,7 @@
Torch custom ops wrapping Oink's Blackwell RMSNorm kernels.
These ops are designed to be:
-- Architecture-aware (use CuTeDSL SM100 kernels when available, fall back
+- Architecture-aware (use CuTeDSL Blackwell SM10x kernels when available, fall back
to a safe reference elsewhere).
- Layout-preserving for 2D row-major inputs, including padded MLA-style
layouts where stride(0) > N and stride(1) == 1.
@@ -69,7 +69,7 @@ def _get_rmsnorm_mod():
def _get_sm(device: torch.device | None = None) -> int:
- """Return SM version as an int (e.g., 100 for SM100 / Blackwell)."""
+ """Return SM version as an int (e.g., 103 for SM103 / Blackwell)."""
if device is None:
device = torch.device("cuda")
major, minor = torch.cuda.get_device_capability(device)
@@ -95,7 +95,7 @@ def oink_rmsnorm(
dimension stride(0) may be larger than N (padded-row layouts), and
will be preserved on the fast CuTeDSL path.
- On SM100 (and newer), this dispatches to the tuned CuTeDSL Blackwell
+ On Blackwell SM10x (SM100 and newer), this dispatches to the tuned CuTeDSL Blackwell
RMSNorm kernel in rmsnorm.rmsnorm_forward, which in turn selects the
best internal schedule (including DSv3-specific stage-2 kernels where
applicable) and preserves the input's 2D stride when using the
@@ -111,7 +111,7 @@ def oink_rmsnorm(
sm = _get_sm(x.device)
_rms = _get_rmsnorm_mod()
if sm >= 100:
- # Use the tuned CuTeDSL SM100 kernel. The public API already
+ # Use the tuned CuTeDSL Blackwell kernel. The public API already
# contains all necessary gating and layout checks internally.
y, _rstd, _res = _rms.rmsnorm_forward(
x,
@@ -186,13 +186,13 @@ def oink_fused_add_rms_norm(
_rms = _get_rmsnorm_mod()
if sm < 100:
- # Non-SM100 fallback: keep semantics in-place (correctness-first).
+ # Non-SM10x fallback: keep semantics in-place (correctness-first).
residual.add_(x)
y = _rms.rmsnorm_ref(residual, w=weight, b=None, residual=None, eps=eps)
x.copy_(y)
return None
- # SM100+: prefer the lowest-overhead in-place entrypoint (returns None).
+ # SM10x+: prefer the lowest-overhead in-place entrypoint (returns None).
if hasattr(_rms, "fused_add_rmsnorm_inplace_"):
_rms.fused_add_rmsnorm_inplace_( # type: ignore[misc]
x,
diff --git a/oink/src/kernelagent_oink/blackwell/softmax.py b/oink/src/kernelagent_oink/blackwell/softmax.py
index 394ab486..3ee93c56 100644
--- a/oink/src/kernelagent_oink/blackwell/softmax.py
+++ b/oink/src/kernelagent_oink/blackwell/softmax.py
@@ -26,9 +26,6 @@
from __future__ import annotations
-import importlib.metadata
-import os
-import re
from typing import Type
import torch
@@ -36,21 +33,9 @@
import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
-# CuTeDSL caches generated MLIR into a tempdir under a global default
-# (`/tmp/$USER/cutlass_python_cache`). The cache bytecode format can differ across
-# `nvidia-cutlass-dsl` versions, and cross-version cache sharing causes noisy
-# warnings (and disables cache reuse).
-if "CUTE_DSL_CACHE_DIR" not in os.environ:
- try:
- _dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
- except Exception:
- _dsl_ver = "unknown"
- _dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", _dsl_ver)
- _user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
- _tmp = os.environ.get("TMPDIR") or "/tmp"
- os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
- _tmp, _user, f"cutlass_python_cache_{_dsl_ver}"
- )
+from kernelagent_oink.blackwell._cutedsl_cache import ensure_versioned_cutedsl_cache_dir
+
+ensure_versioned_cutedsl_cache_dir()
try:
import cutlass # type: ignore # noqa: F401
From 88b85ffe2bd125cbfe897db80e1a4c1b132246fb Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Wed, 29 Apr 2026 14:41:33 -0700
Subject: [PATCH 2/4] Document fused RMSNorm benchmark results
---
oink/benchmarks/README.md | 56 ++++++++++++++++++++++++++++++++-------
1 file changed, 47 insertions(+), 9 deletions(-)
diff --git a/oink/benchmarks/README.md b/oink/benchmarks/README.md
index c99966e1..9685000b 100644
--- a/oink/benchmarks/README.md
+++ b/oink/benchmarks/README.md
@@ -153,19 +153,57 @@ python benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dty
### Fused Add + RMSNorm (vLLM-style, in-place)
This is a good roofline case study kernel (heavy read/write traffic, very little
-extra math):
+extra math). Oink exposes an **in-place** fused op that updates `x` and
+`residual`. Quack's fused kernel writes separate `out` and `residual_out`
+buffers, so the default benchmark baseline (`--quack-baseline kernel_inplace`)
+times Quack plus the copies needed to match Oink's in-place semantics. Use
+`--quack-baseline kernel` to time only the Quack kernel with preallocated
+outputs.
```bash
-CUDA_VISIBLE_DEVICES=0 python benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py --dtype bf16 --M 65536 --N 4096 \
- --json /tmp/fused_add_rmsnorm_sm100_bf16.json
+# DeepSeek-V3 hidden-size sweep
+PYTHONNOUSERSITE=1 CUTE_DSL_ARCH=sm_103a \
+ python benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py \
+ --dtype bf16 --dsv3 --iters 80 --warmup-ms 15 \
+ --quack-baseline kernel_inplace \
+ --json /tmp/oink_sm103_fused_add_rmsnorm_dsv3_bf16.json
+
+# DeepSeek-V4-Flash hidden-size sweep (N=7168)
+PYTHONNOUSERSITE=1 CUTE_DSL_ARCH=sm_103a \
+ python benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py \
+ --dtype bf16 --dsv4 --iters 80 --warmup-ms 15 \
+ --quack-baseline kernel_inplace \
+ --json /tmp/oink_sm103_fused_add_rmsnorm_dsv4_bf16.json
```
-Note on the Quack baseline: Oink exposes an **in-place** fused op (updates `x`
-and `residual`). Quack’s fused kernel produces `out` and `residual_out`
-out-of-place, so by default the benchmark times `quack::_rmsnorm_fwd` **plus**
-two explicit copies (`x.copy_(out)`, `residual.copy_(residual_out)`) to match the
-in-place semantics. Use `--quack-baseline kernel` to time only the Quack fused
-kernel with preallocated outputs.
+Current GB300 / SM103 BF16 results from correctness-gated runs:
+
+| suite | rows | speedup vs Quack (min / geomean / max) |
+|---|---:|---:|
+| DSv3 fused-add RMSNorm | 9 | 2.022x / 2.045x / 2.089x |
+| DSv4 fused-add RMSNorm | 3 | 2.030x / 2.192x / 2.521x |
+
+DSv3 per-shape results:
+
+| M | N | Oink ms | Quack ms | speedup | Oink TB/s |
+|---:|---:|---:|---:|---:|---:|
+| 4096 | 6144 | 0.0360 | 0.0727 | 2.022x | 5.598 |
+| 4096 | 7168 | 0.0396 | 0.0828 | 2.089x | 5.926 |
+| 4096 | 8192 | 0.0479 | 0.0993 | 2.076x | 5.610 |
+| 16384 | 6144 | 0.1206 | 0.2463 | 2.043x | 6.678 |
+| 16384 | 7168 | 0.1393 | 0.2830 | 2.031x | 6.742 |
+| 16384 | 8192 | 0.1574 | 0.3212 | 2.040x | 6.821 |
+| 65536 | 6144 | 0.4575 | 0.9285 | 2.030x | 7.041 |
+| 65536 | 7168 | 0.5329 | 1.0785 | 2.024x | 7.052 |
+| 65536 | 8192 | 0.6077 | 1.2466 | 2.052x | 7.068 |
+
+DSv4 per-shape results:
+
+| M | N | Oink ms | Quack ms | speedup | Oink TB/s |
+|---:|---:|---:|---:|---:|---:|
+| 4096 | 7168 | 0.0415 | 0.1047 | 2.521x | 5.655 |
+| 16384 | 7168 | 0.1388 | 0.2855 | 2.057x | 6.769 |
+| 65536 | 7168 | 0.5314 | 1.0785 | 2.030x | 7.072 |
### RMSNorm backward
From 8fd781906bd5e498e69474d2a89df4edbb8f217d Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Wed, 29 Apr 2026 14:56:11 -0700
Subject: [PATCH 3/4] Fix Oink ruff import-order checks
---
.../kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py | 2 ++
oink/src/kernelagent_oink/blackwell/cross_entropy.py | 2 +-
oink/src/kernelagent_oink/blackwell/layernorm.py | 1 +
oink/src/kernelagent_oink/blackwell/softmax.py | 1 +
4 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py b/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
index d9d10734..6c7d9065 100644
--- a/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
+++ b/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
@@ -6,6 +6,8 @@
bf16 output. The shape table below is deliberately measured and narrow; shapes
not listed fall back to the generic RMSNorm pointer path.
"""
+# ruff: noqa: E402 # CuTeDSL cache setup must run before importing cutlass.
+
from dataclasses import dataclass
diff --git a/oink/src/kernelagent_oink/blackwell/cross_entropy.py b/oink/src/kernelagent_oink/blackwell/cross_entropy.py
index 06f72084..5bb15685 100644
--- a/oink/src/kernelagent_oink/blackwell/cross_entropy.py
+++ b/oink/src/kernelagent_oink/blackwell/cross_entropy.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E402 # CuTeDSL cache setup must run before importing cutlass.
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,7 +41,6 @@
from __future__ import annotations
import math
-import os
from typing import Literal, Optional, Type
import torch
diff --git a/oink/src/kernelagent_oink/blackwell/layernorm.py b/oink/src/kernelagent_oink/blackwell/layernorm.py
index 4e5190b7..6b4b9c72 100644
--- a/oink/src/kernelagent_oink/blackwell/layernorm.py
+++ b/oink/src/kernelagent_oink/blackwell/layernorm.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E402 # CuTeDSL cache setup must run before importing cutlass.
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/oink/src/kernelagent_oink/blackwell/softmax.py b/oink/src/kernelagent_oink/blackwell/softmax.py
index 3ee93c56..d364c5cf 100644
--- a/oink/src/kernelagent_oink/blackwell/softmax.py
+++ b/oink/src/kernelagent_oink/blackwell/softmax.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E402 # CuTeDSL cache setup must run before importing cutlass.
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
From 956884ad07cd2fa14a636817e597b2acad23396e Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Wed, 29 Apr 2026 15:02:54 -0700
Subject: [PATCH 4/4] Apply ruff formatting to Oink files
---
oink/benchmarks/readme/run_sm100_suite.py | 510 +++++++++---------
.../blackwell/_rmsnorm_simple_weightonly.py | 2 -
2 files changed, 256 insertions(+), 256 deletions(-)
diff --git a/oink/benchmarks/readme/run_sm100_suite.py b/oink/benchmarks/readme/run_sm100_suite.py
index 71920e38..4ef1bf46 100644
--- a/oink/benchmarks/readme/run_sm100_suite.py
+++ b/oink/benchmarks/readme/run_sm100_suite.py
@@ -196,260 +196,262 @@ def script(name: str) -> str:
]
)
- runs.extend([
- (
- "rmsnorm_fwd_quack_suite_wfp32",
- [
- py,
- script("benchmark_rmsnorm_sm100.py"),
- *common,
- "--weight-dtype",
- "fp32",
- "--quack-suite",
- "--iters",
- "200",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "rmsnorm_fwd_quack_suite_wfp32.json"),
- ],
- ),
- (
- "rmsnorm_fwd_dsv3_wfp32",
- [
- py,
- script("benchmark_rmsnorm_sm100.py"),
- *common,
- "--weight-dtype",
- "fp32",
- "--dsv3",
- "--iters",
- "200",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "rmsnorm_fwd_dsv3_wfp32.json"),
- ],
- ),
- (
- "rmsnorm_bwd_quack_suite_wfp32",
- [
- py,
- script("benchmark_rmsnorm_bwd_sm100.py"),
- *common,
- "--weight-dtype",
- "fp32",
- "--quack-suite",
- "--iters",
- "100",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "rmsnorm_bwd_quack_suite_wfp32.json"),
- ],
- ),
- (
- "rmsnorm_bwd_dsv3_wfp32",
- [
- py,
- script("benchmark_rmsnorm_bwd_sm100.py"),
- *common,
- "--weight-dtype",
- "fp32",
- "--dsv3",
- "--iters",
- "100",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "rmsnorm_bwd_dsv3_wfp32.json"),
- ],
- ),
- # vLLM inference-style RMSNorm (weight dtype == activation dtype).
- (
- "rmsnorm_fwd_quack_suite_wsame",
- [
- py,
- script("benchmark_rmsnorm_sm100.py"),
- *common,
- "--weight-dtype",
- "same",
- "--quack-suite",
- "--iters",
- "200",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "rmsnorm_fwd_quack_suite_wsame.json"),
- ],
- ),
- (
- "rmsnorm_fwd_dsv3_wsame",
- [
- py,
- script("benchmark_rmsnorm_sm100.py"),
- *common,
- "--weight-dtype",
- "same",
- "--dsv3",
- "--iters",
- "200",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "rmsnorm_fwd_dsv3_wsame.json"),
- ],
- ),
- (
- "rmsnorm_bwd_quack_suite_wsame",
- [
- py,
- script("benchmark_rmsnorm_bwd_sm100.py"),
- *common,
- "--weight-dtype",
- "same",
- "--quack-suite",
- "--iters",
- "100",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "rmsnorm_bwd_quack_suite_wsame.json"),
- ],
- ),
- (
- "rmsnorm_bwd_dsv3_wsame",
- [
- py,
- script("benchmark_rmsnorm_bwd_sm100.py"),
- *common,
- "--weight-dtype",
- "same",
- "--dsv3",
- "--iters",
- "100",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "rmsnorm_bwd_dsv3_wsame.json"),
- ],
- ),
- (
- "fused_add_rmsnorm_dsv3",
- [
- py,
- script("benchmark_fused_add_rmsnorm_sm100.py"),
- *common,
- "--dsv3",
- "--quack-baseline",
- "kernel_inplace",
- "--iters",
- "200",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "fused_add_rmsnorm_dsv3.json"),
- ],
- ),
- (
- "softmax_fwd_bwd_quack_suite",
- [
- py,
- script("benchmark_softmax_sm100.py"),
- *common,
- "--mode",
- "fwd_bwd",
- "--quack-suite",
- "--iters",
- "50",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "softmax_fwd_bwd_quack_suite.json"),
- ],
- ),
- (
- "softmax_fwd_bwd_dsv3",
- [
- py,
- script("benchmark_softmax_sm100.py"),
- *common,
- "--mode",
- "fwd_bwd",
- "--dsv3",
- "--iters",
- "50",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "softmax_fwd_bwd_dsv3.json"),
- ],
- ),
- (
- "cross_entropy_fwd_bwd_quack_suite",
- [
- py,
- script("benchmark_cross_entropy_sm100.py"),
- *common,
- "--mode",
- "fwd_bwd",
- "--quack-suite",
- "--iters",
- "50",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "cross_entropy_fwd_bwd_quack_suite.json"),
- ],
- ),
- (
- "cross_entropy_fwd_bwd_dsv3",
- [
- py,
- script("benchmark_cross_entropy_sm100.py"),
- *common,
- "--mode",
- "fwd_bwd",
- "--dsv3",
- "--iters",
- "50",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "cross_entropy_fwd_bwd_dsv3.json"),
- ],
- ),
- (
- "layernorm_fwd_quack_suite",
- [
- py,
- script("benchmark_layernorm_sm100.py"),
- *common,
- "--quack-suite",
- "--iters",
- "200",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "layernorm_fwd_quack_suite.json"),
- ],
- ),
- (
- "layernorm_fwd_dsv3",
- [
- py,
- script("benchmark_layernorm_sm100.py"),
- *common,
- "--dsv3",
- "--iters",
- "200",
- "--warmup-ms",
- "25",
- "--json",
- os.path.join(out_dir, "layernorm_fwd_dsv3.json"),
- ],
- ),
- ])
+ runs.extend(
+ [
+ (
+ "rmsnorm_fwd_quack_suite_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--quack-suite",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_quack_suite_wfp32.json"),
+ ],
+ ),
+ (
+ "rmsnorm_fwd_dsv3_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--dsv3",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_dsv3_wfp32.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_quack_suite_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--quack-suite",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_quack_suite_wfp32.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_dsv3_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--dsv3",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_dsv3_wfp32.json"),
+ ],
+ ),
+ # vLLM inference-style RMSNorm (weight dtype == activation dtype).
+ (
+ "rmsnorm_fwd_quack_suite_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--quack-suite",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_quack_suite_wsame.json"),
+ ],
+ ),
+ (
+ "rmsnorm_fwd_dsv3_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--dsv3",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_dsv3_wsame.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_quack_suite_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--quack-suite",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_quack_suite_wsame.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_dsv3_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--dsv3",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_dsv3_wsame.json"),
+ ],
+ ),
+ (
+ "fused_add_rmsnorm_dsv3",
+ [
+ py,
+ script("benchmark_fused_add_rmsnorm_sm100.py"),
+ *common,
+ "--dsv3",
+ "--quack-baseline",
+ "kernel_inplace",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "fused_add_rmsnorm_dsv3.json"),
+ ],
+ ),
+ (
+ "softmax_fwd_bwd_quack_suite",
+ [
+ py,
+ script("benchmark_softmax_sm100.py"),
+ *common,
+ "--mode",
+ "fwd_bwd",
+ "--quack-suite",
+ "--iters",
+ "50",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "softmax_fwd_bwd_quack_suite.json"),
+ ],
+ ),
+ (
+ "softmax_fwd_bwd_dsv3",
+ [
+ py,
+ script("benchmark_softmax_sm100.py"),
+ *common,
+ "--mode",
+ "fwd_bwd",
+ "--dsv3",
+ "--iters",
+ "50",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "softmax_fwd_bwd_dsv3.json"),
+ ],
+ ),
+ (
+ "cross_entropy_fwd_bwd_quack_suite",
+ [
+ py,
+ script("benchmark_cross_entropy_sm100.py"),
+ *common,
+ "--mode",
+ "fwd_bwd",
+ "--quack-suite",
+ "--iters",
+ "50",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "cross_entropy_fwd_bwd_quack_suite.json"),
+ ],
+ ),
+ (
+ "cross_entropy_fwd_bwd_dsv3",
+ [
+ py,
+ script("benchmark_cross_entropy_sm100.py"),
+ *common,
+ "--mode",
+ "fwd_bwd",
+ "--dsv3",
+ "--iters",
+ "50",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "cross_entropy_fwd_bwd_dsv3.json"),
+ ],
+ ),
+ (
+ "layernorm_fwd_quack_suite",
+ [
+ py,
+ script("benchmark_layernorm_sm100.py"),
+ *common,
+ "--quack-suite",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "layernorm_fwd_quack_suite.json"),
+ ],
+ ),
+ (
+ "layernorm_fwd_dsv3",
+ [
+ py,
+ script("benchmark_layernorm_sm100.py"),
+ *common,
+ "--dsv3",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "layernorm_fwd_dsv3.json"),
+ ],
+ ),
+ ]
+ )
print(f"Writing results to: {out_dir}", flush=True)
for name, cmd in runs:
diff --git a/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py b/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
index 6c7d9065..63e50832 100644
--- a/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
+++ b/oink/src/kernelagent_oink/blackwell/_rmsnorm_simple_weightonly.py
@@ -8,7 +8,6 @@
"""
# ruff: noqa: E402 # CuTeDSL cache setup must run before importing cutlass.
-
from dataclasses import dataclass
import cuda.bindings.driver as cuda
@@ -235,7 +234,6 @@ def kernel(
cute.copy(copy_atom_store, tXrO, tXgO)
-
def _get_simple_weightonly_config(
x: Tensor,
weight: Tensor,