openai · spatnala18 · Mar 20, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ Happy training!
 | Muon WD + 10 layer | 1.1748 | notapplica | Includes prev. wins + Spectral embed init + resid mix | 2026-03-19 | [info](records/track_10min_16mb/2026-03-19_SlidingWindow_FP16Emb_10L_MuonWD_OvertoneInit/README.md) |
 | Sliding Window Eval | 1.1925 | Matthew Li | Sliding window evaluation at stride=64, increasing context for eval | 2026-03-19 | [info](records/track_10min_16mb/2026-03-19_SlidingWindowEval/README.md) |
 | Lora TTT | 1.1928 | samacqua | Test-time training with LORAs | 2026-03-19 | [info](records/track_10min_16mb/2026-03-17_LoRA_TTT/README.md) |
-| 4k seq length| 1.2014 | Spokane Way | 4k seq length + better hypers | 2026-03-19 | [info](records/track_10min_16mb/2026-03-18_LongContextSeq2048/README.md) |
+| 4k seq length| 1.2014 | Spokane Way | 4k seq length + better hypers | 2026-03-19 | [info](records/track_10min_16mb/2026-03-19_TrainingOptSeq4096/README.md) |
 | 2048 seq length | 1.206 | Spokane Way | 2048 seq length (train + val) | 2026-03-18 | [info](records/track_10min_16mb/2026-03-18_LongContextSeq2048/README.md) |
 | int6 mixed precision | 1.2147 | Nan Liu | 10 layers, mixed int8/int6 | 2026-03-18 | [info](records/track_10min_16mb/2026-03-19_10L_MixedPrecision/README.md) |
 | fp16 Embed | 1.2197 | Renier Velazco | FP16 Tied Embedding + LR/Warmdown Tuning | 2026-03-18 | [info](records/track_10min_16mb/2026-03-18_FP16Embed_WD3600/README.md) |

diff --git a/modal_repro_longcontext.py b/modal_repro_longcontext.py
@@ -0,0 +1,220 @@
+from __future__ import annotations
+
+import json
+import os
+import re
+import subprocess
+
+import modal
+
+APP_NAME = "parameter-golf-repro-longcontext-8h100"
+REPO_REMOTE_PATH = "/workspace/parameter-golf"
+TARGET_SCRIPT = "records/track_10min_16mb/2026-03-18_LongContextSeq2048/train_gpt.py"
+
+STEP_RE = re.compile(r"step:(\d+)/(\d+).*step_avg:([0-9.]+)ms")
+CONFIG_RE = re.compile(
+    r"train_batch_tokens:(\d+)\s+train_seq_len:(\d+)\s+iterations:(\d+)\s+warmup_steps:(\d+)\s+"
+    r"max_wallclock_seconds:([0-9.]+)"
+)
+FINAL_RE = re.compile(r"final_int8_zlib_roundtrip_exact\s+val_loss:([0-9.]+)\s+val_bpb:([0-9.]+)")
+SIZE_RE = re.compile(r"Total submission size int8\+zlib:\s*([0-9]+)\s*bytes")
+STOP_RE = re.compile(r"stopping_early: wallclock_cap .* step:([0-9]+)/([0-9]+)")
+
+app = modal.App(APP_NAME)
+image = (
+    # The devel image includes the CUDA toolchain pieces that torch.compile / Triton
+    # tend to expect on tuned boxes; the runtime image is more likely to underperform.
+    modal.Image.from_registry("pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel")
+    .apt_install("build-essential")
+    .pip_install(
+        "numpy",
+        "tqdm",
+        "huggingface-hub",
+        "kernels",
+        "setuptools",
+        "typing-extensions==4.15.0",
+        "datasets",
+        "tiktoken",
+        "sentencepiece",
+    )
+    .add_local_dir(".", remote_path=REPO_REMOTE_PATH)
+)
+
+
+def _run_checked(cmd: list[str], *, env: dict[str, str] | None = None) -> None:
+    subprocess.run(cmd, check=True, env=env)
+
+
+def _print_probe(command: list[str]) -> None:
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, check=False)
+    joined = " ".join(command)
+    print(f"\n=== PROBE: {joined} ===")
+    print(result.stdout.rstrip())
+
+
+@app.function(image=image, gpu="H100:8", timeout=90 * 60, cpu=64, memory=196608)
+def run(
+    run_id: str = "modal_longcontext_8h100_repro",
+    target_script: str = TARGET_SCRIPT,
+    data_variant: str = "sp1024",
+    max_wallclock_seconds: int = 600,
+    expected_train_seq_len: int = 2048,
+    expected_max_step_avg_ms: float = 60.0,
+    gate_check_step: int = 1000,
+    enable_throughput_gate: bool = False,
+    nccl_ib_disable: int | None = None,
+    extra_env_json: str = "{}",
+) -> dict[str, object]:
+    os.chdir(REPO_REMOTE_PATH)
+
+    _run_checked(["python", "data/cached_challenge_fineweb.py", "--variant", data_variant])
+
+    env = os.environ.copy()
+    env.update(
+        {
+            "RUN_ID": run_id,
+            "DATA_PATH": f"./data/datasets/fineweb10B_{data_variant}",
+            "TOKENIZER_PATH": "./data/tokenizers/fineweb_1024_bpe.model",
+            "VOCAB_SIZE": "1024",
+            "MAX_WALLCLOCK_SECONDS": str(max_wallclock_seconds),
+            "OMP_NUM_THREADS": "1",
+            "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1",
+            "CC": "gcc",
+            "CXX": "g++",
+        }
+    )
+    if nccl_ib_disable is None:
+        env.pop("NCCL_IB_DISABLE", None)
+    else:
+        env["NCCL_IB_DISABLE"] = str(int(nccl_ib_disable))
+
+    extra_env = json.loads(extra_env_json)
+    if not isinstance(extra_env, dict):
+        raise TypeError("extra_env_json must decode to a JSON object")
+    env.update({str(k): str(v) for k, v in extra_env.items()})
+
+    _print_probe(["python", "-c", "import torch; print(torch.__version__)"])
+    _print_probe(["python", "-c", "import triton; print(triton.__version__)"])
+    _print_probe(["bash", "-lc", "command -v ptxas || true"])
+    _print_probe(["nvidia-smi", "topo", "-m"])
+    _print_probe(["python", "-c", "import os; print(os.cpu_count())"])
+
+    print("\n=== REPRO ENV ===")
+    print(
+        {
+            key: env[key]
+            for key in (
+                "RUN_ID",
+                "DATA_PATH",
+                "TOKENIZER_PATH",
+                "VOCAB_SIZE",
+                "MAX_WALLCLOCK_SECONDS",
+                "OMP_NUM_THREADS",
+                "TORCH_NCCL_ASYNC_ERROR_HANDLING",
+                "NCCL_IB_DISABLE",
+                "CC",
+                "CXX",
+            )
+            if key in env
+        }
+    )
+
+    cmd = ["torchrun", "--standalone", "--nproc_per_node=8", target_script]
+    proc = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1)
+
+    lines: list[str] = []
+    last_step = 0
+    last_step_avg_ms = 0.0
+    observed_train_seq_len: int | None = None
+    gate_checked = False
+
+    assert proc.stdout is not None
+    for line in proc.stdout:
+        print(line, end="")
+        lines.append(line)
+
+        step_match = STEP_RE.search(line)
+        if step_match:
+            last_step = int(step_match.group(1))
+            last_step_avg_ms = float(step_match.group(3))
+            if enable_throughput_gate and (not gate_checked) and last_step >= gate_check_step:
+                gate_checked = True
+                if last_step_avg_ms > expected_max_step_avg_ms:
+                    proc.terminate()
+                    try:
+                        proc.wait(timeout=30)
+                    except subprocess.TimeoutExpired:
+                        proc.kill()
+                    raise RuntimeError(
+                        "Throughput gate failed: "
+                        f"step={last_step} step_avg_ms={last_step_avg_ms:.2f} "
+                        f"threshold_ms={expected_max_step_avg_ms:.2f}"
+                    )
+
+        config_match = CONFIG_RE.search(line)
+        if config_match:
+            observed_train_seq_len = int(config_match.group(2))
+
+    rc = proc.wait()
+    if rc != 0:
+        raise RuntimeError(f"Training failed with exit code {rc}")
+
+    if observed_train_seq_len is None:
+        raise RuntimeError("Could not parse train_seq_len from training log")
+    if observed_train_seq_len != expected_train_seq_len:
+        raise RuntimeError(
+            f"Unexpected TRAIN_SEQ_LEN in log: expected {expected_train_seq_len}, got {observed_train_seq_len}"
+        )
+    if enable_throughput_gate and not gate_checked:
+        raise RuntimeError(f"Throughput gate was enabled but log never reached step {gate_check_step}")
+
+    log = "".join(lines)
+    exact = FINAL_RE.search(log)
+    size = SIZE_RE.search(log)
+    stop = STOP_RE.search(log)
+
+    out = {
+        "run_id": run_id,
+        "target_script": target_script,
+        "observed_train_seq_len": observed_train_seq_len,
+        "last_step_seen": last_step,
+        "last_step_avg_ms": last_step_avg_ms,
+        "val_loss": float(exact.group(1)) if exact else None,
+        "val_bpb": float(exact.group(2)) if exact else None,
+        "bytes_total_int8_zlib": int(size.group(1)) if size else None,
+        "steps_done": int(stop.group(1)) if stop else None,
+        "steps_target": int(stop.group(2)) if stop else None,
+        "nccl_ib_disable": env.get("NCCL_IB_DISABLE"),
+    }
+    print("\n=== REPRO SUMMARY ===")
+    print(out)
+    return out
+
+
+@app.local_entrypoint()
+def main(
+    run_id: str = "modal_longcontext_8h100_repro",
+    target_script: str = TARGET_SCRIPT,
+    data_variant: str = "sp1024",
+    max_wallclock_seconds: int = 600,
+    expected_train_seq_len: int = 2048,
+    expected_max_step_avg_ms: float = 60.0,
+    gate_check_step: int = 1000,
+    enable_throughput_gate: bool = False,
+    nccl_ib_disable: int | None = None,
+    extra_env_json: str = "{}",
+) -> None:
+    print(
+        run.remote(
+            run_id=run_id,
+            target_script=target_script,
+            data_variant=data_variant,
+            max_wallclock_seconds=max_wallclock_seconds,
+            expected_train_seq_len=expected_train_seq_len,
+            expected_max_step_avg_ms=expected_max_step_avg_ms,
+            gate_check_step=gate_check_step,
+            enable_throughput_gate=enable_throughput_gate,
+            nccl_ib_disable=nccl_ib_disable,
+            extra_env_json=extra_env_json,
+        )
+    )
diff --git a/records/track_10min_16mb/2026-03-19_WIP_PLACEHOLDER/README.md b/records/track_10min_16mb/2026-03-19_WIP_PLACEHOLDER/README.md
@@ -0,0 +1,63 @@
+# WIP Placeholder Submission
+
+This is a working submission scaffold for the 10-minute / 16MB track.
+Rename this folder later once you have final results.
+
+## Goal
+
+- Track: `track_10min_16mb`
+- Objective: improve `val_bpb` while staying under the 16,000,000-byte artifact cap
+- Budget: reproducible <= 10 minutes training on 8xH100 (SXM)
+
+## Current Status
+
+- Status: work in progress
+- Baseline script source: root `train_gpt.py` copied into this folder
+- Final metrics: pending
+
+## Planned Changes
+
+- [ ] Model/optimizer changes
+- [ ] Data/tokenizer changes (if any)
+- [ ] Eval method changes (if any)
+- [ ] Compression/export changes (if any)
+
+## Run Command (Template)
+
+```bash
+RUN_ID=wip_placeholder \
+DATA_PATH=./data/datasets/fineweb10B_sp1024 \
+TOKENIZER_PATH=./data/tokenizers/fineweb_1024_bpe.model \
+VOCAB_SIZE=1024 \
+MAX_WALLCLOCK_SECONDS=600 \
+TRAIN_LOG_EVERY=200 \
+VAL_LOSS_EVERY=1000 \
+torchrun --standalone --nproc_per_node=8 \
+  records/track_10min_16mb/2026-03-19_WIP_PLACEHOLDER/train_gpt.py | tee records/track_10min_16mb/2026-03-19_WIP_PLACEHOLDER/train.log
+```
+
+## Required Files Checklist
+
+- [x] `train_gpt.py`
+- [ ] `train.log` (generate after run)
+- [x] `README.md`
+- [x] `submission.json`
+- [ ] extra seed logs for SOTA significance (if needed)
+
+## Results (Fill In)
+
+Primary run:
+- seed: 
+- steps reached in 600s: 
+- pre-quant `val_bpb`: 
+- post-quant `val_bpb` (`final_int8_zlib_roundtrip_exact`): 
+- `Total submission size int8+zlib`: 
+
+Extra reproducibility runs (if claiming SOTA):
+- `train_seedXXXX.log`: 
+- `train_seedYYYY.log`: 
+
+## Notes
+
+- Keep logs and code in this folder so the PR is self-contained.
+- If tokenizer or dataset changes are made, include proof that `val_bpb` is computed correctly.
diff --git a/records/track_10min_16mb/2026-03-19_WIP_PLACEHOLDER/submission.json b/records/track_10min_16mb/2026-03-19_WIP_PLACEHOLDER/submission.json
@@ -0,0 +1,11 @@
+{
+  "author": "YOUR_NAME",
+  "github_id": "YOUR_GITHUB_ID",
+  "name": "WIP Placeholder Submission",
+  "blurb": "Short summary of your approach and what changed from baseline.",
+  "date": "2026-03-19",
+  "val_loss": 0.0,
+  "val_bpb": 0.0,
+  "bytes_total": 0,
+  "bytes_code": 0
+}