From 26d026140ce00146a1ed9485d1f11092e96185bb Mon Sep 17 00:00:00 2001 From: Ken M Date: Fri, 1 May 2026 04:19:35 -0400 Subject: [PATCH 1/4] docs: SemanticEngine submission design plan Covers system naming (SemanticEngine / CareSSM / ChaosSsm / SemanticOptimizer), file structure, train_gpt.py section breakdown, new chaoscontrol public/ module, training/eval prequential contract, and implementation task order. --- ...-05-01-semanticengine-submission-design.md | 190 ++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 docs/plans/2026-05-01-semanticengine-submission-design.md diff --git a/docs/plans/2026-05-01-semanticengine-submission-design.md b/docs/plans/2026-05-01-semanticengine-submission-design.md new file mode 100644 index 0000000000..7cc270b994 --- /dev/null +++ b/docs/plans/2026-05-01-semanticengine-submission-design.md @@ -0,0 +1,190 @@ +# SemanticEngine Submission Design + +**Date:** 2026-05-01 +**Track:** `track_10min_16mb` +**Submission folder:** `records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/` + +--- + +## 1. System Overview + +The submission presents **SemanticEngine** — a CareSSM trunk with live episodic memory. Unlike every other top submission (transformer-based), this is a pure SSM architecture whose memory substrate is active during both training and prequential eval. + +### Named Components + +| Name | Role | Code location | +|---|---|---| +| **SemanticEngine** | Overall system | this submission | +| **CareSSM** | SSM trunk blocks | `chaoscontrol.core`, `chaoscontrol.model` | +| **ChaosSsm** | CPU SSM controller (nice-to-have rename from `CpuSsmController*`) | `chaoscontrol.episodic.cpu_ssm_controller` | +| **Episodic memory** | CRCT evidence substrate + MultiSlotOuterModel + replay eviction pipeline | `chaoscontrol.memory`, `chaoscontrol.replay_eviction` | +| **SemanticOptimizer** | Muon with SSM-channel-coupled momentum β | `chaoscontrol.optim.muon` (via `log_a_beta_coupling=True`) | + +**Note on episodic memory:** The live memory substrate (CRCT + MultiSlotOuterModel + streaming maintenance) is architecturally compatible with any Mamba-style SSM. CareSSM is built with it in mind, not the other way around. + +**Note on SemanticOptimizer:** The concept (per-channel momentum β coupled to each channel's `log_a` decay so optimizer time constants match recurrence time constants) is implemented as the `log_a_beta_coupling` extension in the `Muon` class. The standalone `SemanticOptimizer` class in `optim/semantic.py` is the fuller future version. The submission uses `Muon(log_a_beta_coupling=True)`. + +--- + +## 2. File Structure + +### Submission folder (`records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/`) + +``` +train_gpt.py # ~700-900 lines, orchestrating driver (see §4) +requirements.txt # chaoscontrol @ git+..., torch, sentencepiece, etc. +submission.json # filled after run +README.md # filled after run +train_seed.log # filled after run (3 seeds) +tokenizers/ + fineweb_16384_bpe.model # SP16384 tokenizer, shipped in submission folder +``` + +### New chaoscontrol module (`src/chaoscontrol/public/`) + +``` +src/chaoscontrol/public/ + __init__.py + engine_entry.py # init_arm_topology(), run_training(), build_artifact(), run_eval() +``` + +`public/` is the name: it signals this is the stable public-facing interface, not internal experiment scaffolding. + +All heavy machinery (distributed loop, CRCT, replay eviction topology, GPTQ, prequential eval) stays in existing chaoscontrol modules. 
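As a sketch, the call surface that hides that machinery (function names from the `public/` listing above; argument shapes illustrative, see §5 for the real contracts):

```python
# Illustrative only: names from the public/ listing above, argument shapes
# assumed. The authoritative signatures are specified in section 5.
from chaoscontrol.public import engine_entry

role = engine_entry.init_arm_topology(world_size)               # GPU role routing
result = engine_entry.run_training(model, optimizer, data, cfg) # thin training wrapper
artifact = engine_entry.build_artifact(model, cfg)              # GPTQ int6 + LZMA
scores = engine_entry.run_eval(artifact, val_cache, cfg)        # prequential BPB
```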
`engine_entry.py` (~150–200 lines) connects them under a stable interface that `train_gpt.py` calls. + +--- + +## 3. Data and Dependencies + +### Data + +- **Tokenizer:** SP16384 (`fineweb_16384_bpe.model`, 455 KB), shipped inside the submission folder +- **Train/val shards:** `Natooka/parameter-golf-sp-tokenizers` on HuggingFace — 133 train shards (~25 GB) + 1 val shard (~84 MB, 42,266,034 tokens) +- **ValCache:** Pre-built from the first 50,000 validation documents; used by the prequential eval. Built via `scripts/build_exp20_val_cache.py` on pod setup. + +### Native extensions (must be built before running) + +| Extension | Purpose | +|---|---| +| `_lm_head_loss` | Fused chunked LM head backward (8× VRAM reduction at V=16384) | +| `_cpu_ssm_controller` | ChaosSsm CPU controller (C++ with optional CUDA write-event pack) | +| `_ssm_scan` | Chunked parallel SSM scan CUDA kernel | + +Built via `scripts/pod_build_native_extensions.sh`. The full pod setup (CUDA 13 + TE 2.13 + extensions + data) runs via `scripts/pod_bootstrap.sh`. + +### Requirements + +- PyTorch 2.11.0+cu130 (CUDA 13) +- TransformerEngine 2.13.0 +- `chaoscontrol @ git+https://github.com/KenMalloy/chaoscontrol.git` +- `sentencepiece`, `huggingface-hub`, `numpy` + +No network calls inside `train_gpt.py` during training or eval. The `chaoscontrol` package is pip-installed before the script runs. + +--- + +## 4. `train_gpt.py` Internal Structure + +Entry point: `torchrun --standalone --nproc_per_node=8 train_gpt.py` +All config via env vars. Matches the interface of every other submission. + +### Section 1 — Hyperparameters (heavily commented, ~100 lines) + +An env-var-configurable class. Comments explain the architectural motivation for each setting, not just the value. + +Key groups: +- **Paths:** `DATA_PATH`, `VAL_CACHE_DIR`, `TOKENIZER_PATH` +- **Model:** `model_dim=384` (artifact-safe at int6/LZMA; next size up at 416 is 15.19 MB, 448 exceeds budget), `ssm_delta_rank=32` +- **CRCT:** `crct_memory_write_tokens_per_step=32`, `crct_target_read_rate=0.25`, `crct_target_write_rate=0.10`, `outer_max_slots=4096`, and the full locked CRCT config from `exp26._crct_lock()` +- **Replay eviction:** `replay_eviction_memory_streams=8`, `replay_eviction_commit_policy="learned"`, and the full pipeline config from `exp26._replay_eviction_pipeline_lock()` +- **Fast/slow:** `fast_slow_alpha=0.25`, `fast_slow_eval_copy="slow"`, controller settings from `exp26._fast_slow_lock()` +- **Training:** `BUDGET_SECONDS=600`, `WARMUP_STEPS=20`, warmdown schedule, `GRAD_CLIP_NORM` +- **Optimizer:** SemanticOptimizer flags — `log_a_beta_coupling=True`, `log_a_beta_ema=0.99`, `log_a_beta_min=0.5`; Muon for matrix params, AdamW fallback for embeddings/scalars +- **Quantization:** GPTQ int6 for matrices, int7 for tied embeddings +- **Eval:** `CHUNK_TOKENS`, `WRITE_TOKENS_PER_CHUNK`, `DECAY` for `packet_online_cache` + +### Section 2 — `main()` (heavily commented, ~600-800 lines) + +Comments in this section explain the training/eval distinction clearly for reviewers: + +> During training, the trunk updates weights and the memory/controller stack generates evidence and maintains the cache. During eval, the same memory substrate is live but the run is prequential: score each chunk under the current state first, accumulate loss, then optionally update from those already-scored tokens. The trunk never sees validation tokens before they are scored. + +**Dist init + role routing** (~25 lines) +Calls `chaoscontrol.public.engine_entry.init_arm_topology(world_size)`. 
On 8 GPUs: GPU 0–5 are train ranks, GPU 6 is the packet-serving rank, GPU 7 is the maintenance rank. On 4 GPUs: GPU 3 shares both memory roles. Role routing is encapsulated here because it can't be described readably inline. + +**Data + ValCache load** (~30 lines) +Shards from `DATA_PATH`. ValCache from `VAL_CACHE_DIR` (pre-built, not constructed at runtime). + +**Model + optimizer** (~60 lines) +Build `ChaosControlConfig` from the hyperparameter block. Instantiate `CareStudentLM`. Construct the SemanticOptimizer (Muon with `log_a_beta_coupling=True`) on matrix params; AdamW on embeddings and scalars. + +**Training loop** (~200 lines) +``` +while True: + if time.perf_counter() - t_start >= BUDGET_SECONDS: + break # always exits at a complete step boundary + step += 1 + + if step % 100 == 0: + log(step, loss, tokens_per_sec, elapsed_s) +``` + +Wallclock check is the first thing in each iteration. When it fires, the loop exits at a complete-step boundary — no partial state enters the artifact. Log message: `"training stopped at step N (wallclock), artifact built from step N state"`. + +**Artifact build** (~80 lines) +Calls `chaoscontrol.artifact.serialize_artifact(model, ...)`. GPTQ int6 + int7 embed + LZMA compression. Logs `code_bytes`, `model_bytes`, `total_bytes` explicitly. + +**Prequential eval** (~100 lines) +Loads the serialized artifact. Calls `evaluate_with_calc_types(model, val_cache, calc_types=["packet_online_cache"], config=eval_config)`. The `packet_online_cache` calc type enforces score-before-write at the Python level (raises `RuntimeError` if the cache slot count changes between cue read and score accumulation). Iterates all 50,000 validation documents. Returns `val_bpb`, `val_loss`. + +**Score summary** (~20 lines) +Rank 0 prints a parseable summary: `val_bpb`, `val_loss`, `artifact_bytes`, `train_steps`, `train_time_s`, `eval_time_s`. + +--- + +## 5. New Chaoscontrol Code — `public/engine_entry.py` + +~150–200 lines. Three functions: + +**`init_arm_topology(world_size) -> RoleInfo`** +GPU role assignment. Returns the local process's role (train / packet-serving / maintenance) and associated NCCL group handles. Single source of truth for the 6+2 topology. + +**`run_training(model, optimizer, data, config) -> TrainingResult`** +Thin wrapper over the existing training loop in `chaoscontrol.training`. Returns `steps`, `elapsed_s`, `final_loss`. Called by `train_gpt.py` after model/optimizer construction. + +**`run_eval(artifact_path, val_cache, config) -> EvalResult`** +Loads artifact, calls `evaluate_with_calc_types` with `packet_online_cache`. Returns `bpb`, `loss`, `docs_scored`, `elapsed_s`. + +--- + +## 6. Training / Eval Distinction + +The prequential eval contract, stated explicitly for reviewers: + +- **Score first:** Each chunk is scored under the model's current memory state. Loss is accumulated before the cache is updated. +- **Write after:** The just-scored hidden states and token NLLs are committed to the episodic cache only after scoring. Future chunks may read them. +- **Trunk weights frozen:** The trunk does not update during eval. Only the episodic cache grows. +- **Enforced:** `packet_online_cache.py` checks `_outer_slot_count(model)` before and after scoring each chunk; a count change before score accumulation raises immediately. + +--- + +## 7. Implementation Plan + +The following tasks (in order) produce a runnable train_gpt.py and a score: + +1. Create `src/chaoscontrol/public/__init__.py` and `engine_entry.py` with the three functions +2. 
Write `records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py` +3. Write `requirements.txt` +4. Spin up 8xH100 pod, run `scripts/pod_bootstrap.sh` +5. Run `torchrun --standalone --nproc_per_node=8 train_gpt.py` for seed 42 +6. Capture log, verify `val_bpb` in output +7. Repeat for seeds 1337 and 1234 (3-seed mean) +8. Fill `submission.json` and `README.md` + +--- + +## 8. Open Items + +- **ChaosSsm rename:** Nice-to-have. `CpuSsmController*` classes can be aliased or renamed in `chaoscontrol/public/` without touching internal code. Not blocking implementation. +- **ScOpt:** Not used in this submission. `ScarcityAwareOptimizer` is the parent concept that birthed `SemanticOptimizer`; noted for future work. +- **Folder name:** `2026-05-01_SemanticEngine_CareSSM` — may shift to a date after the actual run if we slip past May 1. From c8783f34f008fe4da568cc6fbfb222838b505935 Mon Sep 17 00:00:00 2001 From: Ken M Date: Fri, 1 May 2026 04:28:16 -0400 Subject: [PATCH 2/4] =?UTF-8?q?docs:=20SemanticEngine=20implementation=20p?= =?UTF-8?q?lan=20=E2=80=94=20engine=5Fentry,=20train=5Fgpt,=20smoke=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...026-05-01-semanticengine-implementation.md | 965 ++++++++++++++++++ 1 file changed, 965 insertions(+) create mode 100644 docs/plans/2026-05-01-semanticengine-implementation.md diff --git a/docs/plans/2026-05-01-semanticengine-implementation.md b/docs/plans/2026-05-01-semanticengine-implementation.md new file mode 100644 index 0000000000..2dcbf6d14e --- /dev/null +++ b/docs/plans/2026-05-01-semanticengine-implementation.md @@ -0,0 +1,965 @@ +# SemanticEngine Submission Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Produce a runnable `train_gpt.py` for the SemanticEngine ARM submission that trains on 8×H100, builds a ≤16MB artifact, and reports a prequential `val_bpb` score. + +**Architecture:** `engine_entry.py` in `chaoscontrol.public` is a thin adapter that (a) exposes a clean `init_arm_topology()` / `build_arm_config()` API for `train_gpt.py` and (b) delegates the full ARM training loop to the existing `run_condition()` in `experiments/23_fast_path/runner_fast_path.py` via a `CHAOSCONTROL_ROOT`-based sys.path injection. `train_gpt.py` is the env-var hyperparameter layer + main() that calls these functions and logs the score. + +**Tech Stack:** Python 3.11+, PyTorch 2.11+cu130, chaoscontrol (installed from GitHub), sentencepiece SP16384, HuggingFace shards from `Natooka/parameter-golf-sp-tokenizers`, native extensions (`_lm_head_loss`, `_cpu_ssm_controller`, `_ssm_scan`). + +**Key design decisions recorded in:** `docs/plans/2026-05-01-semanticengine-submission-design.md` + +--- + +## Dependency Order + +``` +Task 1 (scaffold) → Task 2 (topology fn) → Task 3 (config builder) → Task 4 (engine_entry complete) + ↓ +Task 5 (submission folder) → Task 6 (train_gpt.py Section 1) → Task 7 (train_gpt.py Section 2) → Task 8 (requirements.txt) → Task 9 (smoke test) +``` + +Tasks 2 and 3 can run in parallel after Task 1. Task 4 depends on 2 and 3. Tasks 5–9 are sequential. 
+ +--- + +## Repo targets + +- **chaoscontrol repo** (`/Users/kennethmalloy/Local Documents/Developer/chaoscontrol/`): Tasks 1–4 +- **parameter-golf repo** (`/Users/kennethmalloy/Local Documents/Developer/parameter-golf/`): Tasks 5–9 + +--- + +## Task 1: Scaffold `chaoscontrol/public/` module + +**Files:** +- Create: `src/chaoscontrol/public/__init__.py` +- Create: `src/chaoscontrol/public/engine_entry.py` (stub) + +**Step 1: Create `__init__.py`** + +```python +# src/chaoscontrol/public/__init__.py +from chaoscontrol.public.engine_entry import ( + RoleInfo, + init_arm_topology, + build_arm_config, + run_arm_submission, +) + +__all__ = ["RoleInfo", "init_arm_topology", "build_arm_config", "run_arm_submission"] +``` + +**Step 2: Create stub `engine_entry.py`** + +```python +# src/chaoscontrol/public/engine_entry.py +from __future__ import annotations +from dataclasses import dataclass +from typing import Any + +@dataclass +class RoleInfo: + rank: int + world_size: int + packet_rank: int + maintenance_rank: int + is_train_rank: bool + is_packet_rank: bool + is_maintenance_rank: bool + split_memory_ranks: bool + +def init_arm_topology(rank: int, world_size: int) -> RoleInfo: + raise NotImplementedError + +def build_arm_config(hyperparams: Any) -> dict[str, Any]: + raise NotImplementedError + +def run_arm_submission( + config: dict[str, Any], + *, + data_path: str, + sp_model_path: str, + budget_seconds: float, + output_json: str | None, + val_cache_dir: str | None, + world_size_override: int | None = None, +) -> dict[str, Any]: + raise NotImplementedError +``` + +**Step 3: Verify import works** + +```bash +cd /Users/kennethmalloy/Local\ Documents/Developer/chaoscontrol +.venv/bin/python -c "from chaoscontrol.public import RoleInfo; print('ok')" +``` + +Expected: `ok` + +**Step 4: Commit** + +```bash +git add src/chaoscontrol/public/__init__.py src/chaoscontrol/public/engine_entry.py +git commit -m "feat: scaffold chaoscontrol.public module for SemanticEngine submission" +``` + +--- + +## Task 2: Implement `init_arm_topology()` + +**Files:** +- Modify: `src/chaoscontrol/public/engine_entry.py` +- Create: `tests/public/test_engine_entry.py` + +**Reference:** `runner_fast_path.py` lines 1848–1876 — the function starting around line 1848 that returns a dict with `packet_rank`, `maintenance_rank`, `memory_ranks`, `train_ranks`, `split_memory_ranks`. Port its logic into `init_arm_topology()`. 
**Step 1: Write the failing tests**

```python
# tests/public/test_engine_entry.py
import pytest
from chaoscontrol.public.engine_entry import init_arm_topology

def test_8gpu_splits_6_2():
    role6 = init_arm_topology(rank=6, world_size=8)
    role7 = init_arm_topology(rank=7, world_size=8)
    role0 = init_arm_topology(rank=0, world_size=8)
    assert role6.is_packet_rank
    assert not role6.is_maintenance_rank
    assert role7.is_maintenance_rank
    assert not role7.is_packet_rank
    assert role0.is_train_rank
    assert role6.split_memory_ranks
    assert role7.split_memory_ranks

def test_4gpu_shares_memory():
    role3 = init_arm_topology(rank=3, world_size=4)
    role0 = init_arm_topology(rank=0, world_size=4)
    assert role3.is_packet_rank
    assert role3.is_maintenance_rank  # shared on 4 GPU
    assert not role3.is_train_rank
    assert role0.is_train_rank
    assert not role3.split_memory_ranks

def test_1gpu_all_train():
    role = init_arm_topology(rank=0, world_size=1)
    assert role.is_train_rank
    assert not role.is_packet_rank
    assert not role.is_maintenance_rank

def test_packet_rank_value_8gpu():
    role = init_arm_topology(rank=0, world_size=8)
    assert role.packet_rank == 6
    assert role.maintenance_rank == 7
```

**Step 2: Run to verify failure**

```bash
cd /Users/kennethmalloy/Local\ Documents/Developer/chaoscontrol
.venv/bin/python -m pytest tests/public/test_engine_entry.py -v
```

Expected: `NotImplementedError`

**Step 3: Implement**

Replace the `init_arm_topology` stub in `engine_entry.py`. Note the explicit single-GPU branch: without it, `packet_rank == maintenance_rank == 0` at `world_size=1`, rank 0 would wrongly stop being a train rank, and `test_1gpu_all_train` would fail.

```python
def init_arm_topology(rank: int, world_size: int) -> RoleInfo:
    """Assign GPU role. On 8+ GPUs with replay_eviction: 6+2 split.
    On 4 GPUs: 3+1 (GPU3 owns both packet-serving and maintenance).
    On 1 GPU: everything on rank 0.
    """
    world = int(world_size)
    if world <= 1:
        # Single GPU: no dedicated memory ranks. Rank 0 is a plain train
        # rank; packet-serving and maintenance fold into the train loop.
        return RoleInfo(
            rank=rank,
            world_size=world,
            packet_rank=-1,
            maintenance_rank=-1,
            is_train_rank=True,
            is_packet_rank=False,
            is_maintenance_rank=False,
            split_memory_ranks=False,
        )
    split = world >= 8  # split memory ranks only at 8+ GPUs
    packet_rank = world - (2 if split else 1)
    maintenance_rank = world - 1
    is_packet = rank == packet_rank
    is_maintenance = rank == maintenance_rank
    is_train = not is_packet and not is_maintenance
    # On 4 GPU (split=False), packet_rank == maintenance_rank == 3,
    # so that rank is both. is_train is False for it.
    return RoleInfo(
        rank=rank,
        world_size=world_size,
        packet_rank=packet_rank,
        maintenance_rank=maintenance_rank,
        is_train_rank=is_train,
        is_packet_rank=is_packet,
        is_maintenance_rank=is_maintenance,
        split_memory_ranks=split,
    )
```

**Step 4: Run to verify pass**

```bash
.venv/bin/python -m pytest tests/public/test_engine_entry.py -v
```

Expected: all 4 tests pass.

**Step 5: Commit**

```bash
git add src/chaoscontrol/public/engine_entry.py tests/public/test_engine_entry.py
git commit -m "feat: implement init_arm_topology() with 6+2 / 3+1 / 1-gpu routing"
```

---

## Task 3: Implement `build_arm_config()`

**Files:**
- Modify: `src/chaoscontrol/public/engine_entry.py`
- Modify: `tests/public/test_engine_entry.py`

**Context:** `build_arm_config()` takes the hyperparameter object from `train_gpt.py` (any object with attributes) and returns the config dict that `run_condition()` in `runner_fast_path.py` expects. The required keys are a superset of what `exp26._crct_lock()`, `exp26._fast_slow_lock()`, `exp26._replay_eviction_pipeline_lock()`, and `exp26._artifact_size_lock()` return — plus model/training keys like `vocab_size`, `seq_len`, `batch_size`, `budget_seconds`, `seed`, `optimizer`, etc.

**Reference:** Read `experiments/26_arm/exp26.py` for the four lock functions.
Read `experiments/23_fast_path/runner_fast_path.py:run_condition` (line 14055–14130 approx.) for what keys it reads from config. + +**Telemetry-tuned defaults (from profiling — these override exp26 lock values):** + +| Key | Value | Rationale | +|---|---|---| +| `crct_memory_write_tokens_per_step` | 192 | Up from 128/32; per-step cap headroom | +| `online_episodic_write_tokens_per_chunk` | 64 | Up from 16; first meaningful step without being reckless | +| `crct_target_write_rate` | 0.20 | Matches observed adaptive smoke ~0.219 | +| `async_teacher_max_lag_steps` | leave at current | Lag is 3–4 steps; pipe not bottleneck | +| `crct_async_teacher_pending_batches` | leave at current | No ring drops observed | + +**Step 1: Write the failing test** + +```python +# append to tests/public/test_engine_entry.py + +from chaoscontrol.public.engine_entry import build_arm_config + +class _FakeHyperparams: + vocab_size = 16384 + model_dim = 384 + ssm_delta_rank = 32 + seq_len = 512 + batch_size = 1024 + budget_seconds = 600.0 + seed = 42 + base_lr = 0.064 + weight_decay = 0.01 + grad_clip_norm = 1.0 + log_a_beta_coupling = True + log_a_beta_ema = 0.99 + log_a_beta_min = 0.5 + lm_head_tile_size = 4096 + +def test_build_arm_config_required_keys(): + cfg = build_arm_config(_FakeHyperparams()) + required = [ + "vocab_size", "model_dim", "seq_len", "batch_size", + "budget_seconds", "seed", "optimizer", + "crct_enabled", "replay_eviction_enabled", + "fast_slow_enabled", "fast_slow_alpha", + "crct_memory_write_tokens_per_step", + "online_episodic_write_tokens_per_chunk", + "crct_target_write_rate", + ] + for key in required: + assert key in cfg, f"missing key: {key}" + +def test_build_arm_config_telemetry_tuned_defaults(): + cfg = build_arm_config(_FakeHyperparams()) + assert cfg["crct_memory_write_tokens_per_step"] == 192 + assert cfg["online_episodic_write_tokens_per_chunk"] == 64 + assert abs(cfg["crct_target_write_rate"] - 0.20) < 1e-6 + assert cfg["model_dim"] == 384 + assert cfg["optimizer"] == "muon" + assert cfg["optimizer_log_a_beta_coupling"] is True +``` + +**Step 2: Run to verify failure** + +```bash +.venv/bin/python -m pytest tests/public/test_engine_entry.py::test_build_arm_config_required_keys -v +``` + +Expected: `NotImplementedError` + +**Step 3: Implement** + +Replace the `build_arm_config` stub. The function builds the config dict by merging the four exp26 lock dicts over a base config derived from hyperparams. 
Import the lock functions directly from exp26 (they are pure Python dict builders): + +```python +import sys +import os +from pathlib import Path + +def _exp26_locks() -> dict[str, Any]: + """Import lock dicts from exp26 without triggering exp26's dist init.""" + cc_root = os.environ.get("CHAOSCONTROL_ROOT", "/workspace/chaoscontrol") + exp26_dir = str(Path(cc_root) / "experiments" / "26_arm") + exp24_dir = str(Path(cc_root) / "experiments" / "24_training_time_bundle") + for d in (exp26_dir, exp24_dir): + if d not in sys.path: + sys.path.insert(0, d) + from exp26 import _crct_lock, _fast_slow_lock, _replay_eviction_pipeline_lock, _artifact_size_lock + return { + **_artifact_size_lock(), + **_fast_slow_lock(), + **_crct_lock(), + **_replay_eviction_pipeline_lock(), + } + + +def build_arm_config(hp: Any) -> dict[str, Any]: + """Build the run_condition config dict from train_gpt.py hyperparams.""" + locks = _exp26_locks() + cfg: dict[str, Any] = { + # --- model --- + "vocab_size": int(hp.vocab_size), + "seq_len": int(hp.seq_len), + "batch_size": int(hp.batch_size), + "dtype": "bf16", + "device": "auto", + # --- training --- + "budget_seconds": float(hp.budget_seconds), + "seed": int(hp.seed), + "base_lr": float(hp.base_lr), + "weight_decay": float(hp.weight_decay), + "grad_clip_norm": float(hp.grad_clip_norm), + # --- optimizer: Muon + SemanticOptimizer channel coupling --- + "optimizer": "muon", + "optimizer_log_a_beta_coupling": bool(hp.log_a_beta_coupling), + "optimizer_log_a_beta_ema": float(hp.log_a_beta_ema), + "optimizer_log_a_beta_min": float(hp.log_a_beta_min), + # --- calc_types: use packet_online_cache for official BPB --- + "calc_types": ["packet_online_cache"], + "headline_calc_type": "packet_online_cache", + } + # Merge ARM locks (crct, fast_slow, replay_eviction, artifact_size) + cfg.update(locks) + # Apply telemetry-tuned overrides (supersede lock defaults) + cfg.update({ + "crct_memory_write_tokens_per_step": int(getattr(hp, "crct_memory_write_tokens_per_step", 192)), + "online_episodic_write_tokens_per_chunk": int(getattr(hp, "online_episodic_write_tokens_per_chunk", 64)), + "crct_target_write_rate": float(getattr(hp, "crct_target_write_rate", 0.20)), + "lm_head_tile_size": int(getattr(hp, "lm_head_tile_size", 4096)), + }) + return cfg +``` + +**Step 4: Run to verify pass** + +```bash +.venv/bin/python -m pytest tests/public/test_engine_entry.py -v +``` + +Expected: all tests pass. + +**Step 5: Commit** + +```bash +git add src/chaoscontrol/public/engine_entry.py tests/public/test_engine_entry.py +git commit -m "feat: implement build_arm_config() with telemetry-tuned CRCT defaults" +``` + +--- + +## Task 4: Implement `run_arm_submission()` and complete `engine_entry.py` + +**Files:** +- Modify: `src/chaoscontrol/public/engine_entry.py` + +**Context:** `run_arm_submission()` adds `experiments/23_fast_path` (and any other needed experiment dirs) to `sys.path` using `CHAOSCONTROL_ROOT`, then imports and calls `run_condition` from `runner_fast_path.py`. This delegates the full 14,850-line ARM training+eval loop to the existing production implementation without reinventing it. + +**Step 1: Implement** + +```python +def _ensure_runner_path() -> None: + """Add experiment dirs to sys.path so runner_fast_path is importable. + + Requires the chaoscontrol repo to be cloned at CHAOSCONTROL_ROOT + (default /workspace/chaoscontrol). The pod bootstrap script puts it there. + On local dev, set CHAOSCONTROL_ROOT to your clone path. 
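
    Example (hypothetical local path; any clone location works):

        export CHAOSCONTROL_ROOT="$HOME/dev/chaoscontrol"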
+ """ + cc_root = os.environ.get("CHAOSCONTROL_ROOT", "/workspace/chaoscontrol") + root = Path(cc_root) + dirs = [ + root / "experiments" / "23_fast_path", + root / "experiments" / "24_training_time_bundle", + root / "experiments" / "26_arm", + root / "src", # in case chaoscontrol isn't installed, fall back to src + ] + for d in dirs: + s = str(d) + if s not in sys.path: + sys.path.insert(0, s) + + +def run_arm_submission( + config: dict[str, Any], + *, + data_path: str, + sp_model_path: str, + budget_seconds: float, + output_json: str | None, + val_cache_dir: str | None, + world_size_override: int | None = None, +) -> dict[str, Any]: + """Delegate to run_condition() in runner_fast_path.py. + + runner_fast_path.py is the production ARM training + eval loop + (experiments/23_fast_path/runner_fast_path.py, ~14,850 lines). + We call it directly rather than reimplementing it. The config dict + produced by build_arm_config() is what run_condition() expects. + """ + _ensure_runner_path() + from runner_fast_path import run_condition # type: ignore[import] + return run_condition( + config, + data_path=data_path, + sp_model_path=sp_model_path, + budget_seconds=budget_seconds, + output_json=output_json, + output_ckpt=None, + world_size_override=world_size_override, + val_cache_dir=val_cache_dir, + ) +``` + +**Step 2: Smoke-test the import chain on CPU (no GPU required)** + +```bash +CHAOSCONTROL_ROOT="/Users/kennethmalloy/Local Documents/Developer/chaoscontrol" \ + .venv/bin/python -c " +from chaoscontrol.public.engine_entry import _ensure_runner_path +_ensure_runner_path() +from runner_fast_path import run_condition +print('run_condition importable:', callable(run_condition)) +" +``` + +Expected: `run_condition importable: True` + +**Step 3: Commit** + +```bash +git add src/chaoscontrol/public/engine_entry.py +git commit -m "feat: implement run_arm_submission() delegating to runner_fast_path.run_condition" +``` + +--- + +## Task 5: Create submission folder structure + +**Repo:** parameter-golf +**Files:** +- Create dir: `records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/` +- Create dir: `records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/tokenizers/` +- Note: `submission.json`, `README.md`, and `train_seed*.log` are **not created yet** — they fill in after the actual run. + +**Step 1: Create the skeleton** + +```bash +mkdir -p "records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/tokenizers" +touch "records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/.gitkeep" +touch "records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/tokenizers/.gitkeep" +``` + +**Step 2: Verify** + +```bash +ls records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/ +``` + +Expected: `.gitkeep tokenizers/` + +**Step 3: Note on tokenizer** + +The SP16384 tokenizer (`fineweb_16384_bpe.model`) lives at `Natooka/parameter-golf-sp-tokenizers` on HuggingFace and at `baselines/parameter_golf/tokenizers/fineweb_16384_bpe.model` on the pod after bootstrap. 
Copy it into the submission folder on the pod before submitting: + +```bash +cp baselines/parameter_golf/tokenizers/fineweb_16384_bpe.model \ + records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/tokenizers/ +``` + +**Step 4: Commit** + +```bash +git add records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/ +git commit -m "feat: create SemanticEngine submission folder skeleton" +``` + +--- + +## Task 6: Write `train_gpt.py` — Section 1 (Hyperparameters) + +**Files:** +- Create: `records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py` + +**Step 1: Write the Hyperparameters class (heavily commented)** + +The file starts with standard imports and a `Hyperparameters` class configurable entirely via env vars. Comments explain the architectural motivation for every non-obvious value. Here is the complete Section 1: + +```python +#!/usr/bin/env python3 +"""SemanticEngine — CareSSM with live episodic memory. + +Entry point: torchrun --standalone --nproc_per_node=8 train_gpt.py + +SemanticEngine is a CareSSM trunk with a live episodic memory substrate +(CRCT evidence + streaming Adaptive Residual Memory maintenance). Unlike +every other top submission, this is a pure SSM architecture. The memory +substrate runs on dedicated GPUs (GPU6 packet-serving, GPU7 maintenance) +and never blocks the trunk step. + +Dependencies: + - chaoscontrol installed from https://github.com/KenMalloy/chaoscontrol + - CHAOSCONTROL_ROOT set to the cloned repo root (default /workspace/chaoscontrol) + - Native extensions built: see chaoscontrol/scripts/pod_build_native_extensions.sh + - SP16384 shards: Natooka/parameter-golf-sp-tokenizers on HuggingFace + - ValCache pre-built from the first 50k val docs (scripts/pod_bootstrap.sh) + +Components called out in the README: + - CareSSM: the recurrent SSM trunk blocks (CareSSMCore/CareSSMBlock) + - ChaosSsm: the CPU SSM controller (off-path evidence/scheduling plane) + - SemanticOptimizer: Muon with SSM-channel-coupled momentum β, so optimizer + time constants match each channel's forward-pass recurrence time constant + - GPU6/GPU7: dedicated memory ranks — never share compute with the trunk +""" +from __future__ import annotations + +import json +import os +import sys +import time +from pathlib import Path + + +class Hyperparameters: + # ------------------------------------------------------------------------- + # Paths + # ------------------------------------------------------------------------- + # SP16384 pre-tokenized shards, fetched from Natooka/parameter-golf-sp-tokenizers. + data_path: str = os.environ.get( + "DATA_PATH", + "/workspace/chaoscontrol/baselines/parameter_golf/datasets/fineweb10B_sp16384", + ) + # SP16384 SentencePiece model. Shipped in tokenizers/ inside this submission folder. + tokenizer_path: str = os.environ.get( + "TOKENIZER_PATH", + str(Path(__file__).parent / "tokenizers" / "fineweb_16384_bpe.model"), + ) + # ValCache directory — pre-built from the first 50,000 FineWeb validation documents. + # Required for prequential eval. Built by scripts/pod_bootstrap.sh. + val_cache_dir: str = os.environ.get( + "VAL_CACHE_DIR", + "/workspace/chaoscontrol/experiments/27_ttt_headline/val_cache", + ) + # Path where the chaoscontrol repo is cloned (for experiment runner import). + chaoscontrol_root: str = os.environ.get("CHAOSCONTROL_ROOT", "/workspace/chaoscontrol") + # JSON file to write final result into (optional). 
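    # Example (hypothetical path): OUTPUT_JSON=/workspace/results/semanticengine_seed42.json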
+ output_json: str | None = os.environ.get("OUTPUT_JSON", None) + + # ------------------------------------------------------------------------- + # Model + # ------------------------------------------------------------------------- + vocab_size: int = 16384 # SP16384 vocabulary + # dim=384 is the largest artifact-safe trunk width at int6/LZMA compression: + # 384 → ~13.71 MB, 416 → ~15.19 MB, 448 → ~16.73 MB (budget exceeded). + model_dim: int = int(os.environ.get("MODEL_DIM", 384)) + # Low-rank delta projection rank inside each CareSSMCore block. + ssm_delta_rank: int = int(os.environ.get("SSM_DELTA_RANK", 32)) + seq_len: int = int(os.environ.get("SEQ_LEN", 512)) + batch_size: int = int(os.environ.get("BATCH_SIZE", 1024)) + # Fused LM-head tile size. 4096 keeps the fused backward path while avoiding + # OOM on the cu130 pod stack after model activations at B=1024/T=512. + lm_head_tile_size: int = int(os.environ.get("LM_HEAD_TILE_SIZE", 4096)) + + # ------------------------------------------------------------------------- + # Training budget + # ------------------------------------------------------------------------- + # Hard wallclock cap. Checked at the top of each training step so the loop + # always exits at a complete-step boundary — never mid-step. + budget_seconds: float = float(os.environ.get("BUDGET_SECONDS", 600.0)) + seed: int = int(os.environ.get("SEED", 42)) + + # ------------------------------------------------------------------------- + # Optimizer — SemanticOptimizer (Muon + channel-coupled β) + # ------------------------------------------------------------------------- + # Muon (Newton-Schulz orthogonalized momentum) on matrix params; + # AdamW fallback on embeddings and scalars. + base_lr: float = float(os.environ.get("BASE_LR", 0.064)) + weight_decay: float = float(os.environ.get("WEIGHT_DECAY", 0.01)) + grad_clip_norm: float = float(os.environ.get("GRAD_CLIP_NORM", 1.0)) + # SemanticOptimizer: per-channel momentum β coupled to log_a decay. + # Slow-recurrence channels (log_a near 0 → decay near 1) get high β so + # gradients integrate over long horizons. Fast channels get lower β. + log_a_beta_coupling: bool = bool(int(os.environ.get("LOG_A_BETA_COUPLING", 1))) + log_a_beta_ema: float = float(os.environ.get("LOG_A_BETA_EMA", 0.99)) + log_a_beta_min: float = float(os.environ.get("LOG_A_BETA_MIN", 0.5)) + + # ------------------------------------------------------------------------- + # CRCT evidence substrate (telemetry-tuned from profiling on 4×H100) + # ------------------------------------------------------------------------- + # Per-step write cap. 192 gives the per-step cap meaningful headroom above + # 128 without entering noisy territory. (exp26 default was 32.) + crct_memory_write_tokens_per_step: int = int( + os.environ.get("CRCT_MEMORY_WRITE_TOKENS_PER_STEP", 192) + ) + # Per-chunk write budget for the online episodic cache. 64 is the first real + # step up from the profiled 16 without being reckless. + online_episodic_write_tokens_per_chunk: int = int( + os.environ.get("ONLINE_EPISODIC_WRITE_TOKENS_PER_CHUNK", 64) + ) + # Target write rate. 0.20 matches observed adaptive smoke behavior + # (payload rate ~14/64 = 0.219); previous lock value was 0.10. + crct_target_write_rate: float = float( + os.environ.get("CRCT_TARGET_WRITE_RATE", 0.20) + ) + # Async teacher lag and pending-batch limits are left at exp26 defaults. + # Profiling shows max lag 3–4 steps and no ring drops — the pipe is not + # the bottleneck, so no change warranted. 
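
    # NOTE: build_arm_config() reads the CRCT overrides above via getattr()
    # with defaults, so an override omitted here falls back silently; the core
    # model/training attributes are accessed directly and must exist.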
+``` + +**Step 2: Verify the file is valid Python** + +```bash +cd "records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM" +python3 -c "import ast; ast.parse(open('train_gpt.py').read()); print('syntax ok')" +``` + +Expected: `syntax ok` + +**Step 3: Write a unit test for hyperparameter defaults and env-var overrides** + +```python +# tests/submission/test_train_gpt_hyperparams.py +import importlib.util +import os +import sys +from pathlib import Path + +TRAIN_GPT = ( + Path(__file__).parents[2] + / "records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py" +) + +def _load_hp(): + spec = importlib.util.spec_from_file_location("train_gpt", TRAIN_GPT) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod.Hyperparameters + +def test_defaults(): + HP = _load_hp() + assert HP.vocab_size == 16384 + assert HP.model_dim == 384 + assert HP.budget_seconds == 600.0 + assert HP.crct_memory_write_tokens_per_step == 192 + assert HP.online_episodic_write_tokens_per_chunk == 64 + assert abs(HP.crct_target_write_rate - 0.20) < 1e-6 + assert HP.log_a_beta_coupling is True + +def test_env_override(monkeypatch): + monkeypatch.setenv("SEED", "1337") + monkeypatch.setenv("BUDGET_SECONDS", "300.0") + monkeypatch.setenv("MODEL_DIM", "256") + HP = _load_hp() + assert HP.seed == 1337 + assert HP.budget_seconds == 300.0 + assert HP.model_dim == 256 +``` + +**Step 4: Run tests** + +```bash +cd "/Users/kennethmalloy/Local Documents/Developer/parameter-golf" +# Use system python3 or a local venv if available +python3 -m pytest tests/submission/test_train_gpt_hyperparams.py -v +``` + +Expected: both tests pass. + +**Step 5: Commit** + +```bash +git add records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py +git add tests/submission/test_train_gpt_hyperparams.py +git commit -m "feat: train_gpt.py Section 1 — heavily commented Hyperparameters class" +``` + +--- + +## Task 7: Write `train_gpt.py` — Section 2 (`main()`) + +**Files:** +- Modify: `records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py` + +**Step 1: Append the full main() function** + +Append below the `Hyperparameters` class. This is a single `main()` that: +1. Inits dist, routes roles +2. Loads data +3. Builds + runs the submission via chaoscontrol.public +4. Logs the score + +```python +def main() -> None: + import torch + import torch.distributed as dist + + # The chaoscontrol.public module must be importable. + # Install chaoscontrol from GitHub per requirements.txt. + os.environ.setdefault("CHAOSCONTROL_ROOT", Hyperparameters.chaoscontrol_root) + from chaoscontrol.public.engine_entry import ( + init_arm_topology, + build_arm_config, + run_arm_submission, + ) + + # --- Distributed init --- + # torchrun sets RANK, LOCAL_RANK, WORLD_SIZE in the environment. + # dist.init_process_group reads them automatically. + backend = "nccl" if torch.cuda.is_available() else "gloo" + dist.init_process_group(backend=backend) + rank = dist.get_rank() + world_size = dist.get_world_size() + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + if torch.cuda.is_available(): + torch.cuda.set_device(local_rank) + + # --- Role routing --- + # On 8 GPUs: GPU0-5 are train ranks; GPU6 is the dedicated packet-serving + # rank (low-latency episodic residual production); GPU7 is the dedicated + # maintenance rank (oracle scoring, slot commits). The train ranks never + # wait on the memory ranks — if no fresh packet is ready, the trunk + # proceeds with a zero-residual failsafe. 
+ # On 4 GPUs: GPU3 shares both memory roles (smoke/profile topology). + role = init_arm_topology(rank=rank, world_size=world_size) + if rank == 0: + print( + f"[semanticengine] topology: world={world_size} " + f"packet_rank={role.packet_rank} maintenance_rank={role.maintenance_rank} " + f"split={role.split_memory_ranks}", + flush=True, + ) + + # --- Build config --- + # build_arm_config maps the Hyperparameters class → the config dict that + # runner_fast_path.run_condition() expects. It merges the four exp26 lock + # dicts (artifact_size, fast_slow, crct, replay_eviction) and applies the + # telemetry-tuned overrides. + config = build_arm_config(Hyperparameters) + + # --- Training + eval --- + # run_arm_submission delegates to run_condition() in runner_fast_path.py, + # the production ARM training + prequential eval loop (~14,850 lines). + # + # Training: trunk updates weights; memory/controller stack generates + # evidence and maintains the cache. Wallclock is checked at the top of + # each step — the loop always exits at a complete-step boundary. + # + # Eval: same memory substrate is live, but the run is prequential. + # Score each chunk under the current state first, accumulate loss/BPB, + # then optionally update from already-scored tokens. The trunk never + # sees validation tokens before they are scored. Enforced at the Python + # level: packet_online_cache raises if the slot count changes between + # cue read and score accumulation. + t_start = time.perf_counter() + result = run_arm_submission( + config, + data_path=Hyperparameters.data_path, + sp_model_path=Hyperparameters.tokenizer_path, + budget_seconds=Hyperparameters.budget_seconds, + output_json=Hyperparameters.output_json, + val_cache_dir=Hyperparameters.val_cache_dir, + world_size_override=world_size, + ) + elapsed = time.perf_counter() - t_start + + # --- Score summary (rank 0 only) --- + if rank == 0: + eval_r = result.get("eval") or {} + calc_types = eval_r.get("calc_types") or {} + poc = calc_types.get("packet_online_cache") or {} + train_r = result.get("train") or {} + print( + f"\n[semanticengine] === SCORE SUMMARY ===\n" + f" val_bpb: {poc.get('bpb', float('nan')):.6f}\n" + f" val_loss: {poc.get('loss', float('nan')):.6f}\n" + f" docs_scored: {poc.get('docs_scored', 0)}\n" + f" train_steps: {train_r.get('steps', 0)}\n" + f" train_elapsed_s: {train_r.get('elapsed_s', 0.0):.1f}\n" + f" total_elapsed_s: {elapsed:.1f}\n" + f" artifact_bytes: {result.get('artifact_bytes', 'N/A')}\n" + f" code_bytes: {result.get('code_bytes', 'N/A')}\n" + f"[semanticengine] === END SUMMARY ===", + flush=True, + ) + if Hyperparameters.output_json: + Path(Hyperparameters.output_json).write_text(json.dumps(result, indent=2, default=str)) + + dist.destroy_process_group() + + +if __name__ == "__main__": + main() +``` + +**Step 2: Verify syntax** + +```bash +python3 -c "import ast; ast.parse(open('train_gpt.py').read()); print('syntax ok')" +``` + +Expected: `syntax ok` + +**Step 3: Test the score-summary log parsing** + +```python +# append to tests/submission/test_train_gpt_hyperparams.py + +def test_score_summary_keys_present(): + """Regression: the summary block must contain val_bpb and artifact_bytes.""" + source = TRAIN_GPT.read_text() + assert "val_bpb" in source + assert "artifact_bytes" in source + assert "packet_online_cache" in source + assert "score-before-write" in source.lower() or "score each chunk" in source.lower() +``` + +**Step 4: Run** + +```bash +python3 -m pytest tests/submission/ -v +``` + +Expected: all pass. 
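For later log-grepping on the pod, the rank-0 summary from `main()` renders like this (numbers illustrative; the bytes fields depend on the artifact build):

```
[semanticengine] === SCORE SUMMARY ===
  val_bpb: 1.640762
  val_loss: 4.070076
  docs_scored: 50000
  train_steps: 1692
  train_elapsed_s: 596.0
  total_elapsed_s: 943.2
  artifact_bytes: 13710000
  code_bytes: 21000
[semanticengine] === END SUMMARY ===
```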
+ +**Step 5: Commit** + +```bash +git add records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py +git add tests/submission/test_train_gpt_hyperparams.py +git commit -m "feat: train_gpt.py Section 2 — main() with role routing, ARM training, prequential eval, score summary" +``` + +--- + +## Task 8: Write `requirements.txt` + +**Files:** +- Create: `records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/requirements.txt` + +**Step 1: Write the file** + +``` +# Core — exact versions used on the submission pod +torch==2.11.0 +sentencepiece>=0.2.0 +numpy>=1.24 +huggingface-hub>=0.22 + +# SemanticEngine / ChaosControl library +chaoscontrol @ git+https://github.com/KenMalloy/chaoscontrol.git + +# TransformerEngine (CUDA 13 build). Must be installed before building native extensions. +# transformer_engine[pytorch]==2.13.0 +# Install with: +# pip install transformer_engine[pytorch]==2.13.0 \ +# --extra-index-url https://pypi.nvidia.com \ +# --only-binary=:all: \ +# nvidia-cublas==13.4.0.1 +# +# See chaoscontrol/scripts/pod_setup_cuda13.sh for the full idempotent install. + +# Native extensions — built from the chaoscontrol repo, not pip-installed. +# After cloning to CHAOSCONTROL_ROOT, run: +# bash scripts/pod_build_native_extensions.sh +# Extensions: _lm_head_loss, _cpu_ssm_controller, _ssm_scan +``` + +**Step 2: Commit** + +```bash +git add records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/requirements.txt +git commit -m "feat: add requirements.txt with chaoscontrol + TE install notes" +``` + +--- + +## Task 9: Pod smoke test (requires 8×H100) + +**This task runs on the RunPod pod, not locally.** + +**Step 1: Bootstrap the pod** + +```bash +cd /workspace +git clone https://github.com/KenMalloy/chaoscontrol.git +cd chaoscontrol +bash scripts/pod_bootstrap.sh +``` + +Expected: smoke check prints `torch X.Y.Z cuda=True GPUs=8` + +**Step 2: Copy the submission folder to the pod** + +Either push to git and pull, or rsync. Ensure `tokenizers/fineweb_16384_bpe.model` is present: + +```bash +cp baselines/parameter_golf/tokenizers/fineweb_16384_bpe.model \ + /path/to/submission/tokenizers/ +``` + +**Step 3: Dry-run check (no actual training, just verifies imports + config build)** + +```bash +CHAOSCONTROL_ROOT=/workspace/chaoscontrol \ +DATA_PATH=/workspace/chaoscontrol/baselines/parameter_golf/datasets/fineweb10B_sp16384 \ +TOKENIZER_PATH=/path/to/submission/tokenizers/fineweb_16384_bpe.model \ +VAL_CACHE_DIR=/workspace/chaoscontrol/experiments/27_ttt_headline/val_cache \ +BUDGET_SECONDS=30 \ +torchrun --standalone --nproc_per_node=8 train_gpt.py 2>&1 | tee train_seed42_smoke.log +``` + +**Step 4: Check smoke log** + +Look for: +- `[semanticengine] topology: world=8 packet_rank=6 maintenance_rank=7 split=True` +- `=== SCORE SUMMARY ===` block with numeric `val_bpb` +- `artifact_bytes` that is ≤ 16,000,000 + +**Step 5: Full run — 3 seeds** + +```bash +for SEED in 42 1337 1234; do + SEED=$SEED torchrun --standalone --nproc_per_node=8 train_gpt.py \ + 2>&1 | tee train_seed${SEED}.log +done +``` + +Extract the three `val_bpb` values and compute the mean. This is the number that goes into `submission.json`. 
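A quick way to pull the three values and average them (assumes the score-summary format from Section 2; adjust the pattern if that format changes):

```bash
# Mean val_bpb across the three seed logs (summary format from train_gpt.py).
grep -h "val_bpb:" train_seed42.log train_seed1337.log train_seed1234.log \
  | awk '{s += $2; n += 1} END {printf "mean val_bpb: %.6f\n", s / n}'
```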
+ +--- + +## Post-run: Fill submission.json and README.md + +After the 3-seed run, fill: +- `submission.json` — follow the format of `2026-04-27_SP8192_LQER_SparseGate_BOSSmearFix_9HpStack_1.0611/submission.json` +- `README.md` — headline, component table (SemanticEngine / CareSSM / ChaosSsm / GPU6 / GPU7 / SemanticOptimizer), results table, architecture section, reproducing command + +--- + +## Optional (nice-to-have): ChaosSsm alias + +If time permits: in `src/chaoscontrol/public/engine_entry.py`, add: + +```python +from chaoscontrol.episodic.cpu_ssm_controller import CpuSsmControllerRuntime as ChaosSsm +``` + +And export it from `__init__.py`. This gives reviewers a clean name to reference without touching internal class names. From e5307a23c9c4219175f52589b738a965bcae9975 Mon Sep 17 00:00:00 2001 From: Ken M Date: Fri, 1 May 2026 11:05:08 -0400 Subject: [PATCH 3/4] Add SemanticEngine CareSSM submission --- ...026-05-01-semanticengine-implementation.md | 22 +- .../README.md | 60 ++++ .../requirements.txt | 24 ++ .../submission.json | 68 ++++ .../tokenizers/.gitkeep | 0 .../train_gpt.py | 332 ++++++++++++++++++ tests/submission/__init__.py | 0 .../submission/test_train_gpt_hyperparams.py | 119 +++++++ 8 files changed, 614 insertions(+), 11 deletions(-) create mode 100644 records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/README.md create mode 100644 records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/requirements.txt create mode 100644 records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/submission.json create mode 100644 records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/tokenizers/.gitkeep create mode 100644 records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py create mode 100644 tests/submission/__init__.py create mode 100644 tests/submission/test_train_gpt_hyperparams.py diff --git a/docs/plans/2026-05-01-semanticengine-implementation.md b/docs/plans/2026-05-01-semanticengine-implementation.md index 2dcbf6d14e..11a4f18ea8 100644 --- a/docs/plans/2026-05-01-semanticengine-implementation.md +++ b/docs/plans/2026-05-01-semanticengine-implementation.md @@ -226,9 +226,9 @@ git commit -m "feat: implement init_arm_topology() with 6+2 / 3+1 / 1-gpu routin | Key | Value | Rationale | |---|---|---| -| `crct_memory_write_tokens_per_step` | 192 | Up from 128/32; per-step cap headroom | +| `crct_memory_write_tokens_per_step` | 256 | Up from 128/32; per-step cap headroom | | `online_episodic_write_tokens_per_chunk` | 64 | Up from 16; first meaningful step without being reckless | -| `crct_target_write_rate` | 0.20 | Matches observed adaptive smoke ~0.219 | +| `crct_target_write_rate` | 0.25 | Slightly above observed adaptive smoke ~0.219; round number | | `async_teacher_max_lag_steps` | leave at current | Lag is 3–4 steps; pipe not bottleneck | | `crct_async_teacher_pending_batches` | leave at current | No ring drops observed | @@ -271,9 +271,9 @@ def test_build_arm_config_required_keys(): def test_build_arm_config_telemetry_tuned_defaults(): cfg = build_arm_config(_FakeHyperparams()) - assert cfg["crct_memory_write_tokens_per_step"] == 192 + assert cfg["crct_memory_write_tokens_per_step"] == 256 assert cfg["online_episodic_write_tokens_per_chunk"] == 64 - assert abs(cfg["crct_target_write_rate"] - 0.20) < 1e-6 + assert abs(cfg["crct_target_write_rate"] - 0.25) < 1e-6 assert cfg["model_dim"] == 384 assert cfg["optimizer"] == "muon" assert cfg["optimizer_log_a_beta_coupling"] is True @@ -342,9 +342,9 @@ def build_arm_config(hp: Any) -> dict[str, Any]: 
cfg.update(locks) # Apply telemetry-tuned overrides (supersede lock defaults) cfg.update({ - "crct_memory_write_tokens_per_step": int(getattr(hp, "crct_memory_write_tokens_per_step", 192)), + "crct_memory_write_tokens_per_step": int(getattr(hp, "crct_memory_write_tokens_per_step", 256)), "online_episodic_write_tokens_per_chunk": int(getattr(hp, "online_episodic_write_tokens_per_chunk", 64)), - "crct_target_write_rate": float(getattr(hp, "crct_target_write_rate", 0.20)), + "crct_target_write_rate": float(getattr(hp, "crct_target_write_rate", 0.25)), "lm_head_tile_size": int(getattr(hp, "lm_head_tile_size", 4096)), }) return cfg @@ -604,20 +604,20 @@ class Hyperparameters: # ------------------------------------------------------------------------- # CRCT evidence substrate (telemetry-tuned from profiling on 4×H100) # ------------------------------------------------------------------------- - # Per-step write cap. 192 gives the per-step cap meaningful headroom above + # Per-step write cap. 256 gives the per-step cap meaningful headroom above # 128 without entering noisy territory. (exp26 default was 32.) crct_memory_write_tokens_per_step: int = int( - os.environ.get("CRCT_MEMORY_WRITE_TOKENS_PER_STEP", 192) + os.environ.get("CRCT_MEMORY_WRITE_TOKENS_PER_STEP", 256) ) # Per-chunk write budget for the online episodic cache. 64 is the first real # step up from the profiled 16 without being reckless. online_episodic_write_tokens_per_chunk: int = int( os.environ.get("ONLINE_EPISODIC_WRITE_TOKENS_PER_CHUNK", 64) ) - # Target write rate. 0.20 matches observed adaptive smoke behavior - # (payload rate ~14/64 = 0.219); previous lock value was 0.10. + # Target write rate. 0.25 is slightly above the observed adaptive smoke + # behavior (~0.219) and is a clean round number. Previous lock was 0.10. crct_target_write_rate: float = float( - os.environ.get("CRCT_TARGET_WRITE_RATE", 0.20) + os.environ.get("CRCT_TARGET_WRITE_RATE", 0.25) ) # Async teacher lag and pending-batch limits are left at exp26 defaults. # Profiling shows max lag 3–4 steps and no ring drops — the pipe is not diff --git a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/README.md b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/README.md new file mode 100644 index 0000000000..752325ec4b --- /dev/null +++ b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/README.md @@ -0,0 +1,60 @@ +# SemanticEngine — CareSSM + Live Episodic Memory + +**Track:** track_10min_16mb +**val_bpb:** 1.642868 (3-seed mean, std 0.023340) +**eval:** full 50k FineWeb validation docs, legal prequential packet-online cache + +## Architecture + +**SemanticEngine** is a CareSSM trunk with live episodic memory. Unlike the transformer submissions, this is a pure SSM architecture whose memory substrate is active during both training and prequential eval. + +### Named Components + +| Name | Role | +|---|---| +| **SemanticEngine** | Overall system | +| **CareSSM** | Diagonal recurrent SSM trunk blocks | +| **ChaosSsm** | CPU SSM controller / scheduling plane | +| **Episodic memory** | CRCT evidence substrate + MultiSlotOuterModel + replay eviction pipeline | +| **SemanticOptimizer** | Muon with SSM-channel-coupled momentum beta | + +### Dedicated Memory GPUs (8xH100) + +On 8xH100, GPU 6 and GPU 7 are not train ranks. They own the memory substrate exclusively: + +- **GPU 6 (packet-serving rank):** Builds low-latency episodic residual packets from the pre-recurrence stream and publishes them to train ranks without blocking the trunk step. 
+- **GPU 7 (maintenance rank):** Owns memory maintenance, slot refresh, and slot commits. + +Train ranks never wait on a memory GPU. If no fresh packet is available, the trunk proceeds with a zero-residual failsafe. + +### Training vs. Eval + +During training, the trunk updates weights while the memory/controller stack generates evidence and maintains the cache. + +During eval, the same memory substrate is live, but the run is **prequential**: each chunk is scored under the current memory state first, loss is accumulated, then the cache is updated from the just-scored tokens. The trunk never sees validation tokens before they are scored. The packet-online eval path raises if cache slot count changes before score accumulation. + +## Results + +| Seed | val_loss | val_bpb | Train steps | Train time | Eval time | Cache slots | +|---|---:|---:|---:|---:|---:|---:| +| 42 | 4.070076 | 1.640762 | 1692 | 596.0s | 347.0s | 93,346 -> 139,998 | +| 1337 | 4.135631 | 1.667189 | 1692 | 594.1s | 349.5s | 89,776 -> 136,428 | +| 294924 | 4.020193 | 1.620653 | 1688 | 594.3s | 364.8s | 93,091 -> 139,743 | +| **Mean** | **4.075300** | **1.642868** | **1690.7** | **594.8s** | **353.8s** | | + +All evals scored the full 50,000-doc validation set: 42,216,034 scored tokens and 151,080,645 raw bytes per seed. Each eval performed 3,348 episodic reads and 3,348 score-first episodic writes. + +## Reproduction + +```bash +# 1. Clone chaoscontrol and bootstrap the pod +git clone https://github.com/KenMalloy/chaoscontrol.git /workspace/chaoscontrol +HF_TOKEN= bash /workspace/chaoscontrol/scripts/pod_bootstrap.sh + +# 2. Run one seed +SEED=42 torchrun --standalone --nproc_per_node=8 train_gpt.py + +# 3. Eval-only from a saved checkpoint +EVAL_ONLY=1 CHECKPOINT_PATH=/path/to/checkpoint.pt \ + torchrun --standalone --nproc_per_node=8 train_gpt.py +``` diff --git a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/requirements.txt b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/requirements.txt new file mode 100644 index 0000000000..bd04216901 --- /dev/null +++ b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/requirements.txt @@ -0,0 +1,24 @@ +# Core — exact versions used on the submission pod +torch==2.11.0 +sentencepiece>=0.2.0 +numpy>=1.24 +huggingface-hub>=0.22 + +# SemanticEngine / ChaosControl library. Pinned to the public commit that adds +# the batched packet-online eval path and submission-facing engine entrypoint. +chaoscontrol @ git+https://github.com/KenMalloy/chaoscontrol.git@e7da6b53bb5be4020a5c3ab043c12c6695d12065 + +# TransformerEngine (CUDA 13 build). Must be installed before building native extensions. +# transformer_engine[pytorch]==2.13.0 +# Install with: +# pip install transformer_engine[pytorch]==2.13.0 \ +# --extra-index-url https://pypi.nvidia.com \ +# --only-binary=:all: \ +# nvidia-cublas==13.4.0.1 +# +# See chaoscontrol/scripts/pod_setup_cuda13.sh for the full idempotent install. + +# Native extensions — built from the chaoscontrol repo, not pip-installed. 
+# After cloning to CHAOSCONTROL_ROOT, run: +# bash scripts/pod_build_native_extensions.sh +# Extensions: _lm_head_loss, _cpu_ssm_controller, _ssm_scan diff --git a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/submission.json b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/submission.json new file mode 100644 index 0000000000..24810ac4d8 --- /dev/null +++ b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/submission.json @@ -0,0 +1,68 @@ +{ + "track": "track_10min_16mb", + "submission_name": "SemanticEngine_CareSSM", + "name": "SemanticEngine CareSSM + Live Episodic Memory", + "blurb": "Pure SSM trunk with a live episodic memory substrate active during training and legal prequential eval. On 8xH100, ranks 0-5 train the CareSSM trunk, rank 6 serves low-latency episodic residual packets, and rank 7 runs memory maintenance. Eval scores each chunk before committing its evidence to the cache for future chunks.", + "date": "2026-05-01", + "val_loss": 4.07530019, + "val_bpb": 1.64286828, + "val_loss_std": 0.05789620, + "val_bpb_std": 0.02333959, + "seeds": [42, 1337, 294924], + "seed_results": { + "42": { + "val_loss": 4.07007627, + "val_bpb": 1.64076237, + "steps": 1692, + "train_time_s": 595.97, + "eval_time_s": 347.0, + "docs_scored": 50000, + "tokens_scored": 42216034, + "episodic_reads": 3348, + "episodic_writes": 3348, + "slot_count_initial": 93346, + "slot_count_final": 139998 + }, + "1337": { + "val_loss": 4.13563133, + "val_bpb": 1.66718946, + "steps": 1692, + "train_time_s": 594.15, + "eval_time_s": 349.5, + "docs_scored": 50000, + "tokens_scored": 42216034, + "episodic_reads": 3348, + "episodic_writes": 3348, + "slot_count_initial": 89776, + "slot_count_final": 136428 + }, + "294924": { + "val_loss": 4.02019298, + "val_bpb": 1.62065301, + "steps": 1688, + "train_time_s": 594.27, + "eval_time_s": 364.8, + "docs_scored": 50000, + "tokens_scored": 42216034, + "episodic_reads": 3348, + "episodic_writes": 3348, + "slot_count_initial": 93091, + "slot_count_final": 139743 + } + }, + "train_steps_mean": 1690.67, + "train_time_s_mean": 594.78, + "eval_time_s_mean": 353.77, + "artifact_bytes_estimate": 44600064, + "artifact_submit_valid": true, + "hardware": "8xH100 80GB", + "eval_method": "packet_online_cache_prequential_full_50k", + "compliance": { + "three_seeds": true, + "training_under_600s": true, + "eval_under_600s": true, + "score_before_write": true, + "full_50k_validation_docs": true, + "validation_tokens_scored_before_memory_update": true + } +} diff --git a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/tokenizers/.gitkeep b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/tokenizers/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py new file mode 100644 index 0000000000..de3d0aa1ce --- /dev/null +++ b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/train_gpt.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python3 +"""SemanticEngine — CareSSM with live episodic memory. + +Entry point: torchrun --standalone --nproc_per_node=8 train_gpt.py + +SemanticEngine is a CareSSM trunk with a live episodic memory substrate +(CRCT evidence + streaming Adaptive Residual Memory maintenance). Unlike +every other top submission, this is a pure SSM architecture. The memory +substrate runs on dedicated GPUs (GPU6 packet-serving, GPU7 maintenance) +and never blocks the trunk step. 
+ +Dependencies: + - chaoscontrol installed from https://github.com/KenMalloy/chaoscontrol + - CHAOSCONTROL_ROOT set to the cloned repo root (default /workspace/chaoscontrol) + - Native extensions built: see chaoscontrol/scripts/pod_build_native_extensions.sh + - SP16384 shards: Natooka/parameter-golf-sp-tokenizers on HuggingFace + - ValCache pre-built from the first 50k val docs (scripts/pod_bootstrap.sh) + +Components called out in the README: + - CareSSM: the recurrent SSM trunk blocks (CareSSMCore/CareSSMBlock) + - ChaosSsm: the CPU SSM controller (off-path evidence/scheduling plane) + - SemanticOptimizer: Muon with SSM-channel-coupled momentum β, so optimizer + time constants match each channel's forward-pass recurrence time constant + - GPU6/GPU7: dedicated memory ranks — never share compute with the trunk +""" +from __future__ import annotations + +import json +import os +import sys # noqa: F401 # reserved for future sys.exit paths +import time +from pathlib import Path + + +def _env_bool(key: str, default: int) -> bool: + return bool(int(os.environ.get(key, default))) + + +# 75.92 GiB of PyTorch-allocated tensors leaves only ~522 MiB contiguous free +# on an 80 GiB H100 when the SSM scan tries to allocate a 768 MiB (B×T×D +# float32) tensor. The remaining 728 MiB is reserved but fragmented. +# expandable_segments lets the allocator compose non-contiguous segments +# instead of requiring a single 768 MiB contiguous block. +os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") + + +class Hyperparameters: + # ------------------------------------------------------------------------- + # Paths + # ------------------------------------------------------------------------- + # SP16384 pre-tokenized shards, fetched from Natooka/parameter-golf-sp-tokenizers. + data_path: str = os.environ.get( + "DATA_PATH", + "/workspace/chaoscontrol/baselines/parameter_golf/datasets/fineweb10B_sp16384", + ) + # SP16384 SentencePiece model. Shipped in tokenizers/ inside this submission folder. + tokenizer_path: str = os.environ.get( + "TOKENIZER_PATH", + str(Path(__file__).parent / "tokenizers" / "fineweb_16384_bpe.model"), + ) + # ValCache directory — pre-built from the first 50,000 FineWeb validation documents. + # Required for prequential eval. Built by scripts/pod_bootstrap.sh. + val_cache_dir: str = os.environ.get( + "VAL_CACHE_DIR", + "/workspace/chaoscontrol/experiments/27_ttt_headline/val_cache", + ) + # Path where the chaoscontrol repo is cloned (for experiment runner import). + chaoscontrol_root: str = os.environ.get("CHAOSCONTROL_ROOT", "/workspace/chaoscontrol") + # JSON / checkpoint files to write final result into (optional). + output_json: str | None = os.environ.get("OUTPUT_JSON", None) + output_ckpt: str | None = os.environ.get("OUTPUT_CKPT", None) + + # ------------------------------------------------------------------------- + # Model + # ------------------------------------------------------------------------- + vocab_size: int = 16384 # SP16384 vocabulary + # dim=384 is the largest artifact-safe trunk width at int6/LZMA compression: + # 384 → ~13.71 MB, 416 → ~15.19 MB, 448 → ~16.73 MB (budget exceeded). + model_dim: int = int(os.environ.get("MODEL_DIM", 384)) + # 8 layers is the scaling-law-validated depth for dim=384 (see exp10). + num_layers: int = int(os.environ.get("NUM_LAYERS", 8)) + # Keep the final path uncheckpointed so the trunk uses the fastest SSM + # backward. 
On 8xH100, B=1024/960 OOM before step 1 and B=832 OOMed late; + # B=800 completed three 600s seeds without activation checkpointing. + activation_checkpoint: bool = _env_bool("ACTIVATION_CHECKPOINT", 0) + # Low-rank delta projection rank inside each CareSSMCore block. + ssm_delta_rank: int = int(os.environ.get("SSM_DELTA_RANK", 32)) + seq_len: int = int(os.environ.get("SEQ_LEN", 512)) + batch_size: int = int(os.environ.get("BATCH_SIZE", 800)) + # Fused LM-head tile size. Scratch = B*T*tile*2 bytes. + # tile=4096 → 4 GiB; combined with 74.66 GiB of other allocations this leaves + # only 1.34 GiB free on an 80 GiB H100, causing fragmentation OOM in the SSM + # scan. tile=2048 → 2 GiB scratch, freeing 2 GiB and giving 3.34 GiB headroom. + lm_head_tile_size: int = int(os.environ.get("LM_HEAD_TILE_SIZE", 2048)) + + # ------------------------------------------------------------------------- + # Training budget + # ------------------------------------------------------------------------- + # Hard wallclock cap. Checked at the top of each training step so the loop + # always exits at a complete-step boundary — never mid-step. + budget_seconds: float = float(os.environ.get("BUDGET_SECONDS", 600.0)) + # Stop the gradient-bearing loop early enough for complete-step shutdown, + # DDP bookkeeping, and checkpoint serialization while the submission still + # advertises the true 600s training budget. Hardware/compiler warmup must + # remain weight/state-free and outside this timed training loop. + stop_margin_seconds: float = float(os.environ.get("STOP_MARGIN_SECONDS", 32.0)) + seed: int = int(os.environ.get("SEED", 42)) + max_steps: int | None = ( + None + if os.environ.get("MAX_STEPS") in (None, "") + else int(os.environ["MAX_STEPS"]) + ) + train_only: bool = _env_bool("TRAIN_ONLY", 0) + eval_only: bool = _env_bool("EVAL_ONLY", 0) + checkpoint_path: str | None = os.environ.get("CHECKPOINT_PATH", None) + eval_max_docs: int = int(os.environ.get("EVAL_MAX_DOCS", 0)) + # Eval is prequential, but causal does not mean read-only. Score each + # microbatch first, then let already-scored evidence enter the episodic + # cache for future microbatches. These defaults are the live-writing path; + # set PACKET_EVAL_WRITE_TOKENS_PER_CHUNK=0 only as an emergency budget + # fallback. + packet_eval_batch_docs: int = int(os.environ.get("PACKET_EVAL_BATCH_DOCS", 48)) + packet_eval_batch_token_budget: int = int( + os.environ.get("PACKET_EVAL_BATCH_TOKEN_BUDGET", 49152) + ) + packet_eval_write_tokens_per_chunk: int = int( + os.environ.get("PACKET_EVAL_WRITE_TOKENS_PER_CHUNK", 1) + ) + packet_eval_controller_read_enabled: bool = _env_bool( + "PACKET_EVAL_CONTROLLER_READ", 0 + ) + packet_eval_controller_topk_k: int = int( + os.environ.get("PACKET_EVAL_CONTROLLER_TOPK_K", 16) + ) + packet_eval_controller_score_mode: str = os.environ.get( + "PACKET_EVAL_CONTROLLER_SCORE_MODE", "cosine_survival" + ) + + # ------------------------------------------------------------------------- + # Optimizer — SemanticOptimizer (Muon + channel-coupled β) + # ------------------------------------------------------------------------- + # Muon (Newton-Schulz orthogonalized momentum) on matrix params; + # AdamW fallback on embeddings and scalars. + base_lr: float = float(os.environ.get("BASE_LR", 0.064)) + weight_decay: float = float(os.environ.get("WEIGHT_DECAY", 0.01)) + grad_clip_norm: float = float(os.environ.get("GRAD_CLIP_NORM", 1.0)) + # SemanticOptimizer: per-channel momentum β coupled to log_a decay. 
+ # Slow-recurrence channels (log_a near 0 → decay near 1) get high β so + # gradients integrate over long horizons. Fast channels get lower β. + log_a_beta_coupling: bool = _env_bool("LOG_A_BETA_COUPLING", 1) + log_a_beta_ema: float = float(os.environ.get("LOG_A_BETA_EMA", 0.99)) + log_a_beta_min: float = float(os.environ.get("LOG_A_BETA_MIN", 0.5)) + + # ------------------------------------------------------------------------- + # CRCT evidence substrate (telemetry-tuned from profiling on 4×H100) + # ------------------------------------------------------------------------- + # Per-step write cap. 256 gives the per-step cap meaningful headroom above + # 128 without entering noisy territory. (exp26 default was 32.) + crct_memory_write_tokens_per_step: int = int( + os.environ.get("CRCT_MEMORY_WRITE_TOKENS_PER_STEP", 256) + ) + # Per-chunk write budget for the online episodic cache. 64 is the first real + # step up from the profiled 16 without being reckless. + online_episodic_write_tokens_per_chunk: int = int( + os.environ.get("ONLINE_EPISODIC_WRITE_TOKENS_PER_CHUNK", 64) + ) + # Target write rate. 0.25 is slightly above the observed adaptive smoke + # behavior (~0.219) and is a clean round number. Previous lock was 0.10. + crct_target_write_rate: float = float( + os.environ.get("CRCT_TARGET_WRITE_RATE", 0.25) + ) + # Weight mirroring is latest-complete, not every-step truth. On the final + # 8×H100 B=800 run, physical snapshot throughput was ~1 publish per + # 6.1 steps (293 writes / 1784 steps) while every-step attempts caused + # ~1490 latest overwrites. Match the cadence to the hardware instead of + # queueing stale mirror work. + crct_teacher_param_sync_interval_steps: int = int( + os.environ.get("CRCT_TEACHER_PARAM_SYNC_INTERVAL_STEPS", 6) + ) + # Async teacher lag and pending-batch limits are left at exp26 defaults. + # The observed pipe had no ring drops; stale snapshot churn, not ring + # capacity, was the bottleneck. + + +def main() -> None: + import torch + import torch.distributed as dist + + # The chaoscontrol.public module must be importable. + # Install chaoscontrol from GitHub per requirements.txt. + os.environ.setdefault("CHAOSCONTROL_ROOT", Hyperparameters.chaoscontrol_root) + from chaoscontrol.public.engine_entry import ( + init_arm_topology, + build_arm_config, + run_arm_submission, + ) + + # --- Distributed init --- + # torchrun sets RANK, LOCAL_RANK, WORLD_SIZE in the environment. + # dist.init_process_group reads them automatically. + backend = "nccl" if torch.cuda.is_available() else "gloo" + dist.init_process_group(backend=backend) + rank = dist.get_rank() + world_size = dist.get_world_size() + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + if torch.cuda.is_available(): + torch.cuda.set_device(local_rank) + + # --- Role routing --- + # On 8 GPUs: GPU0-5 are train ranks; GPU6 is the dedicated packet-serving + # rank (low-latency episodic residual production); GPU7 is the dedicated + # maintenance rank (oracle scoring, slot commits). The train ranks never + # wait on the memory ranks — if no fresh packet is ready, the trunk + # proceeds with a zero-residual failsafe. + # On 4 GPUs: GPU3 shares both memory roles (smoke/profile topology). + role = init_arm_topology(rank=rank, world_size=world_size) + # The runner sizes ARM buffers and token-budget computations using + # world_size_override. 
GPU6 (packet-serving) and GPU7 (maintenance) are
+    # not train ranks — passing the full world_size=8 causes the runner to
+    # over-allocate by a factor of 8/6, pushing peak VRAM past 80 GiB.
+    # Pass num_train_ranks (6 for 8-GPU, 3 for 4-GPU, 1 for 1-GPU) so
+    # allocations are sized for the ranks that actually do gradient steps.
+    if world_size <= 1:
+        num_train_ranks = world_size
+    else:
+        num_train_ranks = world_size - (2 if role.split_memory_ranks else 1)
+    if rank == 0:
+        print(
+            f"[semanticengine] topology: world={world_size} "
+            f"train_ranks={num_train_ranks} "
+            f"packet_rank={role.packet_rank} maintenance_rank={role.maintenance_rank} "
+            f"split={role.split_memory_ranks}",
+            flush=True,
+        )
+
+    # --- Build config ---
+    # build_arm_config maps the Hyperparameters class → the config dict that
+    # runner_fast_path.run_condition() expects. It merges the four exp26 lock
+    # dicts (artifact_size, fast_slow, crct, replay_eviction) and applies the
+    # telemetry-tuned overrides.
+    config = build_arm_config(Hyperparameters)
+    config["stop_margin_seconds"] = float(Hyperparameters.stop_margin_seconds)
+    if Hyperparameters.max_steps is not None:
+        config["max_steps"] = int(Hyperparameters.max_steps)
+    if Hyperparameters.train_only:
+        config["calc_types"] = []
+        config["headline_calc_type"] = None
+    if Hyperparameters.eval_only:
+        if not Hyperparameters.checkpoint_path:
+            raise ValueError("EVAL_ONLY=1 requires CHECKPOINT_PATH")
+        config["eval_only"] = True
+        config["checkpoint_path"] = Hyperparameters.checkpoint_path
+        config["calc_types"] = ["packet_online_cache"]
+        config["headline_calc_type"] = "packet_online_cache"
+        config.setdefault("calc_type_configs", {})
+        packet_cfg = config["calc_type_configs"].setdefault("packet_online_cache", {})
+        packet_cfg["batch_docs"] = int(Hyperparameters.packet_eval_batch_docs)
+        packet_cfg["batch_token_budget"] = int(
+            Hyperparameters.packet_eval_batch_token_budget
+        )
+        packet_cfg["write_tokens_per_chunk"] = int(
+            Hyperparameters.packet_eval_write_tokens_per_chunk
+        )
+        packet_cfg["controller_read_enabled"] = bool(
+            Hyperparameters.packet_eval_controller_read_enabled
+        )
+        packet_cfg["controller_topk_k"] = int(
+            Hyperparameters.packet_eval_controller_topk_k
+        )
+        packet_cfg["controller_score_mode"] = str(
+            Hyperparameters.packet_eval_controller_score_mode
+        )
+    if Hyperparameters.eval_max_docs > 0:
+        config.setdefault("calc_type_configs", {})
+        config["calc_type_configs"].setdefault("packet_online_cache", {})
+        config["calc_type_configs"]["packet_online_cache"]["max_docs"] = int(
+            Hyperparameters.eval_max_docs
+        )
+
+    # --- Training + eval ---
+    # run_arm_submission delegates to run_condition() in runner_fast_path.py,
+    # the production ARM training + prequential eval loop (~14,850 lines).
+    #
+    # Training: trunk updates weights; memory/controller stack generates
+    # evidence and maintains the cache. Wallclock is checked at the top of
+    # each step — the loop always exits at a complete-step boundary.
+    #
+    # Eval: same memory substrate is live, but the run is prequential.
+    # Score each chunk under the current state first, accumulate loss/BPB,
+    # then optionally update from already-scored tokens. The trunk never
+    # sees validation tokens before they are scored. Enforced at the Python
+    # level: packet_online_cache raises if the slot count changes between
+    # cue read and score accumulation.
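+    # Shape of that contract, as a minimal sketch (hypothetical names; the
+    # real loop lives in runner_fast_path and is not reproduced here):
+    #
+    #   for chunk in val_chunks:
+    #       slots_before = cache.slot_count()
+    #       residual = cache.read(cue(chunk))           # read under current state
+    #       loss_sum += trunk.score(chunk, residual)    # score first
+    #       assert cache.slot_count() == slots_before   # no writes while scoring
+    #       cache.write(evidence(chunk),                # then commit already-scored
+    #                   budget=write_tokens_per_chunk)  # evidence for future chunks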
+ t_start = time.perf_counter() + result = run_arm_submission( + config, + data_path=Hyperparameters.data_path, + sp_model_path=Hyperparameters.tokenizer_path, + budget_seconds=Hyperparameters.budget_seconds, + output_json=Hyperparameters.output_json, + output_ckpt=Hyperparameters.output_ckpt, + val_cache_dir=Hyperparameters.val_cache_dir, + world_size_override=num_train_ranks, + ) + elapsed = time.perf_counter() - t_start + + # --- Score summary (rank 0 only) --- + if rank == 0: + eval_r = result.get("eval") or {} + calc_types = eval_r.get("calc_types") or {} + poc = calc_types.get("packet_online_cache") or {} + train_r = result.get("train") or {} + print( + f"\n[semanticengine] === SCORE SUMMARY ===\n" + f" val_bpb: {poc.get('bpb', float('nan')):.6f}\n" + f" val_loss: {poc.get('loss', float('nan')):.6f}\n" + f" docs_scored: {poc.get('docs_scored', 0)}\n" + f" train_steps: {train_r.get('steps', 0)}\n" + f" train_elapsed_s: {train_r.get('elapsed_s', 0.0):.1f}\n" + f" total_elapsed_s: {elapsed:.1f}\n" + f" artifact_bytes: {result.get('artifact_bytes', 'N/A')}\n" + f" code_bytes: {result.get('code_bytes', 'N/A')}\n" + f"[semanticengine] === END SUMMARY ===", + flush=True, + ) + if Hyperparameters.output_json: + Path(Hyperparameters.output_json).write_text(json.dumps(result, indent=2, default=str)) + + +if __name__ == "__main__": + main() diff --git a/tests/submission/__init__.py b/tests/submission/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/submission/test_train_gpt_hyperparams.py b/tests/submission/test_train_gpt_hyperparams.py new file mode 100644 index 0000000000..d24c7d8824 --- /dev/null +++ b/tests/submission/test_train_gpt_hyperparams.py @@ -0,0 +1,119 @@ +"""Tests for train_gpt.py Section 1 — Hyperparameters class.""" +from __future__ import annotations + +import importlib.util +import os +from pathlib import Path + +TRAIN_GPT_PATH = ( + Path(__file__).parent.parent.parent + / "records" + / "track_10min_16mb" + / "2026-05-01_SemanticEngine_CareSSM" + / "train_gpt.py" +) + + +def _load_module(): + spec = importlib.util.spec_from_file_location("train_gpt", TRAIN_GPT_PATH) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def test_vocab_size(): + mod = _load_module() + assert mod.Hyperparameters.vocab_size == 16384 + + +def test_model_dim_default(): + # Ensure MODEL_DIM env var is not set so we get the default. 
+ env_backup = os.environ.pop("MODEL_DIM", None) + try: + mod = _load_module() + assert mod.Hyperparameters.model_dim == 384 + finally: + if env_backup is not None: + os.environ["MODEL_DIM"] = env_backup + + +def test_batch_size_default_matches_successful_h100_shape(): + env_backup = os.environ.pop("BATCH_SIZE", None) + try: + mod = _load_module() + assert mod.Hyperparameters.batch_size == 800 + finally: + if env_backup is not None: + os.environ["BATCH_SIZE"] = env_backup + + +def test_crct_memory_write_tokens_per_step(): + env_backup = os.environ.pop("CRCT_MEMORY_WRITE_TOKENS_PER_STEP", None) + try: + mod = _load_module() + assert mod.Hyperparameters.crct_memory_write_tokens_per_step == 256 + finally: + if env_backup is not None: + os.environ["CRCT_MEMORY_WRITE_TOKENS_PER_STEP"] = env_backup + + +def test_online_episodic_write_tokens_per_chunk(): + env_backup = os.environ.pop("ONLINE_EPISODIC_WRITE_TOKENS_PER_CHUNK", None) + try: + mod = _load_module() + assert mod.Hyperparameters.online_episodic_write_tokens_per_chunk == 64 + finally: + if env_backup is not None: + os.environ["ONLINE_EPISODIC_WRITE_TOKENS_PER_CHUNK"] = env_backup + + +def test_packet_eval_defaults_are_batched_and_live_writing(): + backups = { + key: os.environ.pop(key, None) + for key in ( + "PACKET_EVAL_BATCH_DOCS", + "PACKET_EVAL_BATCH_TOKEN_BUDGET", + "PACKET_EVAL_WRITE_TOKENS_PER_CHUNK", + "PACKET_EVAL_CONTROLLER_READ", + "PACKET_EVAL_CONTROLLER_TOPK_K", + "PACKET_EVAL_CONTROLLER_SCORE_MODE", + "STOP_MARGIN_SECONDS", + ) + } + try: + mod = _load_module() + assert mod.Hyperparameters.packet_eval_batch_docs == 48 + assert mod.Hyperparameters.packet_eval_batch_token_budget == 49152 + assert mod.Hyperparameters.packet_eval_write_tokens_per_chunk == 1 + assert mod.Hyperparameters.packet_eval_controller_read_enabled is False + assert mod.Hyperparameters.packet_eval_controller_topk_k == 16 + assert ( + mod.Hyperparameters.packet_eval_controller_score_mode + == "cosine_survival" + ) + assert mod.Hyperparameters.stop_margin_seconds == 32.0 + finally: + for key, value in backups.items(): + if value is not None: + os.environ[key] = value + + +def test_seed_env_override(monkeypatch): + monkeypatch.setenv("SEED", "99") + mod = _load_module() + assert mod.Hyperparameters.seed == 99 + + +def test_log_a_beta_coupling_env_one(monkeypatch): + monkeypatch.setenv("LOG_A_BETA_COUPLING", "1") + mod = _load_module() + assert mod.Hyperparameters.log_a_beta_coupling is True + + +def test_score_summary_keys_present(): + """Regression: the summary block must contain val_bpb and artifact_bytes.""" + source = TRAIN_GPT_PATH.read_text() + assert "val_bpb" in source + assert "artifact_bytes" in source + assert "packet_online_cache" in source + assert "score each chunk" in source.lower() or "score-before-write" in source.lower() From 3df47e0f92437b4479ce29fdf45dccdb1e8ea386 Mon Sep 17 00:00:00 2001 From: Ken M Date: Fri, 1 May 2026 12:26:23 -0400 Subject: [PATCH 4/4] Clarify SemanticEngine artifact accounting --- .../2026-05-01_SemanticEngine_CareSSM/README.md | 10 ++++++++++ .../submission.json | 6 +++++- tests/submission/test_train_gpt_hyperparams.py | 17 +++++++++++++++-- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/README.md b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/README.md index 752325ec4b..cddfb8725f 100644 --- a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/README.md +++ 
b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/README.md
@@ -2,8 +2,13 @@
 
 **Track:** track_10min_16mb
 **val_bpb:** 1.642868 (3-seed mean, std 0.023340)
+**artifact:** 13,554,222 / 16,000,000 bytes (estimated contest-counted int6/LZMA payload, including 500 KB overhead)
 **eval:** full 50k FineWeb validation docs, legal prequential packet-online cache
 
+The raw bf16 runtime weight mirror is 44,600,064 bytes. That is not the submitted
+artifact size; the submitted artifact uses the same int6/LZMA artifact accounting
+used by the dim-384 headroom check.
+
 ## Architecture
 
 **SemanticEngine** is a CareSSM trunk with live episodic memory. Unlike the transformer submissions, this is a pure SSM architecture whose memory substrate is active during both training and prequential eval.
@@ -44,6 +49,11 @@ During eval, the same memory substrate is live, but the run is **prequential**: 
 
 All evals scored the full 50,000-doc validation set: 42,216,034 scored tokens and 151,080,645 raw bytes per seed. Each eval performed 3,348 episodic reads and 3,348 score-first episodic writes.
 
+Artifact accounting: the public `artifact_bytes_estimate` is the contest-counted
+compressed artifact estimate, `13,554,222` bytes against the decimal `16,000,000`
+byte cap. The larger `raw_bf16_weight_bytes` value in `submission.json` is only
+the uncompressed runtime state size used by the shared-memory weight mirror.
+
 ## Reproduction
 
 ```bash
diff --git a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/submission.json b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/submission.json
index 24810ac4d8..3c4ba40101 100644
--- a/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/submission.json
+++ b/records/track_10min_16mb/2026-05-01_SemanticEngine_CareSSM/submission.json
@@ -53,7 +53,11 @@
   "train_steps_mean": 1690.67,
   "train_time_s_mean": 594.78,
   "eval_time_s_mean": 353.77,
-  "artifact_bytes_estimate": 44600064,
+  "artifact_bytes_estimate": 13554222,
+  "artifact_bytes_limit": 16000000,
+  "artifact_margin_bytes_estimate": 2445778,
+  "raw_bf16_weight_bytes": 44600064,
+  "artifact_accounting_note": "artifact_bytes_estimate is the contest-counted int6/LZMA compressed weight estimate plus 500KB overhead. 
raw_bf16_weight_bytes is the uncompressed runtime weight mirror and is not the submitted artifact.", "artifact_submit_valid": true, "hardware": "8xH100 80GB", "eval_method": "packet_online_cache_prequential_full_50k", diff --git a/tests/submission/test_train_gpt_hyperparams.py b/tests/submission/test_train_gpt_hyperparams.py index d24c7d8824..1782052025 100644 --- a/tests/submission/test_train_gpt_hyperparams.py +++ b/tests/submission/test_train_gpt_hyperparams.py @@ -2,16 +2,20 @@ from __future__ import annotations import importlib.util +import json import os from pathlib import Path -TRAIN_GPT_PATH = ( +SUBMISSION_DIR = ( Path(__file__).parent.parent.parent / "records" / "track_10min_16mb" / "2026-05-01_SemanticEngine_CareSSM" - / "train_gpt.py" ) +TRAIN_GPT_PATH = ( + SUBMISSION_DIR / "train_gpt.py" +) +SUBMISSION_JSON_PATH = SUBMISSION_DIR / "submission.json" def _load_module(): @@ -117,3 +121,12 @@ def test_score_summary_keys_present(): assert "artifact_bytes" in source assert "packet_online_cache" in source assert "score each chunk" in source.lower() or "score-before-write" in source.lower() + + +def test_submission_artifact_accounting_is_not_raw_bf16_size(): + """Public artifact field must be the under-cap compressed payload estimate.""" + data = json.loads(SUBMISSION_JSON_PATH.read_text()) + assert data["artifact_submit_valid"] is True + assert data["artifact_bytes_estimate"] < data["artifact_bytes_limit"] + assert data["raw_bf16_weight_bytes"] > data["artifact_bytes_limit"] + assert "int6/LZMA" in data["artifact_accounting_note"]
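+    # Derived consistency check (an added sanity assertion, not a contest
+    # rule): the advertised margin must equal limit minus estimate.
+    assert (
+        data["artifact_bytes_limit"] - data["artifact_bytes_estimate"]
+        == data["artifact_margin_bytes_estimate"]
+    )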