2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -7,7 +7,7 @@
},
"ghcr.io/devcontainers/features/github-cli:1": {}
},
"postCreateCommand": "sudo apt-get update && sudo apt-get install -y cmake verilator python3-pip && pip3 install numpy",
"postCreateCommand": "sudo apt-get update && sudo apt-get install -y cmake verilator python3-pip && python3 -m pip install --upgrade pip && python3 -m pip install -r requirements-llm.txt",
"postStartCommand": "cmake --version && verilator --version",
"customizations": {
"vscode": {
38 changes: 26 additions & 12 deletions README.md
@@ -64,27 +64,38 @@ tiny-npu/
└── .github/workflows/ # CI
```

## Minimal real-weights LLM demo (interactive + first-token compare)
## Minimal real-weights LLM demo (interactive + reference vs simulated decode)

This repository includes a minimal end-to-end path that uses **real HuggingFace GPT-2-family weights** (default: `sshleifer/tiny-gpt2`) and reports:
- **reference generation** from full HF model
- **simulated token** from an INT8 projection-only path (real hidden-state + real `lm_head`, first-token only)
- **simulated generation** from an INT8 projection decode path (multi-token software approximation using real hidden states + real `lm_head`; a sketch of the projection step follows below)
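
For intuition, here is a minimal sketch of what the INT8 projection step can look like. This is a hypothetical helper written for this README, not the repository's `_sim_logits_from_hidden`; the actual scaling/packing scheme is whatever `demo_data/quant_manifest.json` records.

```python
import numpy as np

def int8_projection_logits(hidden: np.ndarray, lm_head_w: np.ndarray) -> np.ndarray:
    """Hypothetical sketch: symmetric per-tensor INT8 quantize, integer matmul, dequantize."""
    h_scale = max(float(np.max(np.abs(hidden))) / 127.0, 1e-12)
    w_scale = max(float(np.max(np.abs(lm_head_w))) / 127.0, 1e-12)
    h_q = np.clip(np.round(hidden / h_scale), -127, 127).astype(np.int8)
    w_q = np.clip(np.round(lm_head_w / w_scale), -127, 127).astype(np.int8)
    # Accumulate in int32 (as NPU MAC arrays typically do), then rescale to float logits.
    acc = w_q.astype(np.int32) @ h_q.astype(np.int32)  # (vocab, hidden) @ (hidden,) -> (vocab,)
    return acc.astype(np.float32) * (h_scale * w_scale)
```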

### 1) Prepare artifacts in `demo_data`
### 1) Install runtime dependencies (one-time)

```bash
bash scripts/setup_llm_env.sh
source .venv/bin/activate
```

(Or manually: `python -m pip install -r requirements-llm.txt`.)

### 2) Prepare artifacts in `demo_data`

```bash
python -m python.run_tiny_llm_sim --prepare --prompt "hello"
```

This runs export + quantization/packing. Pack assumptions are recorded in `demo_data/quant_manifest.json`.
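
To inspect what was recorded (key names depend on the export step, so this simply pretty-prints whatever is there):

```python
import json
from pathlib import Path

manifest = json.loads(Path("demo_data/quant_manifest.json").read_text())
print(json.dumps(manifest, indent=2))
```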

### 2) One-shot run (JSON output)
### 3) One-shot run (JSON output)

```bash
python -m python.run_tiny_llm_sim \
--prompt "Hello tiny NPU" \
--max-new-tokens 16 \
--temperature 0.9 --top-k 40 --top-p 0.95 --seed 42
--temperature 0.9 --top-k 40 --top-p 0.95 --seed 42 \
--sim-max-new-tokens 16 \
--sim-temperature 0.0 --sim-top-k 0 --sim-top-p 1.0 --sim-seed 123
```
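
The command prints a single JSON report to stdout. A sketch of consuming it programmatically, using field names from the `result` dict in `python/run_tiny_llm_sim.py` (this assumes the JSON report is the only stdout output):

```python
import json
import subprocess

# Run the one-shot demo and parse its JSON report from stdout.
proc = subprocess.run(
    ["python", "-m", "python.run_tiny_llm_sim", "--prompt", "Hello tiny NPU"],
    capture_output=True, text=True, check=True,
)
result = json.loads(proc.stdout)

print("first-token match:", result["reference"]["token_id"] == result["simulated"]["token_id"])
print("ref:", result["reference"]["generated_text"])
print("sim:", result["simulated"]["generated_text"])
```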

Optional smoke check integration (if Verilator build exists):
@@ -93,21 +104,24 @@

```bash
python -m python.run_tiny_llm_sim --prompt "Hello tiny NPU" --run-verilator-smoke
```

### 3) Interactive mode
### 4) Interactive mode

```bash
python -m python.run_tiny_llm_sim --interactive --max-new-tokens 16 --temperature 0.9 --top-k 40 --top-p 0.95 --seed 42
python -m python.run_tiny_llm_sim \
--interactive \
--max-new-tokens 16 --temperature 0.9 --top-k 40 --top-p 0.95 --seed 42 \
--sim-max-new-tokens 16 --sim-temperature 0.0 --sim-top-k 0 --sim-top-p 1.0 --sim-seed 123
```

### 4) Smoke/regression check
### 5) Smoke/regression check

```bash
python -m unittest python/tests/test_tiny_llm_smoke.py
```

> Note: this smoke test auto-skips when model dependencies/download are unavailable in the environment.

### 5) First-token evaluation harness (reference vs simulated)
### 6) First-token evaluation harness (reference vs simulated)

```bash
python3 -m python.eval_first_token --prepare
```

@@ -118,7 +132,7 @@

This gives you a prompt-set match rate so improvements can be measured over time.
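
Conceptually, the match rate is just the fraction of prompts whose simulated first token equals the reference first token. A sketch of the aggregation (hypothetical helper; field names follow the `run_tiny_llm_sim` JSON report):

```python
def first_token_match_rate(results: list[dict]) -> float:
    # results: one JSON report per prompt, shaped like run_tiny_llm_sim output.
    hits = sum(r["reference"]["token_id"] == r["simulated"]["token_id"] for r in results)
    return hits / max(len(results), 1)
```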

### 6) Prompt-set variation check (interactive quality)
### 7) Prompt-set variation check (interactive quality)

```bash
python3 -m python.eval_prompt_variation
```

@@ -132,8 +146,8 @@

This reports unique first-token count and variation ratio across a prompt set.
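
Assuming the variation ratio is unique first tokens divided by total prompts, a minimal sketch:

```python
def variation_ratio(first_token_ids: list[int]) -> float:
    # 1.0 means every prompt produced a distinct first token; near 0 means collapse.
    return len(set(first_token_ids)) / max(len(first_token_ids), 1)
```
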
### Current limitations

- RTL path is **not yet wired** for full GPT-2 token generation.
- `simulated` remains first-token projection-only INT8 emulation, not full block-by-block RTL execution.
- Multi-token autoregressive decode and KV-cache handling are not yet implemented in hardware flow.
- `simulated` uses INT8 projection decode with hidden states from the full HF model at each step; this is **not** full block-by-block RTL execution.
- KV-cache behavior and true hardware-timed autoregressive decode are not yet implemented in the RTL path.

## Contributing

144 changes: 134 additions & 10 deletions python/run_tiny_llm_sim.py
@@ -3,7 +3,7 @@
Capabilities:
- Export + quantize + pack real tiny GPT-2 weights (optional --prepare)
- Reference generation via full HF model (1+ tokens)
- Simulated token via INT8 projection-only path (first generated token)
- Simulated decode via INT8 projection-only path (multi-token software approximation)
- Optional Verilator smoke execution
- Optional interactive REPL mode
"""
@@ -111,22 +111,30 @@ def _run_reference_generation(
top_p: float,
repetition_penalty: float,
seed: int,
) -> tuple[list[int], str, np.ndarray, list[int]]:
) -> tuple[list[int], str, np.ndarray, list[int], list[np.ndarray]]:
"""Reference generation via full HF model.

Returns generated tokens, text, first hidden state, prompt token IDs,
and a list of hidden states (one per generated token position).
"""
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"]
seen_ids = input_ids[0].tolist()
rng = np.random.default_rng(seed)

generated: list[int] = []
first_hidden: np.ndarray | None = None
hidden_states_list: list[np.ndarray] = []

for _ in range(max_new_tokens):
with torch.no_grad():
out = model(input_ids=input_ids, output_hidden_states=True, return_dict=True)

logits = out.logits[0, -1, :].detach().cpu().numpy()
last_hidden = out.hidden_states[-1][0, -1, :].detach().cpu().numpy()
hidden_states_list.append(last_hidden)
if first_hidden is None:
first_hidden = out.hidden_states[-1][0, -1, :].detach().cpu().numpy()
first_hidden = last_hidden

adjusted = _apply_repetition_penalty(logits, seen_ids + generated, repetition_penalty)
next_id = _sample_from_logits(
@@ -142,7 +150,63 @@ def _run_reference_generation(
input_ids = torch.cat([input_ids, next_tok], dim=1)

text = tokenizer.decode(generated, clean_up_tokenization_spaces=False)
return generated, text, first_hidden, seen_ids
return generated, text, first_hidden, seen_ids, hidden_states_list


def _run_simulated_generation(
model: Any,
tokenizer: Any,
torch: Any,
prompt: str,
max_new_tokens: int,
lm_head_w: np.ndarray,
temperature: float,
top_k: int,
top_p: float,
repetition_penalty: float,
seed: int,
hidden_states: list[np.ndarray] | None = None,
) -> tuple[list[int], str, list[int]]:
"""Multi-token simulated decode using hybrid approach.

Option C (hybrid simulated decode):
- Token 0: INT8 projection from initial hidden state (original behavior)
- Tokens 1+: use cached reference hidden states from prior HF forward steps

If hidden_states is provided, cached states are used for every position they
cover (token 0's cached state is the initial hidden state). Otherwise, each
step falls back to computing the hidden state via a full HF forward pass.
"""
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"]
seen_ids = input_ids[0].tolist()
rng = np.random.default_rng(seed)

generated: list[int] = []
for i in range(max_new_tokens):
# Use the cached reference hidden state when available; otherwise fall back to a full HF forward pass
if hidden_states is not None and i < len(hidden_states):
last_hidden = hidden_states[i]
else:
with torch.no_grad():
out = model(input_ids=input_ids, output_hidden_states=True, return_dict=True)
last_hidden = out.hidden_states[-1][0, -1, :].detach().cpu().numpy()

sim_logits = _sim_logits_from_hidden(last_hidden, lm_head_w)
adjusted = _apply_repetition_penalty(sim_logits, seen_ids + generated, repetition_penalty)
next_id = _sample_from_logits(
adjusted,
rng=rng,
temperature=temperature,
top_k=top_k,
top_p=top_p,
)

generated.append(next_id)
next_tok = torch.tensor([[next_id]], dtype=input_ids.dtype, device=input_ids.device)
input_ids = torch.cat([input_ids, next_tok], dim=1)

text = tokenizer.decode(generated, clean_up_tokenization_spaces=False)
return generated, text, seen_ids


def run_demo(
@@ -157,6 +221,12 @@ def run_demo(
top_p: float,
repetition_penalty: float,
seed: int,
sim_max_new_tokens: int,
sim_temperature: float,
sim_top_k: int,
sim_top_p: float,
sim_repetition_penalty: float,
sim_seed: int,
) -> dict:
try:
import torch
@@ -182,7 +252,7 @@
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

gen_ids, gen_text, first_hidden, prompt_token_ids = _run_reference_generation(
gen_ids, gen_text, _first_hidden, prompt_token_ids, ref_hidden_states = _run_reference_generation(
model=model,
tokenizer=tokenizer,
torch=torch,
@@ -196,10 +266,30 @@
)

# Simulated path (INT8 projection from real hidden state + real lm_head)
# Use cached reference hidden states for hybrid simulated decode
lm_head = np.load(datadir / "lm_head.npy")
sim_logits = _sim_logits_from_hidden(first_hidden, lm_head)
sim_token_id = int(np.argmax(sim_logits))
sim_token = tokenizer.decode([sim_token_id], clean_up_tokenization_spaces=False)
sim_gen_ids, sim_gen_text, sim_prompt_token_ids = _run_simulated_generation(
model=model,
tokenizer=tokenizer,
torch=torch,
prompt=prompt,
max_new_tokens=sim_max_new_tokens,
lm_head_w=lm_head,
temperature=sim_temperature,
top_k=sim_top_k,
top_p=sim_top_p,
repetition_penalty=sim_repetition_penalty,
seed=sim_seed,
hidden_states=ref_hidden_states,
)

# Backward-compatible first-token fields
sim_token_id = int(sim_gen_ids[0]) if sim_gen_ids else None
sim_token = (
tokenizer.decode([sim_token_id], clean_up_tokenization_spaces=False)
if sim_token_id is not None
else ""
)

result = {
"model": model_name,
@@ -213,6 +303,14 @@
"repetition_penalty": repetition_penalty,
"seed": seed,
},
"sim_decoding": {
"max_new_tokens": sim_max_new_tokens,
"temperature": sim_temperature,
"top_k": sim_top_k,
"top_p": sim_top_p,
"repetition_penalty": sim_repetition_penalty,
"seed": sim_seed,
},
"reference": {
"token_id": int(gen_ids[0]),
"token": tokenizer.decode([int(gen_ids[0])], clean_up_tokenization_spaces=False),
@@ -223,7 +321,14 @@
"simulated": {
"token_id": sim_token_id,
"token": sim_token,
"note": "INT8 simulated projection only (first-token projection from real hidden state + real lm_head)",
"prompt_token_ids": sim_prompt_token_ids,
"generated_token_ids": [int(x) for x in sim_gen_ids],
"generated_text": sim_gen_text,
"note": (
"Hybrid simulated decode using reference hidden states with INT8 projection; "
"token 0 uses initial hidden state, tokens 2+ use cached reference hidden states; "
"this is a software approximation and not full RTL/hardware autoregressive execution"
),
},
}

@@ -269,6 +374,7 @@ def _run_interactive(args: argparse.Namespace) -> None:
# For interactive UX, avoid reusing the exact same RNG stream every turn.
# If seed>=0, derive per-turn deterministic seeds; if seed<0, randomize.
run_seed = (args.seed + turn) if args.seed >= 0 else int(np.random.randint(0, 2**31 - 1))
sim_run_seed = (args.sim_seed + turn) if args.sim_seed >= 0 else int(np.random.randint(0, 2**31 - 1))

result = run_demo(
prompt=prompt,
@@ -282,10 +388,16 @@
top_p=args.top_p,
repetition_penalty=args.repetition_penalty,
seed=run_seed,
sim_max_new_tokens=args.sim_max_new_tokens,
sim_temperature=args.sim_temperature,
sim_top_k=args.sim_top_k,
sim_top_p=args.sim_top_p,
sim_repetition_penalty=args.sim_repetition_penalty,
sim_seed=sim_run_seed,
)

print(f"ref> {result['reference']['generated_text']!r}")
print(f"sim> {result['simulated']['token']!r} (first-token projection-only)")
print(f"sim> {result['simulated']['generated_text']!r} (projection-decode simulation)")
turn += 1


@@ -302,6 +414,12 @@ def main() -> None:
parser.add_argument("--top-p", type=float, default=0.95, help="Top-p nucleus sampling cutoff")
parser.add_argument("--repetition-penalty", type=float, default=1.1, help="Penalty >1.0 discourages repeats")
parser.add_argument("--seed", type=int, default=42, help="Base random seed (interactive mode auto-increments per turn; use -1 for random each turn)")
parser.add_argument("--sim-max-new-tokens", type=int, default=12, help="Simulated generation length")
parser.add_argument("--sim-temperature", type=float, default=0.0, help="Simulated sampling temperature (0=greedy)")
parser.add_argument("--sim-top-k", type=int, default=0, help="Simulated top-k sampling cutoff (0 disables)")
parser.add_argument("--sim-top-p", type=float, default=1.0, help="Simulated top-p nucleus sampling cutoff")
parser.add_argument("--sim-repetition-penalty", type=float, default=1.0, help="Simulated penalty >1.0 discourages repeats")
parser.add_argument("--sim-seed", type=int, default=123, help="Simulated random seed (interactive mode auto-increments per turn; use -1 for random each turn)")
parser.add_argument(
"--run-verilator-smoke",
action="store_true",
@@ -329,6 +447,12 @@
top_p=args.top_p,
repetition_penalty=args.repetition_penalty,
seed=args.seed,
sim_max_new_tokens=args.sim_max_new_tokens,
sim_temperature=args.sim_temperature,
sim_top_k=args.sim_top_k,
sim_top_p=args.sim_top_p,
sim_repetition_penalty=args.sim_repetition_penalty,
sim_seed=args.sim_seed,
)
print(json.dumps(result, indent=2))

7 changes: 7 additions & 0 deletions requirements-llm.txt
@@ -0,0 +1,7 @@
# Tiny-NPU LLM demo runtime dependencies
# Pinned to versions compatible with Python 3.11 in Codespaces/macOS.
numpy>=1.26,<2.0
torch>=2.2,<2.6
transformers>=4.41,<4.50
huggingface-hub<1.0
tokenizers<0.20
19 changes: 19 additions & 0 deletions scripts/setup_llm_env.sh
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
set -euo pipefail

PYTHON_BIN=${PYTHON_BIN:-python3}
VENV_DIR=${VENV_DIR:-.venv}

if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
echo "ERROR: python binary '$PYTHON_BIN' not found" >&2
exit 1
fi

"$PYTHON_BIN" -m venv "$VENV_DIR"
source "$VENV_DIR/bin/activate"

python -m pip install --upgrade pip setuptools wheel
python -m pip install -r requirements-llm.txt

echo "LLM environment ready in $VENV_DIR"
echo "Activate with: source $VENV_DIR/bin/activate"