diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..35931d5
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+ "recommendations": [
+ "mshr-h.veriloghdl",
+ "ms-python.python",
+ "yzhang.markdown-all-in-one"
+ ]
+}
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..c573d2c
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,113 @@
+{
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Run full test: matadd",
+ "type": "node",
+ "request": "launch",
+ "runtimeExecutable": "bash",
+ "args": [
+ "-lc",
+ "rm -f build/gpu.v build/temp.v build/sim.vvp && ulimit -n 4096 && make test_matadd"
+ ],
+ "cwd": "${workspaceFolder}",
+ "console": "integratedTerminal",
+ "internalConsoleOptions": "neverOpen",
+ "preLaunchTask": "Setup: create build directory"
+ },
+ {
+ "name": "Run full test: matmul",
+ "type": "node",
+ "request": "launch",
+ "runtimeExecutable": "bash",
+ "args": [
+ "-lc",
+ "rm -f build/gpu.v build/temp.v build/sim.vvp && ulimit -n 4096 && make test_matmul"
+ ],
+ "cwd": "${workspaceFolder}",
+ "console": "integratedTerminal",
+ "internalConsoleOptions": "neverOpen",
+ "preLaunchTask": "Setup: create build directory"
+ },
+ {
+ "name": "Attach: cocotb debugpy (5678)",
+ "type": "python",
+ "request": "attach",
+ "connect": {
+ "host": "127.0.0.1",
+ "port": 5678
+ },
+ "justMyCode": false,
+ "pathMappings": [
+ {
+ "localRoot": "${workspaceFolder}",
+ "remoteRoot": "${workspaceFolder}"
+ }
+ ]
+ },
+ {
+ "name": "Debug Python cocotb: matadd",
+ "type": "python",
+ "request": "attach",
+ "connect": {
+ "host": "127.0.0.1",
+ "port": 5678
+ },
+ "justMyCode": false,
+ "pathMappings": [
+ {
+ "localRoot": "${workspaceFolder}",
+ "remoteRoot": "${workspaceFolder}"
+ }
+ ],
+ "preLaunchTask": "Debug: matadd (debugpy wait)"
+ },
+ {
+ "name": "Debug Python cocotb: matmul",
+ "type": "python",
+ "request": "attach",
+ "connect": {
+ "host": "127.0.0.1",
+ "port": 5678
+ },
+ "justMyCode": false,
+ "pathMappings": [
+ {
+ "localRoot": "${workspaceFolder}",
+ "remoteRoot": "${workspaceFolder}"
+ }
+ ],
+ "preLaunchTask": "Debug: matmul (debugpy wait)"
+ },
+ {
+ "name": "Debug Python cocotb: selected module",
+ "type": "python",
+ "request": "attach",
+ "connect": {
+ "host": "127.0.0.1",
+ "port": 5678
+ },
+ "justMyCode": false,
+ "pathMappings": [
+ {
+ "localRoot": "${workspaceFolder}",
+ "remoteRoot": "${workspaceFolder}"
+ }
+ ],
+ "preLaunchTask": "Debug: selected cocotb module (debugpy wait)"
+ },
+ {
+ "name": "Run helper: latest test log path",
+ "type": "node",
+ "request": "launch",
+ "runtimeExecutable": "bash",
+ "args": [
+ "-lc",
+ "python -c \"from pathlib import Path; logs=sorted(Path('test/logs').glob('log_*.txt'), key=lambda p: p.stat().st_mtime); print(logs[-1].resolve() if logs else 'No log files found')\""
+ ],
+ "cwd": "${workspaceFolder}",
+ "console": "integratedTerminal",
+ "internalConsoleOptions": "neverOpen"
+ }
+ ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..271c14f
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,14 @@
+{
+ "terminal.integrated.cwd": "${workspaceFolder}",
+ "files.associations": {
+ "*.sv": "systemverilog"
+ },
+ "search.exclude": {
+ "build": true,
+ "test/logs": true,
+ "**/__pycache__": true
+ },
+ "files.exclude": {
+ "**/__pycache__": true
+ }
+}
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 0000000..d46594e
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,375 @@
+{
+ "version": "2.0.0",
+ "tasks": [
+ {
+ "label": "Setup: create build directory",
+ "type": "shell",
+ "command": "mkdir -p build",
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": []
+ },
+ {
+ "label": "Tools: print tool versions",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "echo '--- PATH ---' && printf '%s\\n' \"$PATH\" && echo && echo '--- cocotb ---' && cocotb-config --version && echo && echo '--- iverilog ---' && iverilog -V && echo && echo '--- vvp ---' && vvp -V && echo && echo '--- sv2v ---' && sv2v --version"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile full design",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "rm -f build/gpu.v build/temp.v build/sim.vvp && ulimit -n 4096 && make compile"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile alu",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_alu"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile controller",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_controller"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile core",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_core"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile dcr",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_dcr"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile decoder",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_decoder"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile dispatch",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_dispatch"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile fetcher",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_fetcher"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile gpu",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_gpu"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile lsu",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_lsu"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile pc",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_pc"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile registers",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_registers"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Build: compile scheduler",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "ulimit -n 4096 && make compile_scheduler"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Test: matadd",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "rm -f build/gpu.v build/temp.v build/sim.vvp && ulimit -n 4096 && make test_matadd"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Test: matmul",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "rm -f build/gpu.v build/temp.v build/sim.vvp && ulimit -n 4096 && make test_matmul"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "problemMatcher": []
+ },
+ {
+ "label": "Debug: matadd (debugpy wait)",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "rm -f build/gpu.v build/temp.v build/sim.vvp && ulimit -n 4096 && COCOTB_DEBUGPY=1 COCOTB_DEBUGPY_WAIT=1 COCOTB_DEBUGPY_HOST=127.0.0.1 COCOTB_DEBUGPY_PORT=5678 make test_matadd"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "isBackground": true,
+ "problemMatcher": {
+ "owner": "cocotb-debug",
+ "pattern": [
+ {
+ "regexp": ".+",
+ "file": 1,
+ "location": 1,
+ "message": 1
+ }
+ ],
+ "background": {
+ "activeOnStart": true,
+ "endsPattern": "\\[cocotb-debug\\] Waiting for debugger attach on .*"
+ }
+ }
+ },
+ {
+ "label": "Debug: matmul (debugpy wait)",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "rm -f build/gpu.v build/temp.v build/sim.vvp && ulimit -n 4096 && COCOTB_DEBUGPY=1 COCOTB_DEBUGPY_WAIT=1 COCOTB_DEBUGPY_HOST=127.0.0.1 COCOTB_DEBUGPY_PORT=5678 make test_matmul"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "isBackground": true,
+ "problemMatcher": {
+ "owner": "cocotb-debug",
+ "pattern": [
+ {
+ "regexp": ".+",
+ "file": 1,
+ "location": 1,
+ "message": 1
+ }
+ ],
+ "background": {
+ "activeOnStart": true,
+ "endsPattern": "\\[cocotb-debug\\] Waiting for debugger attach on .*"
+ }
+ }
+ },
+ {
+ "label": "Debug: selected cocotb module (debugpy wait)",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "rm -f build/gpu.v build/temp.v build/sim.vvp && ulimit -n 4096 && COCOTB_DEBUGPY=1 COCOTB_DEBUGPY_WAIT=1 COCOTB_DEBUGPY_HOST=127.0.0.1 COCOTB_DEBUGPY_PORT=5678 MODULE=${input:cocotbModule} make test_matadd"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "dependsOn": "Setup: create build directory",
+ "isBackground": true,
+ "problemMatcher": {
+ "owner": "cocotb-debug",
+ "pattern": [
+ {
+ "regexp": ".+",
+ "file": 1,
+ "location": 1,
+ "message": 1
+ }
+ ],
+ "background": {
+ "activeOnStart": true,
+ "endsPattern": "\\[cocotb-debug\\] Waiting for debugger attach on .*"
+ }
+ }
+ },
+ {
+ "label": "Logs: show latest test log path",
+ "type": "shell",
+ "command": "python",
+ "args": [
+ "-c",
+ "from pathlib import Path; logs=sorted(Path('test/logs').glob('log_*.txt'), key=lambda p: p.stat().st_mtime); print(logs[-1].resolve() if logs else 'No log files found')"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": []
+ },
+ {
+ "label": "Logs: print latest test log",
+ "type": "shell",
+ "command": "python",
+ "args": [
+ "-c",
+ "from pathlib import Path; logs=sorted(Path('test/logs').glob('log_*.txt'), key=lambda p: p.stat().st_mtime); print('No log files found' if not logs else str(logs[-1].resolve()) + '\\n\\n' + logs[-1].read_text())"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": []
+ },
+ {
+ "label": "Logs: open latest test log in VS Code",
+ "type": "shell",
+ "command": "bash",
+ "args": [
+ "-lc",
+ "latest=$(python -c \"from pathlib import Path; logs=sorted(Path('test/logs').glob('log_*.txt'), key=lambda p: p.stat().st_mtime); print(logs[-1].resolve() if logs else '')\"); if [ -z \"$latest\" ]; then echo 'No log files found'; elif command -v code >/dev/null 2>&1; then code -r \"$latest\" && echo \"Opened $latest\"; else echo 'VS Code CLI (code) not found; latest log path:' && echo \"$latest\"; fi"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": []
+ }
+ ],
+ "inputs": [
+ {
+ "id": "cocotbModule",
+ "type": "pickString",
+ "description": "Select the cocotb test module to run under debugpy",
+ "options": [
+ "test.test_matadd",
+ "test.test_matmul"
+ ],
+ "default": "test.test_matadd"
+ }
+ ]
+}
diff --git a/Makefile b/Makefile
index bc10f84..665f79e 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ export LIBPYTHON_LOC=$(shell cocotb-config --libpython)
test_%:
make compile
iverilog -o build/sim.vvp -s gpu -g2012 build/gpu.v
- MODULE=test.test_$* vvp -M $$(cocotb-config --prefix)/cocotb/libs -m libcocotbvpi_icarus build/sim.vvp
+ MODULE=$(if $(MODULE),$(MODULE),test.test_$*) vvp -M $$(cocotb-config --prefix)/cocotb/libs -m libcocotbvpi_icarus build/sim.vvp
compile:
make compile_alu
diff --git a/docs/cocotb-overview.md b/docs/cocotb-overview.md
new file mode 100644
index 0000000..f7200c6
--- /dev/null
+++ b/docs/cocotb-overview.md
@@ -0,0 +1,306 @@
+# cocotb 入门:它在仿真流程里的哪一层
+
+这份说明面向有 C/C++ 背景、但刚接触 Verilog/SystemVerilog 和 cocotb 的读者。
+
+目标只有三个:
+
+1. 解释 cocotb 在整个仿真流程里的位置
+2. 解释 RTL、仿真器、Python 测试分别处于哪一层
+3. 结合本仓库说明一次 `test_matadd` 是怎么跑起来的
+
+---
+
+## 1. 一句话先建立直觉
+
+可以先把整个系统粗略理解成下面这样:
+
+- **RTL / Verilog / SystemVerilog**:被测试的“硬件程序”
+- **仿真器**:执行这些硬件描述的引擎
+- **cocotb**:把 Python 接到仿真器上的桥
+- **Python 测试代码**:像 testbench 一样驱动 DUT、等待时钟、检查结果
+
+所以 cocotb 不是 RTL 本身,也不是仿真器本身,而是位于两者之间的**Python 测试框架**。
+
+---
+
+## 2. 分层图:谁在上,谁在下
+
+下面这张图先回答“各自在哪一层”。
+
+```mermaid
+flowchart TB
+ py["Python Test Code<br/>test/test_matadd.py<br/>test/helpers/*.py"]
+ cocotb["cocotb Runtime<br/>Coroutine Scheduler<br/>Trigger API"]
+ sim["Simulator Bridge<br/>VPI / VHPI / FLI"]
+ iverilog["Icarus Runtime<br/>iverilog + vvp"]
+ rtl["RTL Design<br/>src/*.sv<br/>Top module: gpu"]
+ model["Simulated Signals and State<br/>clk done registers buses memories"]
+
+ py --> cocotb
+ cocotb --> sim
+ sim --> iverilog
+ iverilog --> rtl
+ rtl --> model
+ model --> sim
+```
+
+从上往下看:
+
+1. 最上层是你写的 Python 测试,比如 `test_matadd.py`
+2. 它调用 cocotb 的 API,比如 `RisingEdge`、`ReadOnly`、`@cocotb.test()`
+3. cocotb 再通过仿真接口和仿真器通信
+4. 仿真器执行底下的 RTL
+5. RTL 在仿真器内部产生各种信号值、寄存器状态、总线值
+
+所以 Python 并不是“直接执行 Verilog”,而是**通过 cocotb 间接观察和驱动仿真器里的 DUT**。
+
+---
+
+## 3. 流程图:一次 cocotb 测试是怎么跑起来的
+
+这张图回答“流程怎么走”。
+
+```mermaid
+flowchart LR
+ a["SystemVerilog RTL<br/>src/*.sv"] --> b["sv2v<br/>convert to Verilog"]
+ b --> c["iverilog<br/>compile and elaborate"]
+ c --> d["vvp<br/>start simulation runtime"]
+ d --> e[cocotb loads Python test]
+ e --> f[test_matadd starts]
+ f --> g[drive reset and clock]
+ g --> h[load program and data memory]
+ h --> i[assert start and wait cycles]
+ i --> j[read dut signals and logs]
+ j --> k[assert final memory results]
+```
+
+可以把它类比成:
+
+- `sv2v + iverilog + vvp` 负责把硬件世界跑起来
+- cocotb 负责把 Python 测试挂进去
+- Python 代码负责“刺激输入 + 观察输出 + 做断言”
+
+---
+
+## 4. 本仓库里,哪些文件属于哪一层
+
+### 硬件层
+
+这些文件描述的是被测设计,也就是 GPU 本体:
+
+- `src/gpu.sv`
+- `src/core.sv`
+- `src/decoder.sv`
+- `src/alu.sv`
+- 其他 `src/*.sv`
+
+这些文件属于 RTL 层。你可以把它们理解成“硬件源码”。
+
+### 测试层
+
+这些文件属于 cocotb/Python 测试层:
+
+- `test/test_matadd.py`
+- `test/test_matmul.py`
+- `test/helpers/setup.py`
+- `test/helpers/memory.py`
+- `test/helpers/logger.py`
+- `test/helpers/format.py`
+
+这些文件不实现 GPU 硬件功能,而是在仿真时扮演“测试平台”的角色。
+
+### 构建与仿真层
+
+这些内容负责把设计编译并跑起来:
+
+- `Makefile`
+- `sv2v`
+- `iverilog`
+- `vvp`
+
+这一层更像“构建系统 + 运行时环境”。
+
+---
+
+## 5. `dut` 到底是哪一层的对象
+
+在测试入口里你会看到:
+
+```python
+@cocotb.test()
+async def test_matadd(dut):
+```
+
+这里的 `dut`:
+
+- 不是你手动 new 出来的 Python 对象
+- 也不是某个普通数据结构
+- 它是 **cocotb 传进来的 DUT 句柄**
+
+`DUT` 是 `Device Under Test` 的缩写,意思是“被测设计”。
+
+你可以把 `dut` 理解成:
+
+> Python 世界里指向 Verilog 顶层实例的一根“把手”。
+
+所以你才能写出这样的代码:
+
+```python
+await RisingEdge(dut.clk)
+while dut.done.value != 1:
+ ...
+```
+
+它的含义就是:
+
+- `dut.clk`:访问 DUT 的时钟信号
+- `dut.done`:访问 DUT 的完成信号
+- `dut.xxx`:访问 DUT 上名为 `xxx` 的端口或层次对象
+
+如果用 C++ 类比,它有点像“仿真器暴露出来的顶层对象句柄”。
+
+---
+
+## 6. 为什么 Python 代码里经常看到“字符串 <-> 整数”的转换
+
+这部分最容易让软件背景的初学者困惑。
+
+先说结论:
+
+> **Verilog 信号本质上是位向量,不是字符串。**
+> 但在 cocotb 的 Python 代码里,开发者经常把信号值临时转成字符串,方便切片、打印和观察。
+
+例如在 `test/helpers/memory.py` 中会有这种代码:
+
+```python
+mem_read_address_bits = str(self.mem_read_address.value)
+address_slice = mem_read_address_bits[i : i + self.addr_bits]
+mem_read_address.append(int(address_slice, 2))
+```
+
+这三步分别在干什么:
+
+1. 把总线值转成字符串,例如 `0000010100000011`
+2. 按地址宽度切出每个 lane 的那一段 bit
+3. 再把那段 bit 转回整数,拿去做列表下标
+
+所以不是“信号本来就是字符串”,而是:
+
+- **在硬件层**:它是一串 bit
+- **在 Python 观察层**:为了方便切片,先显示成字符串
+- **在 Python 计算层**:为了索引/计算,再转成整数
+
+这是一种很常见、也很直观的 testbench 写法。
+
+---
+
+## 7. 一次 `test_matadd` 的分工
+
+下面按“谁负责什么”来看一次测试。
+
+### RTL 负责什么
+
+RTL 负责真正的硬件行为,例如:
+
+- 取指
+- 译码
+- 读写寄存器
+- 发起访存
+- 执行 ALU 操作
+- 最后拉高 `done`
+
+这些行为发生在 `src/*.sv` 里。
+
+### cocotb/Python 负责什么
+
+Python 测试负责:
+
+- 建立时钟和复位
+- 预装 program memory
+- 预装 data memory
+- 调用 `data_memory.run()` 响应访存握手
+- 等待 `dut.done`
+- 读取日志
+- 检查最终内存值是否正确
+
+换句话说,Python 不是在“实现 GPU 算法”,而是在**驱动和验证** GPU。
+
+---
+
+## 8. `test/helpers/memory.py` 在整个系统中的位置
+
+这份文件很关键,因为它不是 RTL 里的真正 RAM,而是 **Python 写的内存模型**。
+
+它处在这个位置:
+
+```text
+RTL 发出读写请求
+ -> cocotb 里的 Memory.run() 读取这些请求
+ -> Python 列表 self.memory 充当“软件内存”
+ -> 再把 read_data / ready 写回 dut 信号
+```
+
+也就是说:
+
+- 硬件核心在 RTL 里
+- 但测试里的 program/data memory,有一部分是 Python 侧在配合模拟
+
+这和很多软件背景工程师第一次接触 cocotb 时的直觉不同。你可能以为“所有东西都在 Verilog 里”,但在测试环境里,**外设模型、内存模型、驱动逻辑** 很多都可以写在 Python 里。
+
+---
+
+## 9. 一个最实用的心智模型
+
+如果你以前写过 C++ 单元测试,可以先这样记:
+
+- **RTL** 像“待测库”
+- **仿真器** 像“执行这个库的运行时”
+- **cocotb** 像“把 Python 测试接到运行时上的适配层”
+- **Python 测试** 像“测试代码 + mock + assertions”
+
+但要额外加上一点硬件特性:
+
+- 这里不是函数调用,而是时钟驱动的并发逻辑
+- 这里的对象不是普通变量,而是信号和位向量
+- 这里的值可能不仅有 `0/1`,还会有 `X/Z`
+
+---
+
+## 10. 把这个仓库的真实执行路径串起来
+
+最后用仓库里的真实名字再串一遍:
+
+1. `make test_matadd`
+2. `sv2v` 把 `src/*.sv` 转成 Verilog
+3. `iverilog` 编译生成仿真镜像
+4. `vvp` 启动仿真
+5. cocotb 加载 `test/test_matadd.py`
+6. cocotb 把顶层模块实例作为 `dut` 传给 `test_matadd(dut)`
+7. `setup()` 负责拉时钟、复位、装载数据
+8. `Memory.run()` 在每个周期响应程序存储器和数据存储器请求
+9. Python 测试不断等待 `RisingEdge(dut.clk)`,直到 `dut.done == 1`
+10. 测试检查日志和最终内存,决定 pass/fail
+
+---
+
+## 11. 读这类代码时,建议按这三个问题去看
+
+每看到一段 cocotb 代码,都可以先问自己:
+
+1. 这段代码是在驱动 DUT,还是在观察 DUT?
+2. 这段代码操作的是 Python 变量,还是 DUT 信号?
+3. 这段代码发生在“某个时钟周期之前、之中,还是之后”?
+
+只要这三个问题分清楚,cocotb 代码会清晰很多。
+
+---
+
+## 12. 你现在最值得记住的结论
+
+- cocotb 是 **Python 测试框架**,不是 RTL,也不是仿真器
+- RTL 在 `src/*.sv`,负责硬件行为
+- Python 在 `test/*.py`,负责驱动、建模、观察和断言
+- `dut` 是 cocotb 注入的“被测设计句柄”
+- 信号本质是位向量;字符串和整数只是 Python 侧为了处理方便做的表示转换
+
+如果这套心智模型建立起来了,后面再看 `test_matadd.py`、`memory.py`、`setup.py` 就会顺很多。
\ No newline at end of file
diff --git a/docs/modules/README.md b/docs/modules/README.md
new file mode 100644
index 0000000..258ffd2
--- /dev/null
+++ b/docs/modules/README.md
@@ -0,0 +1,49 @@
+# tiny-gpu Independent Module Guide
+
+These notes explain both:
+
+- the small, independent building-block modules in `src/`
+- the two integration layers that assemble them: `core.sv` and `gpu.sv`
+
+They are meant to be read **side by side with the SystemVerilog source**. The focus is not just "what this line does," but **what role the module plays in the whole GPU** and **how to mentally trace it over time**.
+
+This set is grounded in:
+
+- the actual RTL in `src/*.sv`
+- the repo's DeepWiki architecture pages (`Overview`, `Architecture Overview`, `Execution Model`, `Hardware Modules`, `Memory System`)
+
+## Recommended reading order
+
+1. [`scheduler.md`](./scheduler.md) — the global per-instruction rhythm of a core
+2. [`decoder.md`](./decoder.md) — how instruction bits become control signals
+3. [`fetcher.md`](./fetcher.md) — instruction fetch handshake
+4. [`registers.md`](./registers.md) — operand reads and writeback timing
+5. [`alu.md`](./alu.md) — arithmetic and compare execution
+6. [`pc.md`](./pc.md) — branch and NZP behavior
+7. [`lsu.md`](./lsu.md) — per-thread data-memory access
+8. [`controller.md`](./controller.md) — how many requesters share few memory channels
+9. [`dispatch.md`](./dispatch.md) — how thread_count becomes blocks on cores
+10. [`dcr.md`](./dcr.md) — where launch metadata comes from
+11. [`core.md`](./core.md) — how one core combines shared control with replicated thread lanes
+12. [`gpu.md`](./gpu.md) — how the whole chip is wired together
+
+## Module list
+
+- [`alu.md`](./alu.md)
+- [`controller.md`](./controller.md)
+- [`dcr.md`](./dcr.md)
+- [`decoder.md`](./decoder.md)
+- [`dispatch.md`](./dispatch.md)
+- [`fetcher.md`](./fetcher.md)
+- [`core.md`](./core.md)
+- [`gpu.md`](./gpu.md)
+- [`lsu.md`](./lsu.md)
+- [`pc.md`](./pc.md)
+- [`registers.md`](./registers.md)
+- [`scheduler.md`](./scheduler.md)
+
+## What is intentionally not covered here
+
+- advanced GPU topics like warp scheduling, coalescing, or branch divergence handling
+
+Those are better understood **after** the smaller modules feel natural.
diff --git a/docs/modules/alu.md b/docs/modules/alu.md
new file mode 100644
index 0000000..060350e
--- /dev/null
+++ b/docs/modules/alu.md
@@ -0,0 +1,106 @@
+# ALU Module
+
+Source: `src/alu.sv`
+
+## What this module is
+
+`alu.sv` is the per-thread arithmetic unit. Each active thread lane inside a core gets its own ALU instance, so all threads can execute the same arithmetic instruction in parallel on different register values.
+
+In DeepWiki terms, this is one of the thread execution units inside the core's replicated datapath.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** `registers.sv` provides `rs` and `rt`; `decoder.sv` provides ALU control bits; `scheduler.sv` provides the shared `core_state`
+- **Downstream:** `registers.sv` may write `alu_out` back into `rd`; `pc.sv` uses the low 3 bits during `CMP`
+
+## Clock/reset and when work happens
+
+- Synchronous module: work happens on `posedge clk`
+- Reset clears the stored output register
+- Useful work only happens when:
+ - `enable == 1`
+ - `core_state == EXECUTE (3'b101)`
+
+## Interface cheat sheet
+
+| Port group | Meaning |
+|---|---|
+| `clk`, `reset` | standard sequential timing |
+| `enable` | disables unused thread lanes in a partially full block |
+| `core_state` | stage gating from the scheduler |
+| `decoded_alu_arithmetic_mux` | selects ADD / SUB / MUL / DIV |
+| `decoded_alu_output_mux` | selects arithmetic path vs compare path |
+| `rs`, `rt` | source operands from the register file |
+| `alu_out` | final registered ALU result |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ A["Clock edge arrives"] --> B{"reset?"}
+ B -- yes --> C["Clear stored ALU output register"]
+ B -- no --> D{"enable == 1 and core_state == EXECUTE?"}
+ D -- no --> E["Hold previous alu_out_reg value"]
+ D -- yes --> F{"decoded_alu_output_mux"}
+
+ F -- compare path --> G["Compare rs vs rt
compute gt, eq, lt bits"]
+ G --> H["Pack compare flags into low result bits"]
+ H --> M["Write result into alu_out_reg"]
+
+ F -- arithmetic path --> I{"decoded_alu_arithmetic_mux"}
+ I -- ADD --> J["Compute rs + rt"]
+ I -- SUB --> K["Compute rs - rt"]
+ I -- MUL --> L["Compute rs * rt"]
+ I -- DIV --> N["Compute rs / rt"]
+ J --> M
+ K --> M
+ L --> M
+ N --> M
+
+ M --> O["Expose registered output as alu_out"]
+ C --> O
+ E --> O
+```
+
+## Behavior walkthrough
+
+1. The scheduler moves the whole core into `EXECUTE`.
+2. The decoder has already chosen which ALU behavior is needed.
+3. If the instruction is normal arithmetic, the ALU uses the arithmetic mux.
+4. If the instruction is `CMP`, the ALU does not return a normal arithmetic result. Instead, it packs comparison flags into the low 3 bits.
+5. The result is written into `alu_out_reg`, so the output is **registered**, not purely combinational.
+
+## Decision logic to focus on
+
+- First gate: only act in `EXECUTE`
+- Second gate: arithmetic result vs compare result
+- Third gate: which arithmetic sub-operation to apply
+
+The important beginner insight is that the ALU is **not choosing instructions directly**. It only reacts to the control signals already produced by the decoder.
+
+## Timing notes
+
+- `rs` and `rt` are prepared earlier by `registers.sv`
+- `alu_out` becomes meaningful after the `EXECUTE` edge
+- For `CMP`, `pc.sv` later samples `alu_out[2:0]` during `UPDATE` to refresh the NZP register
+
+## Common pitfalls
+
+- Thinking `CMP` writes a normal number into a general register. It does not.
+- Thinking the ALU always runs every cycle. It is stage-gated.
+- Missing that `alu_out` is backed by `alu_out_reg`, so old values persist until overwritten.
+
+## Trace-it-yourself
+
+Try the instruction `ADD R6, R4, R5`:
+
+1. In `REQUEST`, the register file snapshots `R4` and `R5` into `rs` and `rt`
+2. In `EXECUTE`, the ALU sees arithmetic mode + ADD sub-op
+3. On the clock edge, it stores `rs + rt` into `alu_out_reg`
+4. In `UPDATE`, the register file writes that value into `R6`
+
+## Read next
+
+- [`registers.md`](./registers.md)
+- [`pc.md`](./pc.md)
+- [`decoder.md`](./decoder.md)
diff --git a/docs/modules/controller.md b/docs/modules/controller.md
new file mode 100644
index 0000000..99282a9
--- /dev/null
+++ b/docs/modules/controller.md
@@ -0,0 +1,111 @@
+# Controller Module
+
+Source: `src/controller.sv`
+
+## What this module is
+
+`controller.sv` is the memory traffic manager. It sits between many internal requesters and a smaller number of external memory channels.
+
+The same module is reused for both:
+
+- **program memory** requests from fetchers
+- **data memory** requests from LSUs
+
+DeepWiki's memory-system description matches this role exactly: the controller is the bandwidth gatekeeper and response relay.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** fetchers or LSUs behave like "consumers"
+- **Downstream:** external program memory or data memory
+- **Sibling concept:** one controller instance can have several independent channels, and each channel has its own internal state
+
+## Clock/reset and when work happens
+
+- Entirely synchronous on `posedge clk`
+- Reset clears all valid/ready outputs and returns every channel to `IDLE`
+- After reset, each channel repeatedly:
+ - looks for a request
+ - forwards it to memory
+ - waits for memory
+ - relays completion back to the right consumer
+
+## Interface cheat sheet
+
+| Port group | Meaning |
+|---|---|
+| `consumer_read_*` | incoming read requests from fetchers/LSUs |
+| `consumer_write_*` | incoming write requests from LSUs |
+| `consumer_*_ready` | completion signal back to the original requester |
+| `mem_read_*` | outgoing read request to external memory |
+| `mem_write_*` | outgoing write request to external memory |
+| `current_consumer[]` | which requester each channel is currently serving |
+| `channel_serving_consumer` | prevents two channels from claiming the same requester |
+
+## Diagram
+
+```mermaid
+stateDiagram-v2
+ [*] --> IDLE
+ IDLE --> IDLE: scan consumers, skip already-claimed requests
+ IDLE --> READ_WAITING: claim one read requester, record current_consumer, raise mem_read_valid
+ IDLE --> WRITE_WAITING: claim one write requester, record current_consumer, raise mem_write_valid
+
+ READ_WAITING --> READ_WAITING: keep request active until mem_read_ready
+ READ_WAITING --> READ_RELAYING: memory returns read data, drop mem_read_valid, raise consumer_read_ready
+
+ WRITE_WAITING --> WRITE_WAITING: keep request active until mem_write_ready
+ WRITE_WAITING --> WRITE_RELAYING: memory accepts write, drop mem_write_valid, raise consumer_write_ready
+
+ READ_RELAYING --> READ_RELAYING: keep consumer_read_ready high until consumer drops read_valid
+ READ_RELAYING --> IDLE: release claimed consumer
+
+ WRITE_RELAYING --> WRITE_RELAYING: keep consumer_write_ready high until consumer drops write_valid
+ WRITE_RELAYING --> IDLE: release claimed consumer
+```
+
+## Behavior walkthrough
+
+1. Each memory channel behaves like a tiny worker.
+2. While idle, a channel scans consumers looking for a request not already claimed by another channel.
+3. Once found, the channel records `current_consumer[i]` and raises the corresponding memory-side valid signal.
+4. It then waits for external memory to acknowledge/return data.
+5. After memory responds, the controller raises the corresponding consumer-side `ready` signal.
+6. It does **not** immediately forget the request. It waits for the consumer to drop its `valid`, which serves as the acknowledgement.
+7. Only then does the channel return to `IDLE` and become free again.
+
+## State machine idea
+
+- `IDLE`: channel is free and looking for work
+- `READ_WAITING`: a read request has been forwarded to memory
+- `WRITE_WAITING`: a write request has been forwarded to memory
+- `READ_RELAYING`: read data is ready and being presented back to the consumer
+- `WRITE_RELAYING`: write completion is being presented back to the consumer
+
+## Timing notes
+
+- `valid` and `ready` are not one-cycle pulses by accident here; they are part of a handshake lifecycle
+- `channel_serving_consumer` is important because multiple channels are updated in the same clocked block
+- The relay states intentionally hold `consumer_*_ready` high until the requester lowers `valid`
+
+## Common pitfalls
+
+- Thinking `ready` means "the channel is idle." Here it often means "your request has completed."
+- Missing that there are **two** sides of handshake: consumer side and memory side.
+- Forgetting this module can represent either program-memory traffic or data-memory traffic depending on parameters.
+
+## Trace-it-yourself
+
+Imagine one LSU asserts `consumer_read_valid[3]`:
+
+1. A free channel in `IDLE` claims consumer 3
+2. It copies `consumer_read_address[3]` to `mem_read_address[i]`
+3. It waits in `READ_WAITING`
+4. When `mem_read_ready[i]` arrives, it copies `mem_read_data[i]` back into `consumer_read_data[3]`
+5. It raises `consumer_read_ready[3]`
+6. When the LSU drops `consumer_read_valid[3]`, the channel goes back to `IDLE`
+
+## Read next
+
+- [`fetcher.md`](./fetcher.md)
+- [`lsu.md`](./lsu.md)
+- [`scheduler.md`](./scheduler.md)
diff --git a/docs/modules/core.md b/docs/modules/core.md
new file mode 100644
index 0000000..4e9948d
--- /dev/null
+++ b/docs/modules/core.md
@@ -0,0 +1,220 @@
+# Core Module
+
+Source: `src/core.sv`
+
+## What this module is
+
+`core.sv` is the main compute engine for one block of threads. If the smaller module docs explain the **parts**, this file explains how those parts are assembled into one working SIMD-style core.
+
+The key beginner mental model is this:
+
+- **one shared control path** per core
+- **many replicated thread lanes** per core
+
+So the core behaves like **one instruction stream controlling several per-thread datapaths in parallel**.
+
+This matches the repo's DeepWiki architecture model: the core contains a fetcher, decoder, scheduler, and per-thread execution units.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** `dispatch.sv` starts the core on a specific block and tells it `block_id` and `thread_count`
+- **Inside the core:**
+ - shared modules: `fetcher`, `decoder`, `scheduler`
+ - per-thread modules: `registers`, `alu`, `lsu`, `pc`
+- **Downstream:**
+ - program-memory controller sees fetch traffic
+ - data-memory controller sees LSU traffic
+ - dispatcher sees `done`
+
+## Clock/reset and when work happens
+
+- Entire module is synchronous through its submodules
+- `reset` resets the whole core and all its internal submodules
+- `start` tells the scheduler to begin executing the assigned block
+- The scheduler drives the per-instruction rhythm using `core_state`
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `start`, `done` | per-block launch and completion handshake |
+| `block_id`, `thread_count` | metadata for the currently assigned block |
+| `program_mem_*` | one shared instruction fetch interface for the whole core |
+| `data_mem_read_*`, `data_mem_write_*` | per-thread data-memory interfaces for LSU traffic |
+| `core_state`, `instruction`, decoded signals | shared control-path signals inside the core |
+| `rs/rt`, `alu_out`, `lsu_out`, `next_pc` arrays | per-thread lane datapath signals |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ subgraph SharedControl["Shared per-core control path"]
+ S["scheduler"] --> F["fetcher"]
+ F --> D["decoder"]
+ D --> CTRL["decoded control bundle"]
+ S --> STAGE["core_state"]
+ S --> CPC["current_pc"]
+ end
+
+ subgraph ThreadLanes["Replicated per-thread lane i"]
+ R0["registers lane i"] --> A0["alu lane i"]
+ R0 --> L0["lsu lane i"]
+ A0 --> R0
+ L0 --> R0
+ A0 --> P0["pc lane i"]
+ P0 --> NPC["next_pc for lane i"]
+ end
+
+ CTRL --> R0
+ CTRL --> A0
+ CTRL --> L0
+ CTRL --> P0
+ STAGE --> R0
+ STAGE --> A0
+ STAGE --> L0
+ STAGE --> P0
+ CPC --> F
+ CPC --> P0
+ F --> PMEM["program memory interface"]
+ L0 --> DMEM["data memory interface for this lane"]
+ NPC --> MERGE["scheduler assumes active lanes converge"]
+ MERGE --> S
+```
+
+## How to read this file
+
+This file is mostly an **integration file**. It does not invent a lot of new behavior. Instead, it answers these questions:
+
+1. Which modules are shared per core?
+2. Which modules are duplicated per thread lane?
+3. How do the shared control signals fan out to all lanes?
+4. How do the per-lane outputs feed back into shared control?
+
+That is why `core.sv` has many wires/regs and many module instantiations, but relatively little algorithmic logic of its own.
+
+## Behavior walkthrough
+
+1. The dispatcher gives this core a `block_id` and `thread_count`.
+2. The scheduler starts in charge of the block's instruction lifecycle.
+3. The fetcher retrieves one instruction from program memory using the shared `current_pc`.
+4. The decoder turns that instruction into shared control signals.
+5. Those shared control signals are broadcast to **all active thread lanes**.
+6. Inside each lane:
+ - `registers` provides operands
+ - `alu` computes arithmetic or compare results
+ - `lsu` performs memory access if needed
+ - `pc` computes that lane's `next_pc`
+7. The scheduler later decides whether the block is done or moves to the next instruction.
+
+## Shared path vs replicated path
+
+This is the most important structural idea in the file.
+
+### Shared per-core pieces
+
+- `fetcher`
+- `decoder`
+- `scheduler`
+- one shared `instruction`
+- one shared `current_pc`
+- one shared bundle of decoded control signals
+
+These exist only once per core because the tiny-gpu executes one instruction stream per block.
+
+### Replicated per-thread pieces
+
+Inside the `generate` loop, every thread lane gets its own:
+
+- `alu`
+- `lsu`
+- `registers`
+- `pc`
+
+This is how the same instruction can operate on different thread-local data at the same time.
+
+## The `generate` loop
+
+The most important code pattern in this file is:
+
+```sv
+for (i = 0; i < THREADS_PER_BLOCK; i = i + 1) begin : threads
+ ...
+end
+```
+
+This does **not** mean "loop at runtime" the way a software `for` loop does.
+
+It means:
+
+- build hardware lane 0
+- build hardware lane 1
+- build hardware lane 2
+- ...
+
+So if `THREADS_PER_BLOCK = 4`, the generated hardware really contains 4 ALUs, 4 LSUs, 4 register files, and 4 PCs.
+
+## Partial block handling
+
+Not every block uses every physical lane.
+
+That is why each thread-local module gets:
+
+```sv
+.enable(i < thread_count)
+```
+
+Meaning:
+
+- if this is a full block, all lanes are enabled
+- if this is the final partial block, only the first `thread_count` lanes are active
+
+This is how one physical core can still execute a short last block safely.
+
+## The key simplification: converged PC
+
+Look closely at this structure:
+
+- each lane computes `next_pc[i]`
+- but the scheduler later chooses one representative `next_pc`
+
+This reflects one of the repo's biggest simplifications:
+
+> all active threads in a block are assumed to converge back to the same PC
+
+Real GPUs must deal with branch divergence much more carefully.
+
+So `core.sv` is the place where the repo's simplified SIMD control model becomes most visible.
+
+## Timing notes
+
+- Source operand arrays `rs[i]`, `rt[i]` are filled by `registers.sv`
+- `alu_out[i]`, `lsu_out[i]`, `next_pc[i]` are per-lane outputs consumed later in the instruction lifecycle
+- Decoded signals are shared because the decoder runs once per core, not once per thread
+
+## Common pitfalls
+
+- Thinking `core.sv` contains the actual arithmetic/memory algorithms. Most of those live in submodules.
+- Thinking the `generate` loop is software-style iteration. It is hardware replication.
+- Forgetting that this core processes **one block at a time**, not one thread at a time.
+- Missing the difference between:
+ - shared control state (`core_state`, `instruction`, `current_pc`)
+ - per-thread datapath state (`rs[i]`, `registers`, `lsu_state[i]`, `next_pc[i]`)
+
+## Trace-it-yourself
+
+Try tracing one `ADD` instruction for a block with 4 active threads:
+
+1. Scheduler moves to `FETCH`
+2. Fetcher returns one instruction word
+3. Decoder produces one shared arithmetic-control bundle
+4. All 4 register files snapshot their own `rs` and `rt`
+5. All 4 ALUs compute in parallel
+6. All 4 register files independently write back their own `alu_out`
+
+Same instruction, different per-thread data.
+
+## Read next
+
+- [`gpu.md`](./gpu.md)
+- [`scheduler.md`](./scheduler.md)
+- [`registers.md`](./registers.md)
diff --git a/docs/modules/dcr.md b/docs/modules/dcr.md
new file mode 100644
index 0000000..a20d55c
--- /dev/null
+++ b/docs/modules/dcr.md
@@ -0,0 +1,77 @@
+# DCR Module
+
+Source: `src/dcr.sv`
+
+## What this module is
+
+`dcr.sv` is the Device Control Register. In this tiny GPU, it is a very small configuration module whose main job is to remember the total `thread_count` for the next kernel launch.
+
+This is one of the simplest modules in the repo, but it is conceptually important because it shows how software/testbench configuration becomes hardware state.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** host/testbench writes launch metadata through `device_control_write_enable` and `device_control_data`
+- **Downstream:** `dispatch.sv` reads `thread_count`
+
+## Clock/reset and when work happens
+
+- Synchronous on `posedge clk`
+- Reset clears the internal register to zero
+- If `device_control_write_enable` is high, the module latches the new control value
+
+## Interface cheat sheet
+
+| Port group | Meaning |
+|---|---|
+| `device_control_write_enable` | host says "store this launch setting now" |
+| `device_control_data` | 8-bit configuration payload |
+| `thread_count` | current stored thread count for dispatch |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ A[posedge clk] --> B{reset?}
+ B -- yes --> C[device_control_register = 0]
+ B -- no --> D{device_control_write_enable?}
+ D -- yes --> E[latch device_control_data]
+ D -- no --> F[hold previous value]
+ E --> G[thread_count = stored register]
+ F --> G
+ C --> G
+```
+
+## Behavior walkthrough
+
+1. The host/testbench chooses how many total threads the kernel should launch.
+2. It drives `device_control_data` and asserts `device_control_write_enable`.
+3. `dcr.sv` stores that byte internally.
+4. The output `thread_count` continuously reflects the stored value.
+5. Later, the dispatcher uses it to calculate how many blocks to issue.
+
+## Decision logic to focus on
+
+There is almost no state machine here. The important idea is simply:
+
+- reset clears launch metadata
+- write-enable updates launch metadata
+- otherwise the last configuration is preserved
+
+## Timing notes
+
+- `thread_count` is not recomputed each time; it is just the stored register contents
+- If software never writes the DCR after reset, the dispatcher sees zero threads
+
+## Common pitfalls
+
+- Overthinking it as a complex control block. It is really just a tiny configuration register.
+- Forgetting that this register belongs to the GPU launch path, not to per-thread execution.
+
+## Trace-it-yourself
+
+If the testbench writes `8` into `device_control_data` with `device_control_write_enable = 1`, then after the clock edge `thread_count` becomes `8`. The dispatcher can then divide those 8 threads into blocks.
+
+## Read next
+
+- [`dispatch.md`](./dispatch.md)
+- [`scheduler.md`](./scheduler.md)
diff --git a/docs/modules/decoder.md b/docs/modules/decoder.md
new file mode 100644
index 0000000..2b46a42
--- /dev/null
+++ b/docs/modules/decoder.md
@@ -0,0 +1,107 @@
+# Decoder Module
+
+Source: `src/decoder.sv`
+
+## What this module is
+
+`decoder.sv` converts the raw 16-bit instruction into fields and control signals. It is the module that answers the question: **"What should the rest of the core do for this instruction?"**
+
+DeepWiki describes the decoder as part of the core's shared control path. That is the right mental model: one decoder drives all active threads in a core at once.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** `fetcher.sv` provides `instruction`; `scheduler.sv` provides `core_state`
+- **Downstream:** `registers.sv`, `alu.sv`, `lsu.sv`, `pc.sv`, and `scheduler.sv` all consume decoded signals
+
+## Clock/reset and when work happens
+
+- Synchronous on `posedge clk`
+- Reset clears all remembered decode outputs
+- Real decoding only happens in `DECODE` stage: `core_state == 3'b010`
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `instruction` | the raw 16-bit instruction word |
+| `decoded_rd/rs/rt_address` | register fields extracted from instruction bits |
+| `decoded_nzp`, `decoded_immediate` | branch and immediate fields |
+| `decoded_reg_write_enable` | register file should write in UPDATE |
+| `decoded_mem_read_enable`, `decoded_mem_write_enable` | LSU should perform LDR/STR path |
+| `decoded_alu_*` | ALU should do arithmetic or compare path |
+| `decoded_pc_mux` | PC should use branch logic |
+| `decoded_ret` | scheduler should finish block execution |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ A["DECODE stage begins"] --> B["Slice instruction into opcode, rd, rs, rt, nzp, immediate"]
+ B --> C["Clear all decoded control signals to safe defaults"]
+ C --> D{"opcode field"}
+
+ D -->|NOP| E["Leave all controls low"]
+ D -->|BRnzp| F["Tell PC logic to use branch decision path"]
+ D -->|CMP| G["Select ALU compare output<br/>enable NZP write path"]
+ D -->|ADD SUB MUL DIV| H["Enable register writeback<br/>select arithmetic ALU sub-operation"]
+ D -->|LDR| I["Enable memory read<br/>route returned value into register writeback"]
+ D -->|STR| J["Enable memory write only"]
+ D -->|CONST| K["Enable register writeback from immediate"]
+ D -->|RET| L["Raise decoded_ret for scheduler"]
+
+ F --> M["PC module"]
+ G --> N["ALU and PC modules"]
+ H --> O["ALU and register file"]
+ I --> P["LSU and register file"]
+ J --> Q["LSU"]
+ K --> R["Register file"]
+ L --> S["Scheduler"]
+```
+
+## Behavior walkthrough
+
+1. The fetcher has already captured the current instruction.
+2. In `DECODE`, the decoder slices reusable bit fields out of that instruction.
+3. It then resets all control outputs to zero.
+4. Based on the opcode, it asserts only the control bits needed by that instruction.
+5. Those control bits will influence other modules in later stages.
+
+## Decision logic to focus on
+
+The most important design pattern here is:
+
+1. extract fields
+2. clear all controls
+3. set only what the selected opcode needs
+
+That avoids stale control signals leaking from the previous instruction.
+
+## Timing notes
+
+- The decoder does not itself perform arithmetic, memory access, or branching
+- It only prepares control information for later stages
+- Several instruction formats reuse the same raw bit slices differently, which is normal in instruction-set design
+
+## Common pitfalls
+
+- Thinking "decode" and "execute" are the same moment. They are separate stages.
+- Forgetting to notice the default-zero control pattern.
+- Reading `decoded_immediate` as meaningful for every opcode. It is only used when relevant.
+
+## Trace-it-yourself
+
+Take `LDR R4, R4`:
+
+1. The decoder extracts `rd = R4`, `rs = R4`
+2. It clears all controls
+3. It asserts:
+ - `decoded_reg_write_enable = 1`
+ - `decoded_reg_input_mux = MEMORY`
+ - `decoded_mem_read_enable = 1`
+4. Later the LSU performs the load, and the register file writes the loaded value into `R4`
+
+## Read next
+
+- [`scheduler.md`](./scheduler.md)
+- [`registers.md`](./registers.md)
+- [`lsu.md`](./lsu.md)
diff --git a/docs/modules/dispatch.md b/docs/modules/dispatch.md
new file mode 100644
index 0000000..1011da7
--- /dev/null
+++ b/docs/modules/dispatch.md
@@ -0,0 +1,109 @@
+# Dispatch Module
+
+Source: `src/dispatch.sv`
+
+## What this module is
+
+`dispatch.sv` is the kernel-level work distributor. It takes one total `thread_count`, turns that into blocks, and hands those blocks to available cores.
+
+DeepWiki's execution-model and hardware-module pages describe the dispatcher as the block manager between the host configuration and the cores. That is exactly the right way to read this file.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** `dcr.sv` provides `thread_count`; external launch logic provides `start`
+- **Downstream:** cores receive `core_start`, `core_reset`, `core_block_id`, `core_thread_count`
+- **Feedback:** cores return `core_done`
+
+## Clock/reset and when work happens
+
+- Synchronous on `posedge clk`
+- Reset clears completion counters and puts all cores into reset
+- Once `start` is observed, dispatch begins managing block assignment until all blocks are done
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `start` | launch this kernel |
+| `thread_count` | total threads requested for the kernel |
+| `core_done[]` | each core reports when its current block is complete |
+| `core_start[]` | tells a core to begin its assigned block |
+| `core_reset[]` | resets a core between blocks |
+| `core_block_id[]` | which block the core is executing |
+| `core_thread_count[]` | how many threads are active in that block |
+| `done` | whole kernel is finished |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ A["Kernel start observed"] --> B["Compute total_blocks from thread_count and THREADS_PER_BLOCK"]
+ B --> C["Initialize launch bookkeeping<br/>blocks_dispatched and blocks_done"]
+ C --> D{"Any core leaving reset and blocks remain?"}
+
+ D -- yes --> E["Assign next block_id to that core"]
+ E --> F{"Is this the final partial block?"}
+ F -- no --> G["core_thread_count = THREADS_PER_BLOCK"]
+ F -- yes --> H["core_thread_count = remaining threads"]
+ G --> I["Raise core_start and increment blocks_dispatched"]
+ H --> I
+ I --> J{"Any started core reports core_done?"}
+
+ D -- no --> J
+ J -- yes --> K["Assert core_reset for that core<br/>increment blocks_done"]
+ K --> L{"blocks_done == total_blocks?"}
+ J -- no --> L
+
+ L -- no --> D
+ L -- yes --> M["Raise global done"]
+```
+
+## Behavior walkthrough
+
+1. It computes `total_blocks = ceil(thread_count / THREADS_PER_BLOCK)`.
+2. It tracks two counters:
+ - `blocks_dispatched`
+ - `blocks_done`
+3. When a core is available, the dispatcher gives it the next `block_id`.
+4. For most blocks, `core_thread_count` equals `THREADS_PER_BLOCK`.
+5. For the final block, `core_thread_count` may be smaller if the thread count does not divide evenly.
+6. When a core reports `core_done`, the dispatcher resets that core so it can receive more work.
+7. When `blocks_done == total_blocks`, it raises global `done`.
+
+## Control idea to focus on
+
+This file is not a deep per-cycle datapath. It is a **global work-accounting module**.
+
+The two core questions it answers are:
+
+- How many blocks exist?
+- Which core should get the next block?
+
+## Timing notes
+
+- `start_execution` is a small helper flag used to treat level-sensitive `start` more like a one-time launch event
+- `core_reset` is used both after global reset and between completed blocks
+- A partially full last block matters because the core still has full physical resources, but some thread lanes must stay disabled
+
+## Common pitfalls
+
+- Confusing `thread_count` with `THREADS_PER_BLOCK`
+- Forgetting the final block can be smaller
+- Thinking one core equals one thread; one core actually processes one **block** at a time
+
+## Trace-it-yourself
+
+If `THREADS_PER_BLOCK = 4` and `thread_count = 10`:
+
+- `total_blocks = 3`
+- block 0 has 4 threads
+- block 1 has 4 threads
+- block 2 has 2 threads
+
+That last value is what eventually becomes `core_thread_count` for the final issued block.
+
+## Read next
+
+- [`dcr.md`](./dcr.md)
+- [`scheduler.md`](./scheduler.md)
+- [`registers.md`](./registers.md)
diff --git a/docs/modules/fetcher.md b/docs/modules/fetcher.md
new file mode 100644
index 0000000..9db0695
--- /dev/null
+++ b/docs/modules/fetcher.md
@@ -0,0 +1,88 @@
+# Fetcher Module
+
+Source: `src/fetcher.sv`
+
+## What this module is
+
+`fetcher.sv` fetches the next instruction from program memory. Each core has one fetcher because all active threads in a core share the same current instruction stream.
+
+In DeepWiki's execution model, this is the module responsible for the `FETCH` stage of the core lifecycle.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** `scheduler.sv` provides `core_state` and `current_pc`
+- **Downstream:** `decoder.sv` consumes `instruction`
+- **Memory path:** program memory request goes through the program-memory controller
+
+## Clock/reset and when work happens
+
+- Synchronous on `posedge clk`
+- Reset returns the fetcher to `IDLE`
+- It launches a read only during the core's `FETCH` stage
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `core_state`, `current_pc` | scheduler tells the fetcher what stage it is in and what address to fetch |
+| `mem_read_valid`, `mem_read_address` | outgoing instruction-memory request |
+| `mem_read_ready`, `mem_read_data` | memory/controller response |
+| `fetcher_state` | local FSM state |
+| `instruction` | latched fetched instruction |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ A["Fetcher is idle"] --> B{"core_state == FETCH?"}
+ B -- no --> A
+ B -- yes --> C["Raise mem_read_valid<br/>present current_pc as mem_read_address"]
+ C --> D{"mem_read_ready?"}
+ D -- no --> C
+ D -- yes --> E["Latch mem_read_data into instruction<br/>set fetcher_state to FETCHED"]
+ E --> F{"core_state == DECODE?"}
+ F -- no --> E
+ F -- yes --> G["Return to IDLE for next fetch"]
+```
+
+## Behavior walkthrough
+
+1. While idle, the fetcher watches for `core_state == FETCH`.
+2. When that happens, it raises `mem_read_valid` and presents `current_pc` as the address.
+3. It waits for program memory to respond.
+4. When `mem_read_ready` arrives, it latches `mem_read_data` into `instruction`.
+5. It then waits for the core to move into `DECODE`, which marks that this fetch cycle is complete.
+
+## State machine idea
+
+- `IDLE`: waiting for a new fetch request
+- `FETCHING`: request is active, waiting for instruction return
+- `FETCHED`: instruction has been captured and is ready for decode
+
+## Timing notes
+
+- `instruction` is stored in a register, so the decoder reads a stable value next stage
+- The fetcher and scheduler are coordinated by `core_state`
+- This design deliberately keeps fetching simple: one instruction at a time, no instruction cache here
+
+## Common pitfalls
+
+- Thinking `mem_read_ready` means "memory is idle." Here it means the fetch completed.
+- Forgetting that the fetcher waits for `DECODE` before resetting its own state.
+- Confusing data memory and program memory paths; this module uses only program memory.
+
+## Trace-it-yourself
+
+Suppose `current_pc = 9`:
+
+1. Scheduler enters `FETCH`
+2. Fetcher outputs `mem_read_valid = 1`, `mem_read_address = 9`
+3. Later `mem_read_ready = 1` and the instruction word returns
+4. Fetcher stores that instruction and moves to `FETCHED`
+5. Once scheduler enters `DECODE`, fetcher returns to `IDLE`
+
+## Read next
+
+- [`decoder.md`](./decoder.md)
+- [`scheduler.md`](./scheduler.md)
+- [`controller.md`](./controller.md)
diff --git a/docs/modules/gpu.md b/docs/modules/gpu.md
new file mode 100644
index 0000000..bb0837a
--- /dev/null
+++ b/docs/modules/gpu.md
@@ -0,0 +1,210 @@
+# GPU Module
+
+Source: `src/gpu.sv`
+
+## What this module is
+
+`gpu.sv` is the top-level integration module for the whole design. It does not mainly perform arithmetic or decode instructions itself. Instead, it connects all major subsystems into one complete GPU.
+
+The best beginner mental model is:
+
+> `gpu.sv` is the top-level wiring and orchestration shell.
+
+It ties together:
+
+- launch configuration (`dcr`)
+- block assignment (`dispatch`)
+- memory arbitration (`controller`)
+- actual execution (`core` instances)
+
+This matches the DeepWiki architecture pages, where the GPU module is described as the wrapper around cores, memory controllers, and dispatch logic.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** host/testbench drives `start`, `device_control_write_enable`, and the external memory interfaces
+- **Inside the GPU:**
+ - one `dcr`
+ - one data-memory controller
+ - one program-memory controller
+ - one `dispatch`
+ - `NUM_CORES` compute cores
+- **Downstream:** external data/program memory ports and the host-visible `done`
+
+## Clock/reset and when work happens
+
+- Entire design is clocked by `clk`
+- `reset` resets the top-level submodules and their children
+- Execution begins only after:
+ 1. the host loads memory
+ 2. the host writes thread count into the DCR
+ 3. the host asserts `start`
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `start`, `done` | top-level kernel launch and completion |
+| `device_control_*` | host writes launch metadata into the DCR |
+| `program_mem_*` | external program memory interface |
+| `data_mem_*` | external data memory interface |
+| `core_*` arrays | dispatcher-managed per-core launch/status signals |
+| `lsu_*` arrays | flattened all-core/all-thread data-memory traffic |
+| `fetcher_*` arrays | per-core instruction-fetch traffic |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ HOST["Host or testbench"] --> CFG["Write thread_count into dcr"]
+ CFG --> DCR["dcr"]
+ HOST --> START["start pulse"]
+
+ DCR --> DISP["dispatch"]
+ START --> DISP
+ DISP --> CORE0["core 0"]
+ DISP --> CORE1["core 1 .. core N"]
+ CORE0 --> FE0["fetcher traffic"]
+ CORE1 --> FE1["fetcher traffic"]
+ FE0 --> PMCTRL["program memory controller"]
+ FE1 --> PMCTRL
+ PMCTRL --> PMEM["external program memory"]
+
+ CORE0 --> LSU0["lane LSU traffic"]
+ CORE1 --> LSU1["lane LSU traffic"]
+ LSU0 --> FLAT["flatten all core and lane LSU ports"]
+ LSU1 --> FLAT
+ FLAT --> DMCTRL["data memory controller"]
+ DMCTRL --> DMEM["external data memory"]
+
+ CORE0 --> DONE0["core_done"]
+ CORE1 --> DONE1["core_done"]
+ DONE0 --> DISP
+ DONE1 --> DISP
+ DISP --> DONE["global done"]
+```
+
+## How to read this file
+
+This file is mostly about **connectivity**, not local algorithms.
+
+When reading it, ask these questions:
+
+1. Which modules exist once at GPU scope?
+2. Which modules are instantiated once per core?
+3. How are per-core and per-thread signals flattened so the memory controllers can see them?
+4. How does the launch flow move from host -> DCR -> dispatch -> core?
+
+## Behavior walkthrough
+
+1. The host writes `thread_count` into `dcr`.
+2. The host asserts `start`.
+3. `dispatch` uses `thread_count` to break the kernel into blocks and assign those blocks to cores.
+4. Each core runs one block at a time.
+5. Core fetchers request instructions through the program-memory controller.
+6. Core LSUs request data-memory reads/writes through the data-memory controller.
+7. As cores finish blocks, `dispatch` either assigns new blocks or eventually raises global `done`.
+
+## The most important structural idea: flattening
+
+Inside a core, LSU traffic is naturally grouped as:
+
+- per core
+- per thread lane
+
+But the controller wants one flat list of consumers.
+
+So `gpu.sv` creates flattened arrays such as:
+
+- `lsu_read_valid[NUM_LSUS-1:0]`
+- `lsu_write_valid[NUM_LSUS-1:0]`
+
+where:
+
+```text
+NUM_LSUS = NUM_CORES * THREADS_PER_BLOCK
+```
+
+Then it bridges each core-local lane into a unique global LSU index:
+
+```text
+lsu_index = i * THREADS_PER_BLOCK + j
+```
+
+That indexing rule is one of the most important things to understand in this file.
+
+## One-instance modules vs replicated modules
+
+### Single GPU-wide instances
+
+- `dcr`
+- `dispatch`
+- one data-memory controller
+- one program-memory controller
+
+These exist once because they coordinate global behavior.
+
+### Repeated per-core instances
+
+Inside the `generate` loop, `gpu.sv` instantiates one `core` per `i`.
+
+So if `NUM_CORES = 2`, the generated hardware contains two compute cores.
+
+## The bridging logic
+
+Inside the nested `generate` structure, the file creates per-core local LSU wires like:
+
+- `core_lsu_read_valid`
+- `core_lsu_read_address`
+
+Then it copies them into the flattened global arrays seen by the controller.
+
+This is integration glue. It is not new GPU behavior by itself, but it is essential for wiring the hierarchical design together.
+
+The comments mention OpenLane / Verilog-2005 compatibility. That tells you this structure is partly shaped by tool constraints, not just by pure architectural elegance.
+
+## The full launch path
+
+The top-level flow is:
+
+1. host writes launch metadata into `dcr`
+2. `dcr` exposes `thread_count`
+3. `dispatch` turns `thread_count` into blocks and per-core start/reset signals
+4. each `core` executes its assigned block
+5. fetch and memory access are arbitrated by the controllers
+6. once all blocks finish, `dispatch` raises `done`
+
+This is the main reason `gpu.sv` feels more like a system diagram than a small algorithmic module.
+
+## Timing notes
+
+- Memory requests do not go directly from cores to external memory; they pass through controllers
+- `core_done[i]` feeds back into dispatch so new blocks can be issued dynamically
+- `start` is consumed by `dispatch`, not broadcast directly as full execution control into every submodule
+
+## Common pitfalls
+
+- Expecting `gpu.sv` to contain the execution logic itself. Most real behavior lives in submodules.
+- Getting lost in the many arrays without first separating them into:
+ - host interface
+ - per-core control
+ - flattened fetch traffic
+ - flattened LSU traffic
+- Forgetting that `gpu.sv` must bridge **hierarchical structure** into **controller-friendly flat arrays**.
+- Confusing per-core repetition with per-thread repetition. `gpu.sv` duplicates cores; `core.sv` duplicates thread lanes.
+
+## Trace-it-yourself
+
+Suppose `NUM_CORES = 2` and `THREADS_PER_BLOCK = 4`:
+
+- there are 2 core instances
+- there are `2 * 4 = 8` flattened LSU consumer slots
+- one core's lane 0 might map to flattened LSU index 0
+- the other core's lane 0 might map to flattened LSU index 4
+
+This is how one shared data-memory controller can serve all thread-local LSUs across all cores.
+
+## Read next
+
+- [`core.md`](./core.md)
+- [`dispatch.md`](./dispatch.md)
+- [`controller.md`](./controller.md)
diff --git a/docs/modules/lsu.md b/docs/modules/lsu.md
new file mode 100644
index 0000000..d2f4c91
--- /dev/null
+++ b/docs/modules/lsu.md
@@ -0,0 +1,97 @@
+# LSU Module
+
+Source: `src/lsu.sv`
+
+## What this module is
+
+`lsu.sv` is the per-thread Load/Store Unit. Each active thread lane has its own LSU so memory accesses can be tracked independently.
+
+This is one of the clearest places to learn why the scheduler has a `WAIT` stage: memory requests take longer than simple arithmetic.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** `decoder.sv` says whether the instruction is `LDR` or `STR`; `registers.sv` supplies `rs` and `rt`
+- **Downstream:** data memory controller receives the request; `registers.sv` may later write back `lsu_out`
+
+## Clock/reset and when work happens
+
+- Synchronous on `posedge clk`
+- Reset clears request state and handshake outputs
+- Memory requests are launched starting around the core's `REQUEST` stage and completed before leaving `WAIT`
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `decoded_mem_read_enable`, `decoded_mem_write_enable` | select LDR vs STR behavior |
+| `rs` | memory address |
+| `rt` | store data for STR |
+| `mem_read_*`, `mem_write_*` | external memory handshake |
+| `lsu_state` | local FSM state |
+| `lsu_out` | loaded data for later writeback |
+
+## Diagram
+
+```mermaid
+stateDiagram-v2
+ [*] --> IDLE
+ IDLE --> REQUESTING: core_state == REQUEST and memory op is enabled
+
+ REQUESTING --> WAITING: LDR path, raise mem_read_valid, drive read address from rs
+ REQUESTING --> WAITING: STR path, raise mem_write_valid, drive write address from rs, drive write data from rt
+
+ WAITING --> WAITING: wait for memory-side ready
+ WAITING --> DONE: load completes, capture mem_read_data into lsu_out
+ WAITING --> DONE: store completes, write acknowledged
+
+ DONE --> DONE: wait for core UPDATE stage
+ DONE --> IDLE: core_state == UPDATE
+```
+
+## Behavior walkthrough
+
+1. The decoder chooses whether this instruction is a load or store.
+2. In `REQUEST`, the LSU begins the transaction.
+3. In `REQUESTING`, it drives either:
+ - read address (`LDR`)
+ - write address + write data (`STR`)
+4. In `WAITING`, it waits for the controller/memory to acknowledge completion.
+5. For a load, it captures returned data into `lsu_out`.
+6. In `DONE`, it waits until `UPDATE` before resetting back to `IDLE`.
+
+## State machine idea
+
+- `IDLE`: no memory op in flight
+- `REQUESTING`: presenting a fresh request
+- `WAITING`: request has been sent, waiting for completion
+- `DONE`: completion reached, waiting for the core's per-instruction cleanup point
+
+The same FSM is reused for both loads and stores; only the handshake signals differ.
+
+## Timing notes
+
+- This module is the reason the scheduler must sometimes stall in `WAIT`
+- `lsu_out` is only meaningful after a load has completed
+- Returning to `IDLE` in `UPDATE` keeps the per-instruction rhythm aligned with the rest of the core
+
+## Common pitfalls
+
+- Thinking memory requests finish in one cycle like ALU operations
+- Forgetting that `rs` is used as the memory address in this design
+- Missing that `lsu_out` is only for `LDR`, not `STR`
+
+## Trace-it-yourself
+
+For `LDR R4, R4`:
+
+1. Register file has already copied `R4` into `rs`
+2. LSU enters `REQUESTING` and drives `mem_read_address = rs`
+3. It waits in `WAITING`
+4. When `mem_read_ready` is asserted, it stores `mem_read_data` into `lsu_out`
+5. In `UPDATE`, the register file writes `lsu_out` into `R4`
+
+## Read next
+
+- [`controller.md`](./controller.md)
+- [`scheduler.md`](./scheduler.md)
+- [`registers.md`](./registers.md)
diff --git a/docs/modules/pc.md b/docs/modules/pc.md
new file mode 100644
index 0000000..53b7cfe
--- /dev/null
+++ b/docs/modules/pc.md
@@ -0,0 +1,99 @@
+# PC Module
+
+Source: `src/pc.sv`
+
+## What this module is
+
+`pc.sv` computes the next program counter for one thread lane and stores that lane's NZP condition state.
+
+This module is where the tiny-gpu branch mechanism comes together:
+
+- `decoder.sv` says whether the instruction is a branch
+- `alu.sv` computes compare flags for `CMP`
+- `pc.sv` decides whether to take the branch and where to go next
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** decoder provides `decoded_pc_mux`, `decoded_nzp`, `decoded_immediate`, and `decoded_nzp_write_enable`; ALU provides `alu_out`
+- **Downstream:** scheduler later chooses one representative `next_pc` value as the core's shared `current_pc`
+
+## Clock/reset and when work happens
+
+- Synchronous on `posedge clk`
+- Reset clears both `nzp` and `next_pc`
+- Two important stages:
+ - `EXECUTE`: compute `next_pc`
+ - `UPDATE`: refresh `nzp` from compare output if needed
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `current_pc` | current converged PC from the scheduler |
+| `decoded_pc_mux` | whether to use branch logic or plain PC+1 |
+| `decoded_nzp`, `decoded_immediate` | branch condition mask and branch target |
+| `decoded_nzp_write_enable` | whether this instruction updates NZP |
+| `alu_out` | low 3 bits carry compare results for CMP |
+| `next_pc` | this thread lane's computed next PC |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ subgraph ExecuteStage["EXECUTE stage: choose next PC"]
+ A["Start from current_pc"] --> B{"decoded_pc_mux?"}
+ B -- no --> C["Sequential flow
next_pc = current_pc + 1"]
+ B -- yes --> D{"Stored NZP matches decoded branch mask?"}
+ D -- yes --> E["Branch taken
next_pc = decoded_immediate"]
+ D -- no --> C
+ end
+
+ subgraph UpdateStage["UPDATE stage: refresh NZP state"]
+ F{"decoded_nzp_write_enable?"}
+ F -- yes --> G["Store compare-result bits from ALU low bits"]
+ F -- no --> H["Keep previous NZP state"]
+ end
+```
+
+## Behavior walkthrough
+
+1. During `EXECUTE`, the PC logic decides what the next instruction address should be.
+2. If this is not a branch instruction, it simply does `current_pc + 1`.
+3. If this is `BRnzp`, it checks whether the stored `nzp` bits match the branch mask.
+4. If they match, it jumps to `decoded_immediate`.
+5. Separately, in `UPDATE`, a previous `CMP` can refresh the stored `nzp` bits from `alu_out[2:0]`.
+
+## Decision logic to focus on
+
+- Branch selection is based on `decoded_pc_mux`
+- Branch condition is based on `(nzp & decoded_nzp) != 0`
+- NZP update happens later than compare execution
+
+That last point is crucial: compare result generation and NZP state update are not the same moment.
+
+## Timing notes
+
+- `CMP` produces the bits in the ALU during `EXECUTE`
+- `pc.sv` stores those bits into `nzp` during `UPDATE`
+- A future branch reads the stored `nzp` state
+
+## Common pitfalls
+
+- Thinking the immediate is always used as the next PC. It is only used when a branch is taken.
+- Forgetting that `nzp` is stateful and persists across instructions.
+- Missing the repo's simplifying assumption: each thread lane computes a `next_pc`, but the scheduler later assumes all lanes converge.
+
+## Trace-it-yourself
+
+Imagine `CMP R9, R2` followed by `BRn LOOP`:
+
+1. `CMP` causes ALU to pack gt/eq/lt bits into `alu_out[2:0]`
+2. In `UPDATE`, PC module stores those bits into `nzp`
+3. On the later branch instruction, `decoded_pc_mux = 1`
+4. If the negative bit matches the branch mask, `next_pc` becomes the loop label address
+
+## Read next
+
+- [`alu.md`](./alu.md)
+- [`decoder.md`](./decoder.md)
+- [`scheduler.md`](./scheduler.md)
diff --git a/docs/modules/registers.md b/docs/modules/registers.md
new file mode 100644
index 0000000..8e79ba7
--- /dev/null
+++ b/docs/modules/registers.md
@@ -0,0 +1,107 @@
+# Registers Module
+
+Source: `src/registers.sv`
+
+## What this module is
+
+`registers.sv` is the per-thread register file. Each thread lane gets its own private set of 16 registers, which is one of the main reasons SIMD execution works: all threads execute the same instruction, but each thread reads and writes its own data.
+
+DeepWiki's thread-unit description highlights the special metadata registers here. They are especially important for understanding `blockIdx * blockDim + threadIdx`.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** decoder provides register addresses and writeback controls; ALU/LSU/immediate paths provide candidate writeback data
+- **Downstream:** ALU and LSU consume `rs` and `rt`
+
+## Clock/reset and when work happens
+
+- Synchronous on `posedge clk`
+- Reset clears general registers and initializes special registers
+- Important stages:
+ - `REQUEST`: read source operands into `rs` and `rt`
+ - `UPDATE`: write result into `rd`
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `decoded_rd/rs/rt_address` | which registers to write/read |
+| `decoded_reg_write_enable` | whether UPDATE should write into `rd` |
+| `decoded_reg_input_mux` | choose ALU result, memory result, or immediate |
+| `alu_out`, `lsu_out`, `decoded_immediate` | writeback sources |
+| `block_id` | current block index from dispatch/core |
+| `rs`, `rt` | source operand outputs for this thread lane |
+
+## Diagram
+
+```mermaid
+flowchart TD
+ A["Clock edge arrives"] --> B{"reset?"}
+ B -- yes --> C["Clear R0-R12
initialize R13-R15 metadata registers"]
+
+ B -- no --> D{"enable?"}
+ D -- no --> E["Hold register file state"]
+ D -- yes --> F["Refresh R13 from current block_id"]
+
+ F --> G{"core_state == REQUEST?"}
+ G -- yes --> H["Snapshot source operands
rs <- selected source register
rt <- selected target register"]
+ G -- no --> I["Do not update rs/rt this cycle"]
+
+ H --> J{"core_state == UPDATE and write enabled and rd is a general register?"}
+ I --> J
+
+ J -- no --> K["No writeback this cycle"]
+ J -- yes --> L{"decoded_reg_input_mux"}
+ L -- ARITHMETIC --> M["Write alu_out into rd"]
+ L -- MEMORY --> N["Write lsu_out into rd"]
+ L -- CONSTANT --> O["Write immediate into rd"]
+```
+
+## Behavior walkthrough
+
+1. On reset, it clears the general-purpose registers.
+2. It initializes:
+ - `R13 = %blockIdx`
+ - `R14 = %blockDim`
+ - `R15 = %threadIdx`
+3. While enabled, it keeps `R13` synchronized with the current `block_id`.
+4. In `REQUEST`, it snapshots the source registers named by the current instruction into `rs` and `rt`.
+5. In `UPDATE`, if writeback is enabled and `rd < 13`, it writes one of three sources into `rd`.
+
+## Decision logic to focus on
+
+- Source read timing is stage-based, not asynchronous
+- Writeback uses a 3-way mux
+- Special metadata registers are protected by `decoded_rd_address < 13`
+
+## Timing notes
+
+- `rs` and `rt` are registers themselves, not direct array aliases
+- The module reads in `REQUEST` and writes in `UPDATE`
+- That staged rhythm is what keeps operand use and result commit aligned with the rest of the core
+
+## Common pitfalls
+
+- Thinking `R13-R15` are normal writable registers. They are treated as read-only metadata.
+- Forgetting that `%blockIdx` is refreshed from `block_id`.
+- Missing that the source operands are copied out into `rs` and `rt` before ALU/LSU use.
+
+## Trace-it-yourself
+
+For `CONST R2, #8`:
+
+1. Decoder sets `decoded_reg_write_enable = 1`
+2. Decoder sets `decoded_reg_input_mux = CONSTANT`
+3. In `UPDATE`, register file writes `decoded_immediate` into `R2`
+
+For `ADD R6, R4, R5`:
+
+1. In `REQUEST`, it copies `R4 -> rs`, `R5 -> rt`
+2. ALU uses those values in `EXECUTE`
+3. In `UPDATE`, it writes `alu_out` into `R6`
+
+## Read next
+
+- [`alu.md`](./alu.md)
+- [`lsu.md`](./lsu.md)
+- [`pc.md`](./pc.md)
diff --git a/docs/modules/scheduler.md b/docs/modules/scheduler.md
new file mode 100644
index 0000000..043f627
--- /dev/null
+++ b/docs/modules/scheduler.md
@@ -0,0 +1,107 @@
+# Scheduler Module
+
+Source: `src/scheduler.sv`
+
+## What this module is
+
+`scheduler.sv` is the core's master stage machine. If you want one file that explains the overall execution rhythm of a block, this is the file.
+
+DeepWiki's execution-model page describes the six-stage flow `FETCH -> DECODE -> REQUEST -> WAIT -> EXECUTE -> UPDATE`. This module is exactly where that flow is enforced.
+
+## Where it sits in tiny-gpu
+
+- **Upstream:** `dispatch.sv` provides `start`; fetcher and LSUs report progress
+- **Downstream:** all other core-local modules react to `core_state`
+- **Key idea:** `core_state` is the shared control clocking rhythm for the whole core
+
+## Clock/reset and when work happens
+
+- Synchronous on `posedge clk`
+- Reset sets `current_pc = 0`, `core_state = IDLE`, `done = 0`
+- Every instruction executed by a block passes through the same state sequence
+
+## Interface cheat sheet
+
+| Group | Meaning |
+|---|---|
+| `start` | begin processing this block |
+| `fetcher_state` | tells scheduler when instruction fetch completed |
+| `lsu_state[]` | tells scheduler whether any thread is still waiting on memory |
+| `decoded_ret` | end-of-kernel/block instruction marker |
+| `next_pc[]` | each thread lane's computed next PC |
+| `core_state` | shared stage broadcast to the core |
+| `current_pc` | the core's converged PC |
+| `done` | this block has finished executing |
+
+## Diagram
+
+```mermaid
+stateDiagram-v2
+ [*] --> IDLE
+ IDLE --> FETCH: start
+ FETCH --> DECODE: fetcher_state == FETCHED
+ DECODE --> REQUEST
+ REQUEST --> WAIT
+ WAIT --> WAIT: any LSU still REQUESTING or WAITING
+ WAIT --> EXECUTE: no LSU still REQUESTING/WAITING
+ EXECUTE --> UPDATE
+ UPDATE --> DONE: decoded_ret
+ UPDATE --> FETCH: otherwise current_pc = one next_pc value
+ DONE --> DONE
+```
+
+## Behavior walkthrough
+
+1. In `IDLE`, the scheduler waits for the core to be started on a new block.
+2. `FETCH` waits until the fetcher reports that the instruction has arrived.
+3. `DECODE` gives the decoder one cycle to produce control signals.
+4. `REQUEST` lets registers/LSUs launch their work.
+5. `WAIT` stalls if any LSU still has an in-flight memory operation.
+6. `EXECUTE` is where ALUs and PC logic do their main calculations.
+7. `UPDATE` commits results and either:
+ - ends the block on `RET`
+ - or advances to the next instruction
+
+## State machine idea
+
+- `IDLE`: no active block
+- `FETCH`: instruction fetch in progress
+- `DECODE`: instruction decode
+- `REQUEST`: operand snapshot / memory request launch
+- `WAIT`: memory-latency hiding point in this simple design
+- `EXECUTE`: perform arithmetic/branch logic
+- `UPDATE`: commit state updates
+- `DONE`: block finished
+
+## Timing notes
+
+- `WAIT` is the key stage for understanding asynchronous memory
+- The scheduler chooses `current_pc <= next_pc[THREADS_PER_BLOCK-1]` as a representative value, which encodes the repo's simplifying assumption that active threads reconverge to one PC
+- `done` is asserted only when a `RET` instruction reaches `UPDATE`
+
+The Mermaid diagram intentionally shows a self-loop on `WAIT` because that stage can last multiple cycles when any thread LSU still has an in-flight memory operation.
+
+## Common pitfalls
+
+- Thinking every instruction always spends many cycles in `WAIT`. Non-memory instructions pass through quickly.
+- Missing that `decoded_ret` is handled in `UPDATE`, not immediately in `DECODE`.
+- Forgetting the branch-divergence simplification when reading the `next_pc[]` array.
+
+## Trace-it-yourself
+
+For a non-memory `ADD` instruction, the rough rhythm is:
+
+1. `FETCH` gets the instruction
+2. `DECODE` produces arithmetic controls
+3. `REQUEST` snapshots source operands
+4. `WAIT` exits quickly because no LSU is busy
+5. `EXECUTE` computes `alu_out`
+6. `UPDATE` writes the result and advances `current_pc`
+
+For `LDR`, the difference is that `WAIT` lasts until the matching LSU finishes.
+
+## Read next
+
+- [`decoder.md`](./decoder.md)
+- [`fetcher.md`](./fetcher.md)
+- [`lsu.md`](./lsu.md)
diff --git a/docs/report/00-executive-overview.md b/docs/report/00-executive-overview.md
new file mode 100644
index 0000000..0112416
--- /dev/null
+++ b/docs/report/00-executive-overview.md
@@ -0,0 +1,57 @@
+# Executive Overview
+
+## What this repository is
+
+`tiny-gpu` is a small educational GPU implementation written in SystemVerilog and exercised through a Python + cocotb simulation harness. The codebase is intentionally compact: the hardware lives entirely in `src/`, the tests live entirely in `test/`, and the project uses two example kernels to demonstrate SIMD-style execution on a simplified GPU architecture.
+
+The local checkout is connected to `https://github.com/adam-maj/tiny-gpu.git`. External DeepWiki material was used as a secondary explanatory source, but local source files remain the authority for implementation details.
+
+## Confirmed headline findings
+
+- The hardware source is concentrated in 12 SystemVerilog modules under `src/`.
+- The top-level implementation is `src/gpu.sv`, which instantiates the device control register, dispatcher, two memory controllers, and a configurable array of compute cores.
+- Each core contains one `fetcher`, one `decoder`, one `scheduler`, and per-thread `alu`, `lsu`, `registers`, and `pc` units.
+- The execution model is explicitly staged in the scheduler as `FETCH -> DECODE -> REQUEST -> WAIT -> EXECUTE -> UPDATE`.
+- The verification story is cocotb-driven: Python code simulates both program memory and data memory while the RTL runs inside Icarus Verilog.
+- Two proof-of-concept kernels are exercised by tests: matrix addition and matrix multiplication.
+
+## High-level architecture
+
+```mermaid
+flowchart TD
+ Host[Host / cocotb testbench] --> DCR[Device Control Register]
+ Host --> PMem[Program Memory Model]
+ Host --> DMem[Data Memory Model]
+ Host --> Start[start]
+
+ DCR --> Dispatch[Dispatcher]
+ Start --> Dispatch
+ Dispatch --> GPU[GPU Top Module]
+ GPU --> Cores[Core Array]
+ Cores --> ProgCtrl[Program Memory Controller]
+ Cores --> DataCtrl[Data Memory Controller]
+ ProgCtrl --> PMem
+ DataCtrl --> DMem
+```
+
+## Toolchain summary
+
+The build and simulation flow depends on four external tools:
+
+| Tool | Role in this repo | Grounded evidence | Official reference used here |
+| --- | --- | --- | --- |
+| `cocotb` | Python testbench framework | `test/helpers/setup.py`, `test/test_*.py`, `Makefile` | cocotb docs on async tests, clocks, triggers, and DUT signal access |
+| `iverilog` / `vvp` | compiles and runs the Verilog simulation | `Makefile` | Icarus Verilog docs for `-g2012`, `-s`, and VPI runtime flags |
+| `sv2v` | converts SystemVerilog to Verilog before compilation | `Makefile`, `README.md` | sv2v README usage and CLI options |
+| `gtkwave` | optional waveform/debug viewer | `Makefile` (`show_%` rule and TODO only) | GTKWave docs for post-mortem waveform inspection |
+
+## Important caveats
+
+- The README discusses a cache as if it were part of the architecture, but there is no cache module in `src/` and no cache instantiated in `src/gpu.sv`.
+- The build flow assumes a pre-existing `build/` directory; the `Makefile` does not create it.
+- `test/test_matmul.py` contains a function named `test_matadd`, so the filename and exported test name do not match.
+- The source is SystemVerilog, while parts of the README describe the project more loosely as “Verilog.” The actual flow uses `sv2v` first, then `iverilog`.
+
+## Overall assessment
+
+This is a disciplined educational hardware repository with a clear module boundary at the GPU/core/thread levels, a compact verification surface, and unusually readable conceptual documentation. Its main rough edge is not architectural chaos; it is the gap between conceptual discussion in the README and what is concretely implemented in the checked-in RTL.
diff --git a/docs/report/01-repository-structure.md b/docs/report/01-repository-structure.md
new file mode 100644
index 0000000..55d3e05
--- /dev/null
+++ b/docs/report/01-repository-structure.md
@@ -0,0 +1,105 @@
+# Repository Structure
+
+## Top-level layout
+
+Confirmed from the local checkout:
+
+```text
+.
+├── docs/
+│ ├── images/
+│ └── report/
+├── gds/
+│ ├── 0/gpu.gds
+│ └── 1/gpu.gds
+├── Makefile
+├── README.md
+├── src/
+└── test/
+```
+
+## Directory roles
+
+### `src/`
+
+This is the complete checked-in RTL implementation:
+
+- `gpu.sv` — top-level integration and external memory/control interface
+- `core.sv` — per-core composition boundary
+- `dispatch.sv` — block dispatch and completion tracking
+- `controller.sv` — shared memory arbitration primitive for program/data memory
+- `dcr.sv` — device control register storing `thread_count`
+- `scheduler.sv` — core execution state machine
+- `fetcher.sv` — instruction fetch unit
+- `decoder.sv` — ISA decode logic
+- `registers.sv` — per-thread register file plus SIMD metadata registers
+- `pc.sv` — next-PC and NZP handling
+- `lsu.sv` — per-thread load/store unit
+- `alu.sv` — per-thread arithmetic unit
+
+There is no `cache.sv` or similar cache implementation in the current tree.
+
+### `test/`
+
+This directory holds the cocotb testbench and helper code:
+
+- `test_matadd.py` — matrix-add kernel simulation
+- `test_matmul.py` — matrix-multiply kernel simulation
+- `helpers/setup.py` — clock/reset/program/data/thread-count setup
+- `helpers/memory.py` — software-backed program/data memory model
+- `helpers/format.py` — trace-format helpers
+- `helpers/logger.py` — log writer for execution traces
+- `logs/.gitkeep` — retained log output directory
+
+The tests act as the “host system” for the GPU by loading memory contents, writing the thread count through the device-control interface, asserting `start`, and then emulating memory readiness and responses in Python.
+
+### `docs/`
+
+- `docs/images/` contains the static diagrams referenced by the project README.
+- `docs/report/` contains this source-verified narrative analysis set.
+
+### `gds/`
+
+`gds/0/gpu.gds` and `gds/1/gpu.gds` are physical-layout artifacts. The repository does not explain how they were generated or what distinguishes the two directories, so they should be treated as concrete artifacts with undocumented provenance.
+
+## Supporting project files
+
+### `README.md`
+
+Acts as the main conceptual document. It explains the motivation for the project, the top-level architecture, the ISA, the example kernels, the simulation flow, and a roadmap of future enhancements.
+
+### `Makefile`
+
+Encodes the practical compile and test entrypoints. It transpiles SystemVerilog with `sv2v`, compiles the generated Verilog with `iverilog`, and launches cocotb via `vvp`.
+
+### `.gitignore`
+
+Ignores Python caches, `build/`, generated logs, some generated GDS-adjacent artifacts, `.DS_Store`, and `results.xml`.
+
+## Module hierarchy snapshot
+
+```mermaid
+flowchart TD
+ GPU[gpu.sv] --> DCR[dcr.sv]
+ GPU --> Dispatch[dispatch.sv]
+ GPU --> CtrlData["controller.sv<br/>(data memory)"]
+ GPU --> CtrlProg["controller.sv<br/>(program memory)"]
+ GPU --> Core[core.sv x NUM_CORES]
+
+ Core --> Scheduler[scheduler.sv]
+ Core --> Fetcher[fetcher.sv]
+ Core --> Decoder[decoder.sv]
+ Core --> ALU[alu.sv x THREADS_PER_BLOCK]
+ Core --> LSU[lsu.sv x THREADS_PER_BLOCK]
+ Core --> Regs[registers.sv x THREADS_PER_BLOCK]
+ Core --> PC[pc.sv x THREADS_PER_BLOCK]
+```
+
+## Architecture seams that map cleanly to documentation
+
+- **Host/control seam** — `start`, `done`, and the device control register
+- **Dispatch seam** — block formation and assignment to cores
+- **Core execution seam** — fetch/decode/schedule/update lifecycle
+- **Memory seam** — internal requesters versus external memory interfaces
+- **Thread-local seam** — replicated ALU/LSU/register/PC resources under shared scheduling
+- **Verification seam** — RTL DUT versus Python memory model and trace logging
diff --git a/docs/report/02-architecture-and-execution.md b/docs/report/02-architecture-and-execution.md
new file mode 100644
index 0000000..0c8a794
--- /dev/null
+++ b/docs/report/02-architecture-and-execution.md
@@ -0,0 +1,207 @@
+# Architecture and Execution
+
+## Top-level architecture
+
+The design centers on the `gpu` module in `src/gpu.sv`. External DeepWiki material describes the project using the same broad decomposition seen in the local RTL: top-level GPU control, block dispatch, memory arbitration, and per-core execution resources. The wording in this report is still grounded primarily in the checked-in source.
+
+The top-level interface makes the execution model explicit:
+
+- host-side control (`start`, `done`, and the device-control write interface)
+- external program-memory read interface
+- external data-memory read/write interface
+
+The top-level module wires together four major subsystems:
+
+1. `dcr` — stores the thread count for the active kernel
+2. `dispatch` — divides threads into blocks and assigns them to cores
+3. `controller` instances — arbitrate between internal requesters and external memory channels
+4. `core` instances — execute blocks of threads
+
+## System architecture diagram
+
+```mermaid
+flowchart TD
+ Host[Host / Testbench] --> DCR[Device Control Register]
+ Host --> PM[Program Memory]
+ Host --> DM[Data Memory]
+ Host --> Start[start]
+
+ DCR --> Dispatch[Dispatcher]
+ Start --> Dispatch
+ Dispatch --> Cores[Compute Cores]
+
+ Cores --> ProgCtrl[Program Memory Controller]
+ ProgCtrl --> PM
+
+ Cores --> DataCtrl[Data Memory Controller]
+ DataCtrl --> DM
+
+ subgraph CoreInternals[One Core]
+ Scheduler[Scheduler] --> Fetcher[Fetcher]
+ Scheduler --> Decoder[Decoder]
+ Scheduler --> Threads[Per-thread ALU / LSU / Registers / PC]
+ end
+```
+
+## Confirmed execution model
+
+The core execution pipeline is not just described in prose; it is encoded directly in `src/scheduler.sv` as a state machine:
+
+- `IDLE`
+- `FETCH`
+- `DECODE`
+- `REQUEST`
+- `WAIT`
+- `EXECUTE`
+- `UPDATE`
+- `DONE`
+
+The scheduler processes one block at a time through these stages. `WAIT` explicitly inspects LSU state and does not advance until no thread is still waiting on memory.
+
+## Instruction flow
+
+```mermaid
+flowchart LR
+ Idle[IDLE] --> Fetch[FETCH]
+ Fetch --> Decode[DECODE]
+ Decode --> Request[REQUEST]
+ Request --> Wait[WAIT]
+ Wait --> Execute[EXECUTE]
+ Execute --> Update[UPDATE]
+ Update -->|RET or block complete| Done[DONE]
+ Update -->|next instruction| Fetch
+```
+
+## Per-core organization
+
+Each `core` contains:
+
+- one `fetcher`
+- one `decoder`
+- one `scheduler`
+- one ALU per supported thread
+- one LSU per supported thread
+- one register file per supported thread
+- one PC unit per supported thread
+
+This replication is created with a `generate` block in `src/core.sv`, so the amount of thread-local hardware scales with `THREADS_PER_BLOCK`.
+
+## Core-to-thread decomposition
+
+```mermaid
+flowchart TD
+ Core[core.sv] --> Fetcher[fetcher.sv]
+ Core --> Decoder[decoder.sv]
+ Core --> Scheduler[scheduler.sv]
+ Core --> ThreadUnits[Thread units]
+
+ ThreadUnits --> ALU[alu.sv]
+ ThreadUnits --> LSU[lsu.sv]
+ ThreadUnits --> Registers[registers.sv]
+ ThreadUnits --> PC[pc.sv]
+```
+
+## SIMD-style thread model
+
+The SIMD flavor comes from replicated per-thread state plus shared control flow. Confirmed details from `src/registers.sv`:
+
+- 16 registers per thread
+- `R0` through `R12` are writable general-purpose registers
+- register 13 stores `%blockIdx`
+- register 14 stores `%blockDim`
+- register 15 stores `%threadIdx`
+
+This lets each thread execute the same decoded instruction stream while operating on different local data.
+
+## Dispatch behavior
+
+`src/dispatch.sv` computes the number of blocks as:
+
+```text
+total_blocks = (thread_count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK
+```
+
+It then assigns blocks to available cores, tracking both `blocks_dispatched` and `blocks_done`. The final block may contain fewer threads than `THREADS_PER_BLOCK`; that case is handled by passing a reduced `core_thread_count`.
+
+## Memory system
+
+The memory model is intentionally simple but concrete.
+
+### Program memory
+
+- read-only at the controller level (`WRITE_ENABLE = 0` in `gpu.sv`)
+- one fetcher per core issues requests
+- fetched instructions are 16 bits wide
+
+### Data memory
+
+- read/write through LSU-generated requests
+- multiple LSU consumers share a controller
+- the controller tracks which consumer each channel is currently serving
+
+The `controller` module is therefore a reusable arbitration layer rather than a one-off memory wrapper.
+
+## Memory and control relationships
+
+```mermaid
+flowchart TD
+ Fetcher[Fetcher] --> ProgCtrl[Program Memory Controller]
+ ProgCtrl --> ProgMem[Program Memory]
+
+ LSUs[Per-thread LSUs] --> DataCtrl[Data Memory Controller]
+ DataCtrl --> DataMem[Data Memory]
+
+ Scheduler[Scheduler] --> Fetcher
+ Scheduler --> LSUs
+ Decoder[Decoder] --> LSUs
+```
+
+## Branching and convergence
+
+The design supports `CMP` and `BRnzp`, but branch handling is intentionally naive.
+
+Confirmed from `src/scheduler.sv`, `src/pc.sv`, and `src/core.sv`:
+
+- each thread computes its own `next_pc`
+- the scheduler updates the shared `current_pc` specifically from `next_pc[THREADS_PER_BLOCK-1]`
+- the source contains an explicit TODO around branch divergence
+
+The practical interpretation is that the current implementation assumes per-block control-flow convergence rather than supporting true branch divergence.
+
+There is also a narrower implementation caveat for short final blocks: `pc` instances are enabled only when `i < thread_count` in `src/core.sv`, but the scheduler still takes `next_pc[THREADS_PER_BLOCK-1]` as the block-wide next PC. In a partial block, that means the shared PC is sourced from the last thread slot even when that slot is not active.
+
+## ISA implementation notes
+
+`src/decoder.sv` decodes the following opcode set:
+
+- `NOP`
+- `BRnzp`
+- `CMP`
+- `ADD`
+- `SUB`
+- `MUL`
+- `DIV`
+- `LDR`
+- `STR`
+- `CONST`
+- `RET`
+
+That matches the ISA described in the README.
+
+## DeepWiki-aligned interpretation
+
+DeepWiki’s external documentation is useful here because it reinforces the same top-level understanding without changing the source-grounded conclusions:
+
+- `gpu.sv` is the orchestration boundary
+- `dispatch.sv` is the scheduling boundary between kernel-wide work and core-local work
+- `core.sv` is the composition boundary for execution resources
+- `scheduler.sv` is the canonical control-flow definition for instruction progression
+- `controller.sv` is the memory arbitration boundary shared by instruction and data access paths
+
+## Implemented behavior versus conceptual roadmap
+
+The README discusses cache, branch divergence, memory coalescing, pipelining, and warp scheduling. In the checked-in RTL, those are future-facing concepts rather than implemented subsystems. Any architecture explanation should therefore separate:
+
+- **implemented today** — the modules in `src/` and their current behaviors
+- **described conceptually** — the README’s explanations of how real GPUs are optimized
+- **planned work** — items called out in the README’s future-facing sections
diff --git a/docs/report/03-build-and-test-workflow.md b/docs/report/03-build-and-test-workflow.md
new file mode 100644
index 0000000..8bd6624
--- /dev/null
+++ b/docs/report/03-build-and-test-workflow.md
@@ -0,0 +1,148 @@
+# Build and Test Workflow
+
+## Toolchain
+
+The current repository workflow depends on:
+
+- `sv2v`
+- `iverilog`
+- `vvp`
+- `cocotb`
+- Python for the cocotb tests and helpers
+- optionally `gtkwave` for waveform inspection
+
+The README also instructs users to create a `build/` directory manually before running the flow.
+
+## Confirmed Makefile behavior
+
+From `Makefile`:
+
+### `make compile`
+
+1. Runs `make compile_alu`
+2. Runs `sv2v -I src/* -w build/gpu.v`
+3. Appends `build/alu.v` into `build/gpu.v`
+4. Prepends a `` `timescale 1ns/1ns `` line via a temporary file
+
+### `make test_matadd` / `make test_matmul`
+
+1. Calls `make compile`
+2. Runs:
+
+```bash
+iverilog -o build/sim.vvp -s gpu -g2012 build/gpu.v
+```
+
+3. Runs cocotb through `vvp` with:
+
+```bash
+MODULE=test.test_$* vvp -M $(cocotb-config --prefix)/cocotb/libs -m libcocotbvpi_icarus build/sim.vvp
+```
+
+That means `make test_matadd` loads `test.test_matadd`, and `make test_matmul` loads `test.test_matmul`.
+
+## Tool flow diagram
+
+```mermaid
+flowchart TD
+ Start[make test_matadd or make test_matmul] --> Compile[make compile]
+ Compile --> SV2V[sv2v converts SystemVerilog to Verilog]
+ SV2V --> Icarus[iverilog builds build/sim.vvp]
+ Icarus --> VVP[vvp launches simulator runtime]
+ VVP --> Cocotb[cocotb test module runs]
+ Cocotb --> Memory[Python memory models serve program/data requests]
+ Cocotb --> Logs[test/logs/log_*.txt]
+```
+
+## Testbench behavior
+
+The cocotb setup in `test/helpers/setup.py` does the following:
+
+1. starts a clock with `Clock(dut.clk, 25, units="us")`
+2. asserts then deasserts reset
+3. loads program memory contents from a Python list
+4. loads data memory contents from a Python list
+5. writes the desired thread count into the DUT device-control register
+6. asserts `dut.start`
+
+`setup()` does not deassert `dut.start` afterward, so the testbench effectively drives a level-high start condition for the remainder of the run.
+
+The main test loop then repeatedly:
+
+- runs the software memory models (`program_memory.run()` and `data_memory.run()`)
+- waits for a read-only phase
+- logs the trace state
+- advances one clock edge
+
+Execution ends when `dut.done` becomes `1`.
+
+## Why the cocotb structure matches the official model
+
+The repository’s cocotb code follows the documented cocotb pattern closely:
+
+- tests are `async` Python coroutines
+- DUT signals are accessed as `dut..value`
+- the testbench uses `Clock` to drive `dut.clk`
+- synchronization uses triggers such as `RisingEdge` and read-only phases
+
+That is exactly the kind of flow cocotb’s documentation presents for coroutine-based HDL tests.
+
+## Why the Icarus command line matches the official model
+
+The `iverilog` invocation also matches the documented usage:
+
+- `-g2012` enables IEEE 1800-2012 / SystemVerilog language support
+- `-s gpu` explicitly selects `gpu` as the root module for elaboration
+- `vvp -M ... -m libcocotbvpi_icarus` loads the cocotb VPI module into the simulator runtime
+
+This is the key bridge between the generated Verilog simulation and the Python cocotb test process.
+
+## `sv2v` in this repository
+
+The `sv2v` project documents itself as a SystemVerilog-to-Verilog converter aimed at synthesizable constructs, with `-w` used to write output files and `-I` / `--incdir` used to extend the include search path. That explains the intent of this repository’s compile stage, which as written emits generated Verilog into `build/` before invoking `iverilog`.
+
+One subtle but important nuance from the official `sv2v` usage docs: users usually pass all relevant SystemVerilog files together so the converter can resolve cross-file constructs. This repository instead runs the atypical compile rules exactly as written in the `Makefile`: `alu.sv` is converted separately and appended into `build/gpu.v`, while the main `sv2v -I src/* -w build/gpu.v` invocation does not explicitly list input files in the rule itself. The report therefore describes the Makefile faithfully, but should not imply that this is the standard or obviously complete `sv2v` invocation pattern.
+
+## Memory simulation strategy
+
+The hardware does not connect to an external simulator-owned memory model directly. Instead, `test/helpers/memory.py` emulates the external memories in Python by:
+
+- decoding DUT read-valid signals and addresses
+- returning ready/data when a valid request is present
+- updating internal Python memory arrays on writes
+
+This keeps the tests compact and makes memory behavior easy to inspect in logs, but it also means timing is idealized compared with a more realistic external-memory model.
+
+## Output artifacts
+
+The logger in `test/helpers/logger.py` writes trace logs to:
+
+```text
+test/logs/log_<test name>.txt
+```
+
+These logs include:
+
+- memory dumps
+- instruction traces
+- core state
+- fetcher and LSU state
+- register contents
+
+If waveform dumping is enabled separately, GTKWave is the natural viewer for offline inspection of those dump files. GTKWave’s own docs describe it as a post-mortem waveform browser for formats such as VCD and related traces.
+
+## External reference map
+
+| Tool | Repo use | Most relevant official concepts |
+| --- | --- | --- |
+| `cocotb` | Python coroutine testbench | `@cocotb.test`, `Clock`, `RisingEdge`, `ReadOnly`, DUT signal access |
+| `iverilog` | compile generated Verilog | `-g2012`, `-s <top module>`, Verilog/SystemVerilog elaboration |
+| `vvp` | run compiled simulation | `-M` module search path, `-m` load VPI module |
+| `sv2v` | convert SystemVerilog before compile | `-w` output file, `-I/--incdir`, synthesizable SystemVerilog focus |
+| `gtkwave` | optional debug viewer | offline waveform browsing and signal inspection |
+
+## Notable rough edges
+
+- The `Makefile` does not create `build/`, so a first-time user who skips the README setup step will fail early.
+- `test/test_matmul.py` exports a test function named `test_matadd`, which is confusing even though the module name still determines which file cocotb loads.
+- The flow is optimized for local learning and simulation, not for a polished CI or packaging experience.
diff --git a/docs/report/04-risks-open-questions.md b/docs/report/04-risks-open-questions.md
new file mode 100644
index 0000000..c0f8c50
--- /dev/null
+++ b/docs/report/04-risks-open-questions.md
@@ -0,0 +1,74 @@
+# Risks, Limitations, and Open Questions
+
+## Confirmed current limitations
+
+These points are supported by the checked-in source, not merely by the README’s narrative.
+
+### 1. No implemented cache module
+
+The README describes cache as part of the architecture, but the source tree contains no cache module and `gpu.sv` instantiates no cache layer. This is the single largest source-to-documentation mismatch in the repo.
+
+### 2. No branch divergence support
+
+`scheduler.sv` contains an explicit TODO indicating that branch divergence is not implemented and that the current design assumes next-PC convergence. That keeps the design simple, but it also means kernels that require divergent per-thread control flow are out of scope.
+
+### 3. No pipelining or overlap beyond basic async memory waiting
+
+The scheduler waits for one instruction lifecycle to finish before progressing to the next instruction. That matches the repository’s educational goal, but it also means throughput-oriented optimizations are intentionally absent.
+
+### 4. Simple memory timing assumptions in tests
+
+The Python memory model responds as soon as valid requests are observed. This is good for clarity, but it idealizes memory timing and does not stress the design with realistic latency or backpressure behavior.
+
+## Documented versus implemented
+
+```mermaid
+flowchart LR
+ Docs[README conceptual model] --> Shared[Current overlap]
+ Source[src/ implemented RTL] --> Shared
+
+ Docs --> Roadmap[Cache / divergence / coalescing / pipelining / warp scheduling]
+ Source --> Current[DCR / dispatch / controllers / cores / scheduler / cocotb harness]
+```
+
+The safest interpretation is:
+
+- treat `README.md` as the conceptual guide
+- treat `src/` as the source of truth for implemented behavior
+- treat advanced features as roadmap items unless concrete RTL supports them
+
+## Repository inconsistencies worth preserving in docs
+
+### README uses present-tense architecture language for cache
+
+That wording can lead a new reader to expect cache logic in `src/`, but the current checkout does not support that expectation.
+
+### `test/test_matmul.py` naming mismatch
+
+The file is clearly the matrix-multiplication test, but its exported cocotb function is named `test_matadd`. That does not invalidate the test flow, yet it is still a maintenance smell.
+
+### Build-directory prerequisite lives in README, not in automation
+
+The current flow assumes `build/` already exists. A more ergonomic local workflow would create it from the Makefile itself.
+
+## Open questions raised by the current checkout
+
+### GDS provenance
+
+`gds/0/gpu.gds` and `gds/1/gpu.gds` exist, but the repository does not explain:
+
+- what flow produced them
+- what distinguishes `0/` from `1/`
+- whether they represent revisions, experiments, or packaging targets
+
+### Physical-design context
+
+There are hints that physical-design constraints shaped parts of the RTL, but the repository does not include a dedicated note explaining the larger downstream flow.
+
+### Scope of future work versus architectural commitments
+
+The README’s “Next Steps” section mentions cache, branch divergence, memory coalescing, pipelining, graphics, and other extensions. Those should be read as roadmap intent, not as partially implemented subsystems, unless the source clearly shows otherwise.
+
+## Recommended interpretation for future readers
+
+This repository is best understood as a compact educational GPU whose main value is architectural clarity. The code is concrete enough to simulate real kernels, but many of the README’s advanced topics are there to teach what real GPUs do next, not to claim that tiny-gpu already does it.
diff --git a/docs/report/05-matadd-trace-walkthrough.md b/docs/report/05-matadd-trace-walkthrough.md
new file mode 100644
index 0000000..9071d1c
--- /dev/null
+++ b/docs/report/05-matadd-trace-walkthrough.md
@@ -0,0 +1,373 @@
+# matadd Execution Trace Walkthrough
+
+## Goal of this report
+
+This chapter explains how to read the `matadd` simulation trace as a beginner.
+
+Instead of translating every repeated log line, it focuses on the moments that actually teach you how the design works:
+
+- how the testbench sets up the run
+- what one scheduler cycle means
+- why the first real instruction does not appear immediately
+- how eight threads move through the same instruction stream
+- when the actual data-memory writes happen
+
+All observations below are grounded in the log file from a fresh run:
+
+- log: `test/logs/log_20260402201022.txt`
+- completed in: `178 cycles`
+
+## What this test is trying to prove
+
+`test/test_matadd.py` loads a tiny kernel which adds two 1×8 vectors stored in data memory.
+
+The input layout is:
+
+- addresses `0..7`: matrix A = `0 1 2 3 4 5 6 7`
+- addresses `8..15`: matrix B = `0 1 2 3 4 5 6 7`
+- addresses `16..23`: output buffer, initially all zero
+
+The expected result is:
+
+```text
+0 2 4 6 8 10 12 14
+```
+
+The final memory dump in the log confirms exactly that:
+
+- `data[16..23] = 0, 2, 4, 6, 8, 10, 12, 14`
+
+## What the testbench does every cycle
+
+The cocotb harness is simple but very important to understand.
+
+From `test/helpers/setup.py` and `test/test_matadd.py`, the flow is:
+
+1. start the clock
+2. pulse reset
+3. load program memory with the kernel instructions
+4. load data memory with A and B
+5. write `threads = 8` into the device control register
+6. raise `start`
+7. on every loop iteration:
+ - `data_memory.run(cycle=cycles)` services data-memory reads/writes
+ - `program_memory.run()` services instruction fetches
+ - `format_cycle(dut, cycles)` logs the current machine state
+ - then the test advances one rising edge
+
+That means the trace is a combined view of:
+
+- the RTL state inside the GPU
+- the Python-side memory model acting like “external memory”
+
+## How to read one trace block
+
+Every cycle dump follows the same structure.
+
+For each active core, and then for each active thread in that core, the logger shows:
+
+- `PC`
+- decoded `Instruction`
+- `Core State`
+- `Fetcher State`
+- `LSU State`
+- all register values
+- selected datapath outputs like `ALU Out`, `LSU Out`, or `Constant`
+
+For this design, the scheduler state machine is the key rhythm:
+
+```text
+FETCH -> DECODE -> REQUEST -> WAIT -> EXECUTE -> UPDATE
+```
+
+Even pure ALU or CONST instructions still pass through this full sequence, because the scheduler is intentionally simple and uniform.
+
+## Step 1: initial memory state and idle machine
+
+At the very top of the log, before cycle 0, the testbench prints the initial data memory table.
+
+Then cycle 0 shows both cores still idle.
+
+Important details from cycle 0:
+
+- Core 0 contains logical threads `0..3`
+- Core 1 contains logical threads `4..7`
+- `%blockDim = 4`, so each core handles a block of four threads
+- `%blockIdx = 0` on Core 0 and `%blockIdx = 1` on Core 1 once dispatch is active
+
+At cycle 0 the trace still shows `Instruction: NOP` and `Core State: IDLE`. That is not a bug. It just means the kernel launch has been requested, but the fetch pipeline has not yet produced the first instruction word.
+
+## Step 2: why the first real instruction appears at cycle 7
+
+From cycles `0..6`, the machine transitions from `IDLE` into fetch activity.
+
+At cycle 6, both cores are already in:
+
+- `Core State: FETCH`
+- `Fetcher State: FETCHING`
+
+But the visible instruction is still `NOP`.
+
+At cycle 7, the first real instruction appears:
+
+```text
+PC: 0
+Instruction: MUL R0, %blockIdx, %blockDim
+Core State: FETCH
+Fetcher State: FETCHED
+```
+
+This is the first good example of how to read the fetcher:
+
+- `FETCHING` means the request is still in flight
+- `FETCHED` means the instruction bits have arrived and can now be decoded on later cycles
+
+So the reason the log starts with several cycles of `NOP` is simply that the fetch pipeline takes a few cycles before the first instruction becomes visible in the core.
+
+## Step 3: one instruction takes six scheduler phases
+
+The first instruction is:
+
+```text
+MUL R0, %blockIdx, %blockDim
+```
+
+Conceptually, this computes the block base index:
+
+```text
+R0 = blockIdx * blockDim
+```
+
+In the trace, one instruction does **not** complete in one cycle.
+It walks through the scheduler phases:
+
+- `FETCH`
+- `DECODE`
+- `REQUEST`
+- `WAIT`
+- `EXECUTE`
+- `UPDATE`
+
+This is why a single kernel instruction occupies many trace entries. The trace is showing the scheduler micro-steps, not just ISA-level steps.
+
+## Step 4: how the eight threads get different indices while sharing the same instruction stream
+
+The second kernel instruction is:
+
+```text
+ADD R0, R0, %threadIdx
+```
+
+That is the classic GPU indexing step:
+
+```text
+i = blockIdx * blockDim + threadIdx
+```
+
+Because all threads execute the same instruction but each thread has a different `%threadIdx`, the register values diverge naturally.
+
+For example:
+
+- thread 0 ends with `R0 = 0`
+- thread 1 ends with `R0 = 1`
+- ...
+- thread 7 ends with `R0 = 7`
+
+This is one of the most important “GPU ideas” visible in the trace:
+
+> same instruction stream, different per-thread register state
+
+## Step 5: loading constants establishes the memory layout
+
+The three `CONST` instructions load:
+
+- `R1 = 0` → base of A
+- `R2 = 8` → base of B
+- `R3 = 16` → base of C
+
+After that, each thread can compute addresses using only register arithmetic.
+
+This is why later `ADD` instructions are easy to interpret:
+
+- `ADD R4, R1, R0` means “address of A[i]”
+- `ADD R5, R2, R0` means “address of B[i]”
+- `ADD R7, R3, R0` means “address of C[i]”
+
+## Step 6: the first real memory-dependent phase is `LDR R4, R4`
+
+A very educational trace point appears around cycle 73.
+
+At that point, Core 0 shows:
+
+```text
+PC: 6
+Instruction: LDR R4, R4
+```
+
+And each thread already has a different `R4`:
+
+- thread 0: `R4 = 0`
+- thread 1: `R4 = 1`
+- thread 2: `R4 = 2`
+- thread 3: `R4 = 3`
+
+So this one ISA instruction means:
+
+- thread 0 loads `A[0]`
+- thread 1 loads `A[1]`
+- thread 2 loads `A[2]`
+- thread 3 loads `A[3]`
+
+At this point the LSU becomes interesting, because `LDR` and `STR` are the instructions that actually interact with the external Python memory model.
+
+When reading these parts of the log, pay attention to:
+
+- `LSU State`
+- `Core State: WAIT`
+
+Those two fields tell you when the machine is stalled waiting for memory rather than computing.
+
+## Step 7: why stores show up as `[memwrite]` lines
+
+The actual memory writes are not only visible through thread state. They are also explicitly emitted by `test/helpers/memory.py` as lines like:
+
+```text
+[memwrite] data cycle=151 lane=0 addr=19 old=0 new=6
+```
+
+These lines are generated by the Python memory model when it sees a valid data-memory write request.
+
+That makes them the clearest proof of “the kernel has written its results.”
+
+## Step 8: the first result-store burst happens at cycle 151
+
+The first store burst in the log is:
+
+```text
+[memwrite] data cycle=151 lane=0 addr=19 old=0 new=6
+[memwrite] data cycle=151 lane=1 addr=18 old=0 new=4
+[memwrite] data cycle=151 lane=2 addr=17 old=0 new=2
+[memwrite] data cycle=151 lane=3 addr=16 old=0 new=0
+```
+
+This is Core 0 writing results for threads `0..3`.
+
+Notice two things:
+
+1. the addresses are reversed by lane order in the log output (`19, 18, 17, 16`)
+2. but the values are exactly the expected sums for the first four elements:
+
+- `C[0] = 0 + 0 = 0`
+- `C[1] = 1 + 1 = 2`
+- `C[2] = 2 + 2 = 4`
+- `C[3] = 3 + 3 = 6`
+
+Around this same point, the threads are sitting on:
+
+```text
+Instruction: STR R7, R6
+Core State: WAIT
+LSU State: WAITING
+```
+
+That combination tells you the core has already issued the store and is waiting for the memory-side handshake to complete.
+
+## Step 9: the second result-store burst happens at cycle 158
+
+The second store burst is:
+
+```text
+[memwrite] data cycle=158 lane=0 addr=23 old=0 new=14
+[memwrite] data cycle=158 lane=1 addr=22 old=0 new=12
+[memwrite] data cycle=158 lane=2 addr=21 old=0 new=10
+[memwrite] data cycle=158 lane=3 addr=20 old=0 new=8
+```
+
+This is Core 1 writing results for threads `4..7`:
+
+- `C[4] = 4 + 4 = 8`
+- `C[5] = 5 + 5 = 10`
+- `C[6] = 6 + 6 = 12`
+- `C[7] = 7 + 7 = 14`
+
+So the two cores finish their store phases in two distinct bursts:
+
+- Core 0 writes addresses `16..19`
+- Core 1 writes addresses `20..23`
+
+This is a nice concrete example of how the blocks are split across cores.
+
+## Step 10: why there are duplicate-looking memwrite lines on the next cycle
+
+You also see another burst immediately after each first burst:
+
+```text
+[memwrite] data cycle=152 ... old=6 new=6
+...
+[memwrite] data cycle=159 ... old=14 new=14
+```
+
+These look redundant because they are redundant from a data-content perspective.
+
+The most useful beginner interpretation is:
+
+> the memory model is still observing asserted write-valid behavior across another cycle, so it logs another write handshake even though the stored value is unchanged.
+
+The important lesson is not “there are two different mathematical writes.”
+The important lesson is:
+
+- the first burst proves the result values were produced
+- the second burst is a protocol-level repeat, not a second different answer
+
+## Step 11: end of execution
+
+Near the end of the log, the machine returns to idle and the logger prints:
+
+```text
+Completed in 178 cycles
+```
+
+Then the final data-memory table shows:
+
+```text
+Addr 16..23 = 0, 2, 4, 6, 8, 10, 12, 14
+```
+
+So the full story of the run is:
+
+1. launch two 4-thread blocks
+2. fetch the kernel
+3. compute each thread’s global index
+4. load A[i]
+5. load B[i]
+6. add them into `R6`
+7. compute destination address in `R7`
+8. store to output buffer
+9. return
+
+## What to focus on when you read this log yourself
+
+If you reopen the log, do **not** try to understand every line equally.
+
+Read in this order:
+
+1. initial data-memory dump
+2. cycle 7 — first real fetched instruction
+3. the first `ADD R0, R0, %threadIdx` sequence
+4. the first `LDR` sequence around cycle 73
+5. the `[memwrite]` bursts at cycles 151 and 158
+6. final data-memory dump
+
+That reading order gives you the high-level execution story first.
+
+## Beginner takeaway
+
+This trace is a good example of SIMD execution in a tiny GPU:
+
+- one shared control flow per core
+- one private register file per thread
+- identical instructions across threads
+- different data because `%threadIdx` and `%blockIdx` differ
+- memory latency exposed through `WAIT` and `LSU State`
+
+Once that picture clicks, the log becomes much easier to read.
diff --git a/docs/report/06-matmul-trace-walkthrough.md b/docs/report/06-matmul-trace-walkthrough.md
new file mode 100644
index 0000000..98028c8
--- /dev/null
+++ b/docs/report/06-matmul-trace-walkthrough.md
@@ -0,0 +1,422 @@
+# matmul Execution Trace Walkthrough
+
+## Goal of this report
+
+This chapter explains the `matmul` execution trace in a way that is useful for a Verilog beginner.
+
+This kernel is more interesting than `matadd` because it contains:
+
+- address arithmetic
+- a loop
+- a compare instruction
+- a branch instruction
+- repeated loads and multiplies
+- a final store after the loop finishes
+
+All observations below are grounded in a fresh run:
+
+- log: `test/logs/log_20260402201026.txt`
+- completed in: `491 cycles`
+
+## What this test is trying to prove
+
+`test/test_matmul.py` multiplies two 2×2 matrices:
+
+- matrix A at addresses `0..3` = `1 2 3 4`
+- matrix B at addresses `4..7` = `1 2 3 4`
+- output C at addresses `8..11`, initially zero
+
+The expected result matrix is:
+
+```text
+7 10
+15 22
+```
+
+The final memory dump confirms:
+
+- `data[8..11] = 7, 10, 15, 22`
+
+## One very important logging detail
+
+Unlike `matadd`, this test does **not** log every thread’s full state.
+
+It calls:
+
+```python
+format_cycle(dut, cycles, thread_id=1)
+```
+
+So the per-cycle trace is filtered to **logical thread 1 only**.
+
+That means:
+
+- register dumps are only for thread 1
+- instruction/state lines are only for thread 1
+- but `[memwrite]` lines still show all four lanes when the store happens
+
+This is why the report focuses on thread 1 as the “main character,” then uses the final memwrite burst to show that all threads completed.
+
+## What thread 1 is computing
+
+For a 2×2 matrix multiply, there are 4 output elements, so there are 4 threads.
+
+Thread 1 corresponds to global index `i = 1`.
+
+The kernel computes:
+
+- `row = i // N`
+- `col = i % N`
+
+with `N = 2`.
+
+So for thread 1:
+
+- `row = 0`
+- `col = 1`
+
+That means thread 1 is responsible for output element:
+
+```text
+C[0,1]
+```
+
+Mathematically:
+
+```text
+C[0,1] = A[0,0] * B[0,1] + A[0,1] * B[1,1]
+ = 1 * 2 + 2 * 4
+ = 10
+```
+
+The final trace shows thread 1 ends with `R8 = 10`, which matches that expected dot product.
+
+## How to read the scheduler rhythm
+
+Just like `matadd`, each ISA instruction goes through the same scheduler stages:
+
+```text
+FETCH -> DECODE -> REQUEST -> WAIT -> EXECUTE -> UPDATE
+```
+
+This is why even simple instructions like `CONST` appear across multiple cycles.
+
+For example, around cycles `40..45`, the trace shows `CONST R2, #2` moving through:
+
+- FETCH
+- DECODE
+- REQUEST
+- WAIT
+- EXECUTE
+- UPDATE
+
+Only after that does `R2` actually become `2` in the register dump.
+
+That is the right beginner mental model:
+
+> the trace is showing the scheduler’s control steps, not just the instruction list in assembly.
+
+## Step 1: startup and first real instruction
+
+As in `matadd`, the early cycles are not yet “real compute.”
+
+At cycles `0..6`, thread 1 is still showing `NOP` while fetch is warming up.
+
+At cycle 7, the first real instruction appears:
+
+```text
+PC: 0
+Instruction: MUL R0, %blockIdx, %blockDim
+```
+
+Then the second instruction is:
+
+```text
+ADD R0, R0, %threadIdx
+```
+
+For thread 1, that gives:
+
+```text
+R0 = 1
+```
+
+which is its global thread index.
+
+## Step 2: setting up constants and thread coordinates
+
+The next few instructions establish the constants needed for matrix math:
+
+- `R1 = 1` → increment value for loop counter
+- `R2 = 2` → matrix dimension N
+- `R3 = 0` → base of A
+- `R4 = 4` → base of B
+- `R5 = 8` → base of C
+
+Then the kernel computes thread-specific coordinates:
+
+- `DIV R6, R0, R2` → row
+- `MUL R7, R6, R2`
+- `SUB R7, R0, R7` → col
+
+For thread 1 the resulting interpretation is:
+
+- `R6 = 0` → row 0
+- `R7 = 1` → col 1
+
+This exactly matches the expected output location `C[0,1]`.
+
+## Step 3: the loop body starts at PC 12
+
+The loop body begins at program counter 12.
+
+Around cycle 296, after the first branch-back, the trace shows:
+
+```text
+PC: 12
+Instruction: MUL R10, R6, R2
+```
+
+That is the first line of the loop body.
+
+From there the loop computes two addresses:
+
+1. address into A using `row * N + k + baseA`
+2. address into B using `k * N + col + baseB`
+
+Then it performs:
+
+- `LDR R10, R10`
+- `LDR R11, R11`
+- `MUL R12, R10, R11`
+- `ADD R8, R8, R12`
+
+That is the classic dot-product inner loop.
+
+## Step 4: understanding one loop iteration for thread 1
+
+Thread 1 executes two loop iterations because `N = 2`.
+
+### First iteration (`k = 0`)
+
+The relevant values become:
+
+- load `A[0,0] = 1`
+- load `B[0,1] = 2`
+- multiply → `1 * 2 = 2`
+- accumulate → `R8 = 2`
+
+You can see this state clearly near the first branch-back region.
+Around cycles `284..291`, thread 1 has:
+
+- `R8 = 2`
+- `R9 = 1`
+- `R10 = 1`
+- `R11 = 2`
+- `R12 = 2`
+
+That register snapshot is exactly what you want after the first dot-product term has been accumulated.
+
+### First branch decision: loop continues
+
+At cycle 284 the trace is approaching the loop branch logic:
+
+```text
+PC: 24
+Instruction: CMP R9, R2
+```
+
+At this point:
+
+- `R9 = 1`
+- `R2 = 2`
+
+So the comparison means “is `k` still less than `N`?”
+
+The following branch sequence is visible in cycles `285..291`:
+
+- branch instruction fetched and executed
+- at cycle 291, `PC` jumps back to `12`
+
+That jump back to `PC: 12` is the proof that the loop continues for another iteration.
+
+## Step 5: second iteration (`k = 1`)
+
+In the second loop round, thread 1 should use:
+
+- `A[0,1] = 2`
+- `B[1,1] = 4`
+
+So the second product is:
+
+```text
+2 * 4 = 8
+```
+
+Adding that to the previous accumulator value gives:
+
+```text
+R8 = 2 + 8 = 10
+```
+
+By the time the loop-exit compare happens later, the trace confirms exactly that state.
+
+Near cycles `432..447`, thread 1 shows:
+
+- `R8 = 10`
+- `R9 = 2`
+- `R10 = 2`
+- `R11 = 4`
+- `R12 = 8`
+
+This is the clearest “the dot product is finished” snapshot in the trace.
+
+## Step 6: second branch decision means loop exit
+
+The second compare/branch sequence appears around cycles `432..447`.
+
+This time the key values are:
+
+- `R9 = 2`
+- `R2 = 2`
+
+So `k` has reached `N`.
+
+The branch no longer goes back to the loop body.
+
+You can see the difference in the PC progression:
+
+- after the branch update, the trace moves to `PC: 25`, then `PC: 26`
+- it does **not** jump back to `PC: 12`
+
+That is how the log shows “loop exit” without printing a high-level English sentence.
+
+For beginners, this is the most important branch-reading trick in the whole file:
+
+> watch where the PC goes after the branch completes.
+
+- if it returns to `PC 12`, the loop continues
+- if it advances to `PC 25/26`, the loop is over
+
+## Step 7: computing the output address
+
+After the loop exits, the kernel computes the destination address for C.
+
+Around cycle 459 the trace shows:
+
+```text
+PC: 26
+Instruction: ADD R9, R5, R0
+```
+
+At this moment:
+
+- `R5 = 8` → base of output matrix
+- `R0 = 1` → thread/global output index
+
+So thread 1 computes:
+
+```text
+R9 = 9
+```
+
+That is exactly the correct address for output element `C[0,1]`.
+
+## Step 8: final store and the all-thread memwrite burst
+
+At cycle 464 the next real instruction becomes:
+
+```text
+Instruction: STR R9, R8
+```
+
+For thread 1, that means:
+
+- store to address `R9 = 9`
+- store value `R8 = 10`
+
+Then the `[memwrite]` lines show that **all four lanes** write their results at cycle 471:
+
+```text
+[memwrite] data cycle=471 lane=0 addr=11 old=0 new=22
+[memwrite] data cycle=471 lane=1 addr=10 old=0 new=15
+[memwrite] data cycle=471 lane=2 addr=9 old=0 new=10
+[memwrite] data cycle=471 lane=3 addr=8 old=0 new=7
+```
+
+Even though the per-cycle trace only logs thread 1, these four memwrite lines reveal the final outputs of all threads:
+
+- addr 8 → 7
+- addr 9 → 10
+- addr 10 → 15
+- addr 11 → 22
+
+So the final matrix result is visible in one compact burst.
+
+Just like `matadd`, the next cycle also shows repeated writes with unchanged values:
+
+```text
+[memwrite] data cycle=472 ... old=22 new=22
+...
+```
+
+The safest interpretation is again that the write handshake remains visible for another cycle, not that the mathematical result changed.
+
+## Step 9: return and completion
+
+Near the end of the log, thread 1 reaches:
+
+```text
+PC: 27
+Instruction: RET
+Core State: DONE
+Core Done: 1
+```
+
+Then the test prints:
+
+```text
+Completed in 491 cycles
+```
+
+And the final memory table confirms:
+
+```text
+data[8..11] = 7, 10, 15, 22
+```
+
+## What to focus on when you read this log yourself
+
+A good reading order is:
+
+1. initial data-memory dump
+2. cycle 7 — first real fetched instruction
+3. the `DIV` / `SUB` steps that establish row and column
+4. the first `LDR R10` / `LDR R11` sequence
+5. the first `CMP` + branch-back sequence around cycles `284..291`
+6. the second `CMP` + branch-exit sequence around cycles `432..447`
+7. the final `STR R9, R8` sequence around cycles `464..471`
+8. final data-memory dump
+
+That reading order lets you see the kernel as:
+
+- setup
+- loop iteration 1
+- loop iteration 2
+- loop exit
+- final writeback
+
+## Beginner takeaway
+
+This trace is valuable because it shows how a tiny GPU kernel can still express a real control-flow pattern:
+
+- per-thread indexing
+- address generation
+- repeated loads from memory
+- accumulation in a register
+- a compare-and-branch loop
+- final store to global memory
+
+If `matadd` teaches “same instruction, different thread data,” then `matmul` teaches the next layer:
+
+> a GPU thread can also run a local sequential algorithm, and the trace shows exactly how that algorithm unfolds through fetch, wait, execute, and update phases.
diff --git a/docs/report/README.md b/docs/report/README.md
new file mode 100644
index 0000000..e77439a
--- /dev/null
+++ b/docs/report/README.md
@@ -0,0 +1,32 @@
+# tiny-gpu Repository Analysis Report
+
+This directory contains a source-verified documentation set for the `tiny-gpu` repository.
+
+## Files
+
+- `00-executive-overview.md` — project purpose, scope, major findings, and external toolchain summary
+- `01-repository-structure.md` — top-level layout, module inventory, and hierarchy mapping
+- `02-architecture-and-execution.md` — hardware organization, execution stages, and key module relationships
+- `03-build-and-test-workflow.md` — compile flow, cocotb harness behavior, and simulator/tooling references
+- `04-risks-open-questions.md` — confirmed limitations, documentation mismatches, and roadmap boundaries
+- `05-matadd-trace-walkthrough.md` — cycle-by-cycle guided reading of the matrix-addition execution log
+- `06-matmul-trace-walkthrough.md` — cycle-by-cycle guided reading of the matrix-multiplication execution log
+
+## Method
+
+This report was cross-checked against four sources of evidence:
+
+1. the local checkout (`README.md`, `Makefile`, all RTL modules in `src/`, and cocotb code in `test/`)
+2. the repository remote (`adam-maj/tiny-gpu`)
+3. DeepWiki repository documentation for `adam-maj/tiny-gpu`
+4. official tool documentation for `cocotb`, `iverilog`, `sv2v`, and `GTKWave`
+
+## Reading guide
+
+- Treat `src/` as the source of truth for implemented behavior.
+- Treat `README.md` as the conceptual guide and roadmap.
+- Treat external tool references in this report as usage context for the simulation flow, not as proof of repository behavior.
+
+## Diagram note
+
+Mermaid diagrams are included as source fences in the Markdown so the structure remains readable even when diagrams are not rendered by the viewer.
diff --git a/docs/setup-verilog-simulation.md b/docs/setup-verilog-simulation.md
new file mode 100644
index 0000000..b889d07
--- /dev/null
+++ b/docs/setup-verilog-simulation.md
@@ -0,0 +1,404 @@
+# tiny-gpu Setup Guide for a Systems Software Engineer
+
+This guide is written for someone who is comfortable with Linux, shells, compilers, and build systems, but is new to Verilog/SystemVerilog simulation.
+
+It covers:
+
+1. what this repository is doing at a high level
+2. what tools are required
+3. how to install them
+4. the exact compatibility issues I hit on this machine
+5. how to compile and run the simulations successfully
+
+---
+
+## 1. Mental model: what you are actually running
+
+If you come from systems software, the easiest analogy is this:
+
+- **SystemVerilog RTL (`src/*.sv`)** is the hardware design source code
+- **`sv2v`** is a source-to-source translator that converts SystemVerilog into plain Verilog
+- **`iverilog`** is the compiler/elaborator for the generated Verilog
+- **`vvp`** is the simulation runtime that executes the compiled design
+- **`cocotb`** is the Python test harness that plays the role of a software testbench
+
+So the flow is roughly:
+
+```text
+SystemVerilog source
+ -> sv2v
+Verilog output
+ -> iverilog
+compiled simulation image (.vvp)
+ -> vvp + cocotb Python test
+simulated execution + assertions + logs
+```
+
+In this repo, cocotb acts like the “host system” for the GPU. It:
+
+- starts the clock
+- resets the DUT
+- loads program memory and data memory
+- writes the thread count into the device control register
+- asserts `start`
+- waits until `done`
+- checks the output values
+
+---
+
+## 2. What this repository needs
+
+From the repo's `README.md` and `Makefile`, the practical requirements are:
+
+- Python 3
+- `pip`
+- `make`
+- `sv2v`
+- `iverilog`
+- `vvp` (comes with Icarus Verilog)
+- `cocotb`
+- a `build/` directory
+
+Optional:
+
+- `gtkwave` for waveform viewing
+
+---
+
+## 3. Important compatibility note: cocotb version matters here
+
+This repository's `Makefile` uses:
+
+```bash
+cocotb-config --prefix
+```
+
+That works with **cocotb 1.9.x**, but it does **not** work with cocotb 2.0.x, where `--prefix` was removed from `cocotb-config`.
+
+### What happened on this machine
+
+I first installed `cocotb 2.0.1`, and the repo became incompatible with the existing `Makefile`.
+
+The working fix was to pin cocotb to:
+
+```bash
+cocotb==1.9.2
+```
+
+If you keep the current `Makefile` unchanged, **use cocotb 1.9.2**.
+
+---
+
+## 4. Recommended install paths
+
+There are two reasonable setup modes.
+
+### Option A — easiest if you have sudo
+
+Install system packages via apt, and install cocotb with pip:
+
+```bash
+sudo apt-get update
+sudo apt-get install -y iverilog gtkwave unzip curl
+python3 -m pip install --user 'cocotb==1.9.2'
+```
+
+Then install `sv2v` from the official release:
+
+```bash
+mkdir -p "$HOME/.local/opt/downloads"
+curl -L https://github.com/zachjs/sv2v/releases/download/v0.0.13/sv2v-Linux.zip \
+ -o "$HOME/.local/opt/downloads/sv2v-Linux.zip"
+unzip -o "$HOME/.local/opt/downloads/sv2v-Linux.zip" \
+ -d "$HOME/.local/opt/downloads/sv2v-linux"
+ln -sfn "$HOME/.local/opt/downloads/sv2v-linux/sv2v-Linux/sv2v" "$HOME/.local/bin/sv2v"
+```
+
+This is the simplest route if you control the machine.
+
+### Option B — no sudo / user-local install only
+
+This is what I used successfully on this machine.
+
+---
+
+## 5. No-sudo setup that actually worked here
+
+### 5.1 Install cocotb in your user site-packages
+
+```bash
+python3 -m pip install --user --force-reinstall 'cocotb==1.9.2'
+```
+
+Verify:
+
+```bash
+cocotb-config --version
+cocotb-config --help | head
+```
+
+Expected: version `1.9.2`, and help output should include `--prefix`.
+
+### 5.2 Download and unpack Icarus Verilog locally
+
+Create directories:
+
+```bash
+mkdir -p "$HOME/.local/opt/iverilog"
+mkdir -p "$HOME/.local/opt/downloads"
+```
+
+Download the Ubuntu package without installing it system-wide:
+
+```bash
+cd /path/to/tiny-gpu
+apt-get download iverilog
+```
+
+That produces a file like:
+
+```text
+iverilog_11.0-1.1_amd64.deb
+```
+
+Extract it into your home directory:
+
+```bash
+dpkg-deb -x ./iverilog_11.0-1.1_amd64.deb "$HOME/.local/opt/iverilog"
+```
+
+### 5.3 Fix the internal helper path for the extracted Icarus package
+
+This matters.
+
+When you install `iverilog` normally with apt, the package layout and hard-coded helper paths line up automatically. But when you simply extract the `.deb` under your home directory, the main `iverilog` binary still expects helper programs under a slightly different prefix.
+
+Without this fix, `iverilog` fails like this:
+
+```text
+ivlpp: not found
+ivl: not found
+```
+
+Create a compatibility symlink:
+
+```bash
+mkdir -p "$HOME/.local/opt/iverilog/usr/x86_64-linux-gnu"
+ln -sfn ../lib/x86_64-linux-gnu/ivl "$HOME/.local/opt/iverilog/usr/x86_64-linux-gnu/ivl"
+```
+
+### 5.4 Download the official sv2v Linux release
+
+```bash
+curl -L https://github.com/zachjs/sv2v/releases/download/v0.0.13/sv2v-Linux.zip \
+ -o "$HOME/.local/opt/downloads/sv2v-Linux.zip"
+
+unzip -o "$HOME/.local/opt/downloads/sv2v-Linux.zip" \
+ -d "$HOME/.local/opt/downloads/sv2v-linux"
+```
+
+The binary ends up here:
+
+```text
+$HOME/.local/opt/downloads/sv2v-linux/sv2v-Linux/sv2v
+```
+
+### 5.5 Export PATH for this repo session
+
+```bash
+export PATH="$HOME/.local/opt/iverilog/usr/bin:$HOME/.local/opt/downloads/sv2v-linux/sv2v-Linux:$HOME/.local/bin:$PATH"
+```
+
+Verify the full toolchain:
+
+```bash
+iverilog -V
+vvp -V
+sv2v --version
+cocotb-config --version
+```
+
+The working versions on this machine were:
+
+- `iverilog` 11.0
+- `vvp` 11.0
+- `sv2v` v0.0.13
+- `cocotb` 1.9.2
+
+---
+
+## 6. Running the repo for the first time
+
+From the repo root:
+
+```bash
+mkdir -p build
+make test_matadd
+make test_matmul
+```
+
+### What these targets do
+
+`make test_matadd` and `make test_matmul` both do the following:
+
+1. `make compile`
+2. run `sv2v`
+3. compile `build/gpu.v` with `iverilog`
+4. run the compiled simulation with `vvp`
+5. load the relevant cocotb Python test module
+
+### Expected successful output
+
+You should see cocotb output like:
+
+```text
+Running on Icarus Verilog version 11.0 (stable)
+Running tests with cocotb v1.9.2
+...
+PASS
+```
+
+On this machine, both of these passed successfully:
+
+- `make test_matadd`
+- `make test_matmul`
+
+One small repo oddity: `test/test_matmul.py` exports a test function named `test_matadd`, so the second run still reports `running test_matadd`. That is a naming mismatch in the repo, not an environment issue.
+
+---
+
+## 7. Files and outputs you should expect
+
+### Build outputs
+
+The build places generated artifacts in:
+
+```text
+build/
+```
+
+Important files include:
+
+- `build/alu.v`
+- `build/gpu.v`
+- `build/sim.vvp`
+
+### Log outputs
+
+The Python logger writes execution logs under:
+
+```text
+test/logs/
+```
+
+These logs are useful when you want to inspect:
+
+- the initial memory state
+- the instruction-by-instruction trace
+- final memory contents
+
+---
+
+## 8. What each tool is doing, in plain software-engineering terms
+
+### `sv2v`
+
+Think of this as a compatibility transpiler.
+
+The source code is written in **SystemVerilog**, but the simulation compiler used here (`iverilog`) is happiest with plain Verilog in this workflow. So `sv2v` translates the source before compilation.
+
+### `iverilog`
+
+Think of this as the compile + elaboration step.
+
+It takes the generated Verilog and produces a simulation image (`.vvp`) containing the hardware design.
+
+### `vvp`
+
+Think of this as the runtime loader / executor for the compiled simulation image.
+
+### `cocotb`
+
+Think of this as a Python-based integration test harness for hardware.
+
+Instead of writing HDL testbench code, you write Python coroutines that:
+
+- drive signals
+- wait on edges or time
+- observe outputs
+- assert expected behavior
+
+---
+
+## 9. Known rough edges in this repository
+
+These are worth knowing up front so you do not waste time blaming yourself:
+
+1. **`build/` is not created automatically**
+ - you must run `mkdir -p build`
+
+2. **The current Makefile assumes cocotb 1.x behavior**
+ - specifically `cocotb-config --prefix`
+ - cocotb 2.0 breaks this assumption
+
+3. **The `sv2v` invocation is unusual**
+ - the Makefile uses:
+ ```bash
+ sv2v -I src/* -w build/gpu.v
+ ```
+ - this is not the clearest standard `sv2v` usage pattern, but it worked in this repo during validation
+
+4. **`test_matmul.py` has a naming mismatch**
+ - the module is correct
+ - the exported test function name is confusing
+
+---
+
+## 10. Recommended shell snippet for future sessions
+
+If you plan to work on this repo repeatedly without sudo, add something like this to your shell rc file:
+
+```bash
+export PATH="$HOME/.local/opt/iverilog/usr/bin:$HOME/.local/opt/downloads/sv2v-linux/sv2v-Linux:$HOME/.local/bin:$PATH"
+```
+
+Then open a new shell and verify:
+
+```bash
+iverilog -V
+sv2v --version
+cocotb-config --version
+```
+
+---
+
+## 11. Fast sanity-check checklist
+
+If you just want the minimum sequence to confirm the environment works:
+
+```bash
+export PATH="$HOME/.local/opt/iverilog/usr/bin:$HOME/.local/opt/downloads/sv2v-linux/sv2v-Linux:$HOME/.local/bin:$PATH"
+mkdir -p build
+cocotb-config --version
+iverilog -V
+sv2v --version
+make test_matadd
+make test_matmul
+```
+
+If both tests pass, your local simulation environment is usable.
+
+---
+
+## 12. Where to look next if you are new to RTL
+
+If you want to understand the code after setup, read in this order:
+
+1. `README.md` — project overview and architecture explanation
+2. `src/gpu.sv` — top-level module
+3. `src/core.sv` — per-core composition
+4. `src/scheduler.sv` — control flow and execution stages
+5. `test/helpers/setup.py` — how software launches the design
+6. `test/test_matadd.py` — the simplest end-to-end example
+
+That order maps well to a systems engineer’s instincts: start with the top-level architecture, then look at orchestration, then execution, then the test harness.
diff --git a/src/alu.sv b/src/alu.sv
index 4d23614..c7a306a 100644
--- a/src/alu.sv
+++ b/src/alu.sv
@@ -2,54 +2,87 @@
`timescale 1ns/1ns
// ARITHMETIC-LOGIC UNIT
-// > Executes computations on register values
+// > Executes computations on register values for ONE thread slot inside ONE core.
// > In this minimal implementation, the ALU supports the 4 basic arithmetic operations
-// > Each thread in each core has it's own ALU
-// > ADD, SUB, MUL, DIV instructions are all executed here
+// plus the compare path used by the CMP instruction.
+// > Important mental model for beginners:
+// - This module is synchronous: it updates its output on the clock edge.
+// - It only does useful work when the scheduler has moved the core into EXECUTE.
+// - Every enabled thread has its own private ALU instance.
+// 新手导读:
+// 1. `module alu (...)` 表示定义一个独立硬件模块,圆括号里列的是它的输入输出端口。
+// 2. `input wire` / `output wire` 常用于“连线型”信号;`reg` 常用于 always 块里被时序逻辑保存的信号。
+// 3. `always @(posedge clk)` 表示“每个时钟上升沿执行一次下面的时序逻辑”,这是最常见的寄存器写法。
+// 4. `<=` 是非阻塞赋值,适合时序电路;可以把它理解成“本拍决定,拍沿统一更新”。
+// 5. 这个 ALU 不是组合逻辑直出,而是把结果先存进 `alu_out_reg`,所以输出会晚一个时钟边沿可见。
module alu (
input wire clk,
input wire reset,
- input wire enable, // If current block has less threads then block size, some ALUs will be inactive
+ input wire enable, // If current block has fewer active threads than capacity, some ALUs stay idle.
+ // Shared core stage. The ALU only computes during the EXECUTE stage.
input reg [2:0] core_state,
+ // Decoder control signals:
+ // - arithmetic_mux selects ADD/SUB/MUL/DIV
+ // - output_mux selects arithmetic result vs comparison result
input reg [1:0] decoded_alu_arithmetic_mux,
input reg decoded_alu_output_mux,
+ // Source operands read from this thread's register file.
input reg [7:0] rs,
input reg [7:0] rt,
+
+ // Final ALU result visible to other units (register file / PC unit).
output wire [7:0] alu_out
);
+ // Small local encoding table for the arithmetic sub-operations.
+ // `localparam` 是“只在本模块内部可见的常量”,适合给状态码、操作码取名字。
localparam ADD = 2'b00,
SUB = 2'b01,
MUL = 2'b10,
DIV = 2'b11;
+ // Registered output: this ALU writes its result on the rising edge of clk.
reg [7:0] alu_out_reg;
+ // `assign` 表示连续赋值,相当于把输出线永久连接到内部寄存器上。
assign alu_out = alu_out_reg;
always @(posedge clk) begin
if (reset) begin
+ // Reset clears the stored ALU output.
alu_out_reg <= 8'b0;
end else if (enable) begin
- // Calculate alu_out when core_state = EXECUTE
+ // Only perform ALU work during the EXECUTE stage of the core pipeline.
if (core_state == 3'b101) begin
if (decoded_alu_output_mux == 1) begin
- // Set values to compare with NZP register in alu_out[2:0]
+ // CMP uses the ALU comparison path instead of normal arithmetic.
+ // The low 3 bits are intended as condition flags for the PC/NZP logic:
+ // alu_out[2] = "greater", alu_out[1] = "equal", alu_out[0] = "less".
+ // NOTE(review): rs/rt are unsigned 8-bit, so `rs - rt > 0` actually evaluates to
+ // (rs != rt) and `rs - rt < 0` is never true — confirm the intended CMP semantics.
+ // The upper 5 bits are padded with zeros because alu_out is 8 bits wide.
+ // `{a, b, c}` 是拼接运算符,表示把多个 bit/向量按顺序拼成一个更宽的向量。
alu_out_reg <= {5'b0, (rs - rt > 0), (rs - rt == 0), (rs - rt < 0)};
end else begin
- // Execute the specified arithmetic instruction
+ // Normal arithmetic path selected by the decoder.
+ // `case (...)` 很像软件里的 switch,用来根据选择信号挑一种子操作。
case (decoded_alu_arithmetic_mux)
ADD: begin
+ // R[rd] = rs + rt
alu_out_reg <= rs + rt;
end
SUB: begin
+ // R[rd] = rs - rt
alu_out_reg <= rs - rt;
end
MUL: begin
+ // R[rd] = rs * rt
alu_out_reg <= rs * rt;
end
DIV: begin
+ // R[rd] = rs / rt
+ // This toy design assumes the program avoids divide-by-zero.
alu_out_reg <= rs / rt;
end
endcase
diff --git a/src/controller.sv b/src/controller.sv
index eeedef2..c043bf4 100644
--- a/src/controller.sv
+++ b/src/controller.sv
@@ -2,9 +2,19 @@
`timescale 1ns/1ns
// MEMORY CONTROLLER
-// > Receives memory requests from all cores
-// > Throttles requests based on limited external memory bandwidth
-// > Waits for responses from external memory and distributes them back to cores
+// > Arbitrates many requesters onto a smaller number of external memory channels.
+// > Used twice in this design:
+// - once for data memory (serving all LSUs)
+// - once for program memory (serving all fetchers)
+// > Beginner mental model:
+// many internal clients may want memory at the same time, but the outside world has only a
+// few ports. This controller is the traffic cop that assigns channels and relays responses.
+// 新手导读:
+// 1. 这个模块是“仲裁器 + 转发器”:左边连很多消费者,右边连较少的 memory channel。
+// 2. 端口里像 `signal [NUM_CONSUMERS-1:0]` 这样的写法表示向量;像 `signal [ADDR_BITS-1:0] name [NUM_CONSUMERS-1:0]` 这样在信号名后面再跟一组方括号,则是数组端口。
+// 3. `current_consumer[i]` 记录“第 i 个外部通道当前在服务哪个内部客户端”。
+// 4. `channel_serving_consumer` 是一个位图,防止两个 channel 同时抢到同一个请求。
+// 5. 读懂这个模块的关键不是每一行赋值,而是先抓住每个 channel 都有自己的小状态机。
module controller #(
parameter ADDR_BITS = 8,
parameter DATA_BITS = 16,
@@ -15,7 +25,7 @@ module controller #(
input wire clk,
input wire reset,
- // Consumer Interface (Fetchers / LSUs)
+ // Consumer-facing handshake ports (fetchers or LSUs, depending on instantiation).
input reg [NUM_CONSUMERS-1:0] consumer_read_valid,
input reg [ADDR_BITS-1:0] consumer_read_address [NUM_CONSUMERS-1:0],
output reg [NUM_CONSUMERS-1:0] consumer_read_ready,
@@ -25,7 +35,7 @@ module controller #(
input reg [DATA_BITS-1:0] consumer_write_data [NUM_CONSUMERS-1:0],
output reg [NUM_CONSUMERS-1:0] consumer_write_ready,
- // Memory Interface (Data / Program)
+ // External memory-facing channels.
output reg [NUM_CHANNELS-1:0] mem_read_valid,
output reg [ADDR_BITS-1:0] mem_read_address [NUM_CHANNELS-1:0],
input reg [NUM_CHANNELS-1:0] mem_read_ready,
@@ -35,19 +45,26 @@ module controller #(
output reg [DATA_BITS-1:0] mem_write_data [NUM_CHANNELS-1:0],
input reg [NUM_CHANNELS-1:0] mem_write_ready
);
+ // Per-channel FSM states.
+ // 每个外部 memory channel 都会在这些状态之间独立切换。
localparam IDLE = 3'b000,
READ_WAITING = 3'b010,
WRITE_WAITING = 3'b011,
READ_RELAYING = 3'b100,
WRITE_RELAYING = 3'b101;
- // Keep track of state for each channel and which jobs each channel is handling
+ // Each channel behaves like a tiny independent worker.
+ // `controller_state [NUM_CHANNELS-1:0]` 表示“每个通道各自保存一个状态值”。
reg [2:0] controller_state [NUM_CHANNELS-1:0];
- reg [$clog2(NUM_CONSUMERS)-1:0] current_consumer [NUM_CHANNELS-1:0]; // Which consumer is each channel currently serving
- reg [NUM_CONSUMERS-1:0] channel_serving_consumer; // Which channels are being served? Prevents many workers from picking up the same request.
+ reg [$clog2(NUM_CONSUMERS)-1:0] current_consumer [NUM_CHANNELS-1:0];
+
+ // Bookkeeping bitmask: a 1 means some channel already claimed that consumer's pending request.
+ // This prevents two channels from accidentally servicing the same client in parallel.
+ reg [NUM_CONSUMERS-1:0] channel_serving_consumer;
always @(posedge clk) begin
if (reset) begin
+ // Reset clears both sides of the handshake and returns every channel to IDLE.
mem_read_valid <= 0;
mem_read_address <= 0;
@@ -64,23 +81,28 @@ module controller #(
channel_serving_consumer = 0;
end else begin
- // For each channel, we handle processing concurrently
+ // Process every external channel in parallel.
+ // 这里的 for 循环是在 RTL 中“复制相似逻辑到每个通道”,不是软件串行跑很多次的意思。
for (int i = 0; i < NUM_CHANNELS; i = i + 1) begin
case (controller_state[i])
IDLE: begin
- // While this channel is idle, cycle through consumers looking for one with a pending request
+ // Greedily scan consumers to find one pending request this idle channel can adopt.
for (int j = 0; j < NUM_CONSUMERS; j = j + 1) begin
if (consumer_read_valid[j] && !channel_serving_consumer[j]) begin
+ // Claim this consumer so no other channel grabs it.
+ // 这里用阻塞赋值 `=` 改位图,是想在本拍后续逻辑里立刻看到“已被占用”的效果。
channel_serving_consumer[j] = 1;
current_consumer[i] <= j;
+ // Forward the read request out to memory on this channel.
mem_read_valid[i] <= 1;
mem_read_address[i] <= consumer_read_address[j];
controller_state[i] <= READ_WAITING;
- // Once we find a pending request, pick it up with this channel and stop looking for requests
+ // One channel handles at most one consumer at a time.
break;
end else if (consumer_write_valid[j] && !channel_serving_consumer[j]) begin
+ // Same idea for writes.
channel_serving_consumer[j] = 1;
current_consumer[i] <= j;
@@ -89,31 +111,35 @@ module controller #(
mem_write_data[i] <= consumer_write_data[j];
controller_state[i] <= WRITE_WAITING;
- // Once we find a pending request, pick it up with this channel and stop looking for requests
+ // Stop scanning once this channel has adopted one request.
break;
end
end
end
READ_WAITING: begin
- // Wait for response from memory for pending read request
+ // Keep waiting until external memory accepts/completes the read.
if (mem_read_ready[i]) begin
mem_read_valid[i] <= 0;
+
+ // Relay the returned data back to the original consumer.
consumer_read_ready[current_consumer[i]] <= 1;
consumer_read_data[current_consumer[i]] <= mem_read_data[i];
controller_state[i] <= READ_RELAYING;
end
end
WRITE_WAITING: begin
- // Wait for response from memory for pending write request
+ // Wait until external memory acknowledges the write.
if (mem_write_ready[i]) begin
mem_write_valid[i] <= 0;
consumer_write_ready[current_consumer[i]] <= 1;
controller_state[i] <= WRITE_RELAYING;
end
end
- // Wait until consumer acknowledges it received response, then reset
+ // Keep ready asserted until the original consumer drops its valid signal.
+ // That "valid goes low" acts like an acknowledgement in this simple protocol.
READ_RELAYING: begin
if (!consumer_read_valid[current_consumer[i]]) begin
+ // 当消费者自己把 valid 拉低,说明这次读响应已经被它消费完了。
channel_serving_consumer[current_consumer[i]] = 0;
consumer_read_ready[current_consumer[i]] <= 0;
controller_state[i] <= IDLE;
@@ -121,6 +147,7 @@ module controller #(
end
WRITE_RELAYING: begin
if (!consumer_write_valid[current_consumer[i]]) begin
+ // Release the claimed consumer so some future request can be serviced.
channel_serving_consumer[current_consumer[i]] = 0;
consumer_write_ready[current_consumer[i]] <= 0;
controller_state[i] <= IDLE;
diff --git a/src/core.sv b/src/core.sv
index 80a0b00..c43d642 100644
--- a/src/core.sv
+++ b/src/core.sv
@@ -2,9 +2,19 @@
`timescale 1ns/1ns
// COMPUTE CORE
-// > Handles processing 1 block at a time
-// > The core also has it's own scheduler to manage control flow
-// > Each core contains 1 fetcher & decoder, and register files, ALUs, LSUs, PC for each thread
+// > Executes exactly one block at a time.
+// > Contains one shared control path (scheduler + fetcher + decoder) and one replicated data path
+// per thread slot (registers + ALU + LSU + PC).
+// > Beginner mental model:
+// think of a core as "one instruction stream controlling several thread lanes in parallel."
+// All active lanes see the same decoded instruction, but each lane has its own registers,
+// arithmetic, load/store state, and branch-condition state.
+// 新手导读:
+// 1. 这个文件是整个设计里最值得反复读的地方,因为它把共享控制路径和每线程私有数据路径拼在一起了。
+// 2. fetcher、decoder、scheduler 在一个 core 内只实例化一份;registers、alu、lsu、pc 会按线程数复制很多份。
+// 3. `wire [7:0] next_pc[THREADS_PER_BLOCK-1:0];` 这种写法要分开看:每个元素 8 bit,一共有 THREADS_PER_BLOCK 个元素。
+// 4. `generate for (...) begin : threads` 不是运行时循环,而是“在综合/展开时复制硬件结构”。
+// 5. 如果你是 Verilog 新手,先把这个模块当成“装配图”,顺着信号名看模块之间怎么连,比死抠每一拍更容易入门。
module core #(
parameter DATA_MEM_ADDR_BITS = 8,
parameter DATA_MEM_DATA_BITS = 8,
@@ -15,21 +25,21 @@ module core #(
input wire clk,
input wire reset,
- // Kernel Execution
+ // Dispatcher <-> core launch handshake.
input wire start,
output wire done,
- // Block Metadata
+ // Metadata for the specific block currently assigned to this core.
input wire [7:0] block_id,
input wire [$clog2(THREADS_PER_BLOCK):0] thread_count,
- // Program Memory
+ // Shared program-memory request path for this core's single fetcher.
output reg program_mem_read_valid,
output reg [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address,
input reg program_mem_read_ready,
input reg [PROGRAM_MEM_DATA_BITS-1:0] program_mem_read_data,
- // Data Memory
+ // Per-thread data-memory request paths for the replicated LSUs.
output reg [THREADS_PER_BLOCK-1:0] data_mem_read_valid,
output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [THREADS_PER_BLOCK-1:0],
input reg [THREADS_PER_BLOCK-1:0] data_mem_read_ready,
@@ -39,12 +49,14 @@ module core #(
output reg [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [THREADS_PER_BLOCK-1:0],
input reg [THREADS_PER_BLOCK-1:0] data_mem_write_ready
);
- // State
+ // Shared control-path state.
+ // 这一组信号是整个 core 共享的,所有 lane 都一起看它们。
reg [2:0] core_state;
reg [2:0] fetcher_state;
reg [15:0] instruction;
- // Intermediate Signals
+ // Cross-module datapath signals.
+ // 下一拍 PC、源操作数、LSU 状态等则按线程 lane 分别保存。
reg [7:0] current_pc;
wire [7:0] next_pc[THREADS_PER_BLOCK-1:0];
reg [7:0] rs[THREADS_PER_BLOCK-1:0];
@@ -53,14 +65,14 @@ module core #(
reg [7:0] lsu_out[THREADS_PER_BLOCK-1:0];
wire [7:0] alu_out[THREADS_PER_BLOCK-1:0];
- // Decoded Instruction Signals
+ // Raw instruction fields extracted by the shared decoder.
reg [3:0] decoded_rd_address;
reg [3:0] decoded_rs_address;
reg [3:0] decoded_rt_address;
reg [2:0] decoded_nzp;
reg [7:0] decoded_immediate;
- // Decoded Control Signals
+ // Shared control outputs from the decoder, broadcast to all thread-local units.
reg decoded_reg_write_enable; // Enable writing to a register
reg decoded_mem_read_enable; // Enable reading from memory
reg decoded_mem_write_enable; // Enable writing to memory
@@ -71,7 +83,7 @@ module core #(
reg decoded_pc_mux; // Select source of next PC
reg decoded_ret;
- // Fetcher
+ // Shared instruction fetch stage for this core.
fetcher #(
.PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
.PROGRAM_MEM_DATA_BITS(PROGRAM_MEM_DATA_BITS)
@@ -88,7 +100,7 @@ module core #(
.instruction(instruction)
);
- // Decoder
+ // Shared instruction decoder for this core.
decoder decoder_instance (
.clk(clk),
.reset(reset),
@@ -110,7 +122,7 @@ module core #(
.decoded_ret(decoded_ret)
);
- // Scheduler
+ // Core-wide stage machine.
scheduler #(
.THREADS_PER_BLOCK(THREADS_PER_BLOCK),
) scheduler_instance (
@@ -128,14 +140,17 @@ module core #(
.done(done)
);
- // Dedicated ALU, LSU, registers, & PC unit for each thread this core has capacity for
+ // Generate one complete thread lane worth of datapath resources per supported thread slot.
+ // Lanes with index >= thread_count are disabled for partially full final blocks.
+ // `genvar i; generate for (...)` 表示让编译器生成多份几乎相同的子模块实例。
genvar i;
generate
for (i = 0; i < THREADS_PER_BLOCK; i = i + 1) begin : threads
- // ALU
+ // Thread-local ALU.
alu alu_instance (
.clk(clk),
.reset(reset),
+ // `i < thread_count` 会在部分填充的尾块里关闭多余 lane。
.enable(i < thread_count),
.core_state(core_state),
.decoded_alu_arithmetic_mux(decoded_alu_arithmetic_mux),
@@ -145,7 +160,7 @@ module core #(
.alu_out(alu_out[i])
);
- // LSU
+ // Thread-local LSU.
lsu lsu_instance (
.clk(clk),
.reset(reset),
@@ -167,7 +182,7 @@ module core #(
.lsu_out(lsu_out[i])
);
- // Register File
+ // Thread-local register file containing both general registers and special SIMD IDs.
registers #(
.THREADS_PER_BLOCK(THREADS_PER_BLOCK),
.THREAD_ID(i),
@@ -190,7 +205,10 @@ module core #(
.rt(rt[i])
);
- // Program Counter
+ // Thread-local PC/NZP logic.
+ // Even though every lane computes a next PC independently, the scheduler later assumes
+ // they all converge and selects one shared current_pc for the next instruction.
+ // 这就是这个 toy GPU 对 SIMD 控制流的简化处理:每 lane 可算 next_pc,但最终只保留一个共同 PC。
pc #(
.DATA_MEM_DATA_BITS(DATA_MEM_DATA_BITS),
.PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS)
diff --git a/src/dcr.sv b/src/dcr.sv
index 97c0b41..b707e01 100644
--- a/src/dcr.sv
+++ b/src/dcr.sv
@@ -2,27 +2,45 @@
`timescale 1ns/1ns
// DEVICE CONTROL REGISTER
-// > Used to configure high-level settings
-// > In this minimal example, the DCR is used to configure the number of threads to run for the kernel
+// > Used to configure high-level GPU launch settings.
+// > In this minimal example, the DCR only stores one thing: the total number of threads
+// that should be launched for the next kernel.
+// > Beginner mental model:
+// software/testbench writes one 8-bit value here, and the dispatcher later reads it.
+// 新手导读:
+// 1. 这是一个最简单的“配置寄存器”模块,本质上就是在时钟边沿把外部写进来的值存起来。
+// 2. Verilog 里 `input`/`output` 描述端口方向,`[7:0]` 表示这个端口宽度是 8 bit。
+// 3. `assign thread_count = ...` 表示把内部寄存器的某几位直接连到输出端口。
+// 4. 这里没有复杂协议,只有一个写使能 `device_control_write_enable`,为 1 时就在该拍写入数据。
module dcr (
input wire clk,
input wire reset,
+ // Simple write interface from the outside world / testbench.
input wire device_control_write_enable,
input wire [7:0] device_control_data,
+
+ // Current configured total thread count for the kernel launch.
output wire [7:0] thread_count,
);
- // Store device control data in dedicated register
+ // Internal storage register for the device control data. NOTE(review): the identifier `device_conrol_register` keeps the upstream repo's spelling typo ("conrol"); renaming it here would be a behavior-neutral but diff-breaking change.
+ // `reg [7:0]` 表示定义一个 8 bit 的寄存器变量,用来跨时钟保存状态。
reg [7:0] device_conrol_register;
+
+ // In this design, the low 8 bits directly represent the kernel's total thread count.
+ // 这行没有时钟,属于组合连线:输出 thread_count 永远等于内部寄存器当前值。
assign thread_count = device_conrol_register[7:0];
always @(posedge clk) begin
if (reset) begin
+ // Reset clears the launch configuration.
device_conrol_register <= 8'b0;
end else begin
if (device_control_write_enable) begin
+ // Latch the new launch configuration when write_enable is high.
+ // 非阻塞赋值 `<=` 表示在这个上升沿把输入数据写进内部寄存器。
device_conrol_register <= device_control_data;
end
end
end
-endmodule
\ No newline at end of file
+endmodule
diff --git a/src/decoder.sv b/src/decoder.sv
index dd6b896..fa05b25 100644
--- a/src/decoder.sv
+++ b/src/decoder.sv
@@ -2,23 +2,34 @@
`timescale 1ns/1ns
// INSTRUCTION DECODER
-// > Decodes an instruction into the control signals necessary to execute it
-// > Each core has it's own decoder
+// > Converts one 16-bit instruction into the control signals that drive the whole core.
+// > Each core has one shared decoder because all active threads in that core execute the
+// same instruction together.
+// > Beginner mental model:
+// the decoder is the "meaning extractor" for the raw instruction bits. It does not
+// perform the work itself; it tells the other units what kind of work to do next.
+// 新手导读:
+// 1. 译码器的核心工作是“看 instruction 的不同 bit 段,然后产出控制信号”。
+// 2. `instruction[15:12]` 这种写法表示取位切片;这里高 4 bit 被当成 opcode。
+// 3. 同一条 16 bit 指令,不同模块会复用其中某些字段,例如 rd/rs/rt/immediate/nzp。
+// 4. 这个模块在 DECODE 阶段才更新,因此其他模块默认在后续阶段消费这些控制信号。
+// 5. 先把所有控制信号清零、再按 opcode 拉高需要的信号,是硬件里很常见的防“脏状态残留”写法。
module decoder (
input wire clk,
input wire reset,
+ // Decoder only acts during the DECODE stage.
input reg [2:0] core_state,
input reg [15:0] instruction,
- // Instruction Signals
+ // Raw fields pulled directly out of the instruction encoding.
output reg [3:0] decoded_rd_address,
output reg [3:0] decoded_rs_address,
output reg [3:0] decoded_rt_address,
output reg [2:0] decoded_nzp,
output reg [7:0] decoded_immediate,
- // Control Signals
+ // Control signals consumed by register file, ALU, LSU, and PC logic.
output reg decoded_reg_write_enable, // Enable writing to a register
output reg decoded_mem_read_enable, // Enable reading from memory
output reg decoded_mem_write_enable, // Enable writing to memory
@@ -31,6 +42,8 @@ module decoder (
// Return (finished executing thread)
output reg decoded_ret
);
+ // Opcode table. instruction[15:12] selects one of these operations.
+ // 这些名字是给 opcode 编码取别名,方便后面的 case 语句阅读。
localparam NOP = 4'b0000,
BRnzp = 4'b0001,
CMP = 4'b0010,
@@ -45,6 +58,7 @@ module decoder (
always @(posedge clk) begin
if (reset) begin
+ // Reset clears all remembered instruction fields and control outputs.
decoded_rd_address <= 0;
decoded_rs_address <= 0;
decoded_rt_address <= 0;
@@ -60,16 +74,21 @@ module decoder (
decoded_pc_mux <= 0;
decoded_ret <= 0;
end else begin
- // Decode when core_state = DECODE
+ // Decode exactly once per instruction, during the core's DECODE stage.
if (core_state == 3'b010) begin
- // Get instruction signals from instruction every time
+ // Split the instruction into its reusable fields.
+ // Different instruction formats overlap these bit positions, so the same raw
+ // slices are later interpreted differently by different instructions.
+ // `decoded_rd_address <= instruction[11:8];` 这种写法就是“把位段直接拆出来”。
decoded_rd_address <= instruction[11:8];
decoded_rs_address <= instruction[7:4];
decoded_rt_address <= instruction[3:0];
decoded_immediate <= instruction[7:0];
decoded_nzp <= instruction[11:9];
- // Control signals reset on every decode and set conditionally by instruction
+ // Important pattern: first clear every control signal, then only assert the ones
+ // needed by this opcode. This avoids accidentally carrying old control values
+ // forward from the previous instruction.
decoded_reg_write_enable <= 0;
decoded_mem_read_enable <= 0;
decoded_mem_write_enable <= 0;
@@ -80,51 +99,65 @@ module decoder (
decoded_pc_mux <= 0;
decoded_ret <= 0;
- // Set the control signals for each instruction
+ // Raise the specific controls required by the chosen opcode.
+ // 这里的 `case` 是译码器最核心的部分:opcode 不同,整个数据通路的控制方式就不同。
case (instruction[15:12])
NOP: begin
- // no-op
+ // NOP intentionally leaves every control signal deasserted.
end
BRnzp: begin
+ // Tell the PC unit to use branch logic instead of plain PC+1.
decoded_pc_mux <= 1;
end
CMP: begin
+ // CMP uses the ALU comparison path and writes the result into NZP,
+ // not into the general-purpose register file.
decoded_alu_output_mux <= 1;
decoded_nzp_write_enable <= 1;
end
ADD: begin
+ // Write back arithmetic result selected from the ALU ADD sub-op.
decoded_reg_write_enable <= 1;
decoded_reg_input_mux <= 2'b00;
decoded_alu_arithmetic_mux <= 2'b00;
end
SUB: begin
+ // Same datapath as ADD, but different ALU sub-op.
decoded_reg_write_enable <= 1;
decoded_reg_input_mux <= 2'b00;
decoded_alu_arithmetic_mux <= 2'b01;
end
MUL: begin
+ // Same arithmetic datapath, using multiply mode.
decoded_reg_write_enable <= 1;
decoded_reg_input_mux <= 2'b00;
decoded_alu_arithmetic_mux <= 2'b10;
end
DIV: begin
+ // Same arithmetic datapath, using divide mode.
decoded_reg_write_enable <= 1;
decoded_reg_input_mux <= 2'b00;
decoded_alu_arithmetic_mux <= 2'b11;
end
LDR: begin
+ // LDR both requests a memory read and later writes the returned value
+ // into rd through the MEMORY register-input mux path.
+ // `decoded_reg_input_mux <= 2'b01` 的意思是让寄存器写回来源选到 LSU 输出。
decoded_reg_write_enable <= 1;
decoded_reg_input_mux <= 2'b01;
decoded_mem_read_enable <= 1;
end
STR: begin
+ // STR only triggers a memory write. No register write-back occurs.
decoded_mem_write_enable <= 1;
end
CONST: begin
+ // CONST writes the immediate byte directly into rd.
decoded_reg_write_enable <= 1;
decoded_reg_input_mux <= 2'b10;
end
RET: begin
+ // RET is consumed by the scheduler to mark block completion.
decoded_ret <= 1;
end
endcase
diff --git a/src/dispatch.sv b/src/dispatch.sv
index f1d5d55..93f6185 100644
--- a/src/dispatch.sv
+++ b/src/dispatch.sv
@@ -2,9 +2,17 @@
`timescale 1ns/1ns
// BLOCK DISPATCH
-// > The GPU has one dispatch unit at the top level
-// > Manages processing of threads and marks kernel execution as done
-// > Sends off batches of threads in blocks to be executed by available compute cores
+// > Top-level unit that converts one kernel-wide thread_count into a sequence of per-core blocks.
+// > Keeps all cores busy by handing them a new block whenever they finish the previous one.
+// > Announces kernel completion once every block has been dispatched and then finished.
+// > Beginner mental model:
+// software says "launch N total threads"; dispatch groups them into chunks of
+// THREADS_PER_BLOCK threads and assigns those chunks to the available cores.
+// 新手导读:
+// 1. dispatch 负责把“总线程数”切成一个个 block,并把 block 派给空闲 core。
+// 2. `total_blocks` 的计算用了向上取整思路,所以最后不满一个 block 的尾块也会被算进去。
+// 3. `core_start/core_reset/core_done` 组成了一组非常简化的核心级握手信号。
+// 4. 这个模块没有显式状态机名字,但本质上仍然是在每个时钟拍里做分发、回收和完成统计。
module dispatch #(
parameter NUM_CORES = 2,
parameter THREADS_PER_BLOCK = 4
@@ -13,27 +21,30 @@ module dispatch #(
input wire reset,
input wire start,
- // Kernel Metadata
+ // Launch metadata from the device control register.
input wire [7:0] thread_count,
- // Core States
+ // Per-core control/status handshake.
input reg [NUM_CORES-1:0] core_done,
output reg [NUM_CORES-1:0] core_start,
output reg [NUM_CORES-1:0] core_reset,
output reg [7:0] core_block_id [NUM_CORES-1:0],
output reg [$clog2(THREADS_PER_BLOCK):0] core_thread_count [NUM_CORES-1:0],
- // Kernel Execution
+ // Global kernel-complete signal.
output reg done
);
- // Calculate the total number of blocks based on total threads & threads per block
+ // Round up so partially full final blocks still count as one block.
wire [7:0] total_blocks;
+ // `(a + b - 1) / b` 是整数除法里常见的向上取整公式。
assign total_blocks = (thread_count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
- // Keep track of how many blocks have been processed
- reg [7:0] blocks_dispatched; // How many blocks have been sent to cores?
- reg [7:0] blocks_done; // How many blocks have finished processing?
- reg start_execution; // EDA: Unimportant hack used because of EDA tooling
+ // Global launch bookkeeping.
+ reg [7:0] blocks_dispatched; // Number of blocks already handed to some core.
+ reg [7:0] blocks_done; // Number of blocks that have fully completed.
+
+ // Small helper flag used to emulate a one-time launch edge from the level-sensitive start signal.
+ reg start_execution;
always @(posedge clk) begin
if (reset) begin
@@ -49,7 +60,8 @@ module dispatch #(
core_thread_count[i] <= THREADS_PER_BLOCK;
end
end else if (start) begin
- // EDA: Indirect way to get @(posedge start) without driving from 2 different clocks
+ // Treat the first observed cycle of start=1 as the launch event.
+ // This avoids building separate logic driven directly by the start signal edge.
if (!start_execution) begin
start_execution <= 1;
for (int i = 0; i < NUM_CORES; i++) begin
@@ -57,7 +69,7 @@ module dispatch #(
end
end
- // If the last block has finished processing, mark this kernel as done executing
+ // Kernel is complete only after every block has reported completion.
if (blocks_done == total_blocks) begin
done <= 1;
end
@@ -66,10 +78,13 @@ module dispatch #(
if (core_reset[i]) begin
core_reset[i] <= 0;
- // If this core was just reset, check if there are more blocks to be dispatched
+ // A core leaving reset is ready to accept fresh work.
if (blocks_dispatched < total_blocks) begin
core_start[i] <= 1;
core_block_id[i] <= blocks_dispatched;
+
+ // Most blocks are full-sized. Only the final block may be partially full.
+ // 三元运算符 `cond ? a : b` 的意思是:条件真时取 a,否则取 b。
core_thread_count[i] <= (blocks_dispatched == total_blocks - 1)
? thread_count - (blocks_dispatched * THREADS_PER_BLOCK)
: THREADS_PER_BLOCK;
@@ -81,7 +96,8 @@ module dispatch #(
for (int i = 0; i < NUM_CORES; i++) begin
if (core_start[i] && core_done[i]) begin
- // If a core just finished executing it's current block, reset it
+ // Once a core finishes its current block, reset it so the next loop iteration
+ // can either assign a new block or leave it idle if all work is done.
core_reset[i] <= 1;
core_start[i] <= 0;
blocks_done = blocks_done + 1;
@@ -89,4 +105,4 @@ module dispatch #(
end
end
end
-endmodule
\ No newline at end of file
+endmodule
diff --git a/src/fetcher.sv b/src/fetcher.sv
index 53ef2de..5be0563 100644
--- a/src/fetcher.sv
+++ b/src/fetcher.sv
@@ -2,8 +2,17 @@
`timescale 1ns/1ns
// INSTRUCTION FETCHER
-// > Retrieves the instruction at the current PC from global data memory
-// > Each core has it's own fetcher
+// > Retrieves the instruction at the current PC from program memory.
+// > Each core has one fetcher shared by all threads in that core.
+// > Beginner mental model:
+// - The scheduler says "go fetch" by moving the core into FETCH state.
+// - The fetcher raises a read request for current_pc.
+// - When memory returns the instruction, the fetcher stores it locally.
+// 新手导读:
+// 1. 这个模块就是“取指单元”,负责把 current_pc 送到程序存储器,然后把返回的指令缓存起来。
+// 2. 它内部也有一个小状态机 `fetcher_state`,所以不是单纯一拍就完成,而是会经历请求中、已取回等状态。
+// 3. `mem_read_valid` / `mem_read_ready` 是典型握手机制:valid 表示我发请求,ready 表示对方已经响应。
+// 4. `instruction` 是寄存后的输出,所以 decoder 读取的是“已经稳定保存的一条指令”。
module fetcher #(
parameter PROGRAM_MEM_ADDR_BITS = 8,
parameter PROGRAM_MEM_DATA_BITS = 16
@@ -11,34 +20,42 @@ module fetcher #(
input wire clk,
input wire reset,
- // Execution State
+ // Shared core execution state and the current converged PC for this core.
input reg [2:0] core_state,
input reg [7:0] current_pc,
- // Program Memory
+ // Program memory handshake interface.
output reg mem_read_valid,
output reg [PROGRAM_MEM_ADDR_BITS-1:0] mem_read_address,
input reg mem_read_ready,
input reg [PROGRAM_MEM_DATA_BITS-1:0] mem_read_data,
- // Fetcher Output
+ // Fetcher outputs back into the core.
output reg [2:0] fetcher_state,
output reg [PROGRAM_MEM_DATA_BITS-1:0] instruction,
);
+ // Small fetcher-local FSM:
+ // IDLE -> waiting for the scheduler to enter FETCH
+ // FETCHING -> request has been sent, waiting for memory response
+ // FETCHED -> instruction captured, waiting for core to move on to DECODE
+ // 用 `localparam` 给状态编码命名,比直接写 3'b001、3'b010 更容易读懂状态机。
localparam IDLE = 3'b000,
FETCHING = 3'b001,
FETCHED = 3'b010;
always @(posedge clk) begin
if (reset) begin
+ // Reset puts the fetcher back into its idle state and clears outputs.
fetcher_state <= IDLE;
mem_read_valid <= 0;
mem_read_address <= 0;
instruction <= {PROGRAM_MEM_DATA_BITS{1'b0}};
end else begin
+ // `case (fetcher_state)` 表示根据当前状态执行不同分支,这是写有限状态机最常见的写法。
case (fetcher_state)
IDLE: begin
- // Start fetching when core_state = FETCH
+ // Start a fetch only when the core scheduler enters FETCH.
+ // We sample current_pc here and use it as the program memory address.
if (core_state == 3'b001) begin
fetcher_state <= FETCHING;
mem_read_valid <= 1;
@@ -46,15 +63,18 @@ module fetcher #(
end
end
FETCHING: begin
- // Wait for response from program memory
+ // Wait until program memory acknowledges the request and returns data.
if (mem_read_ready) begin
fetcher_state <= FETCHED;
- instruction <= mem_read_data; // Store the instruction when received
+ // Latch the fetched instruction so the decoder can read it next.
+ // 这一步很关键:即使后面 program memory 总线变了,decoder 看到的仍是这拍锁存下来的值。
+ instruction <= mem_read_data;
mem_read_valid <= 0;
end
end
FETCHED: begin
- // Reset when core_state = DECODE
+ // Once the core has moved into DECODE, this fetch cycle is complete.
+ // The fetcher can safely go back to IDLE and wait for the next PC.
if (core_state == 3'b010) begin
fetcher_state <= IDLE;
end
diff --git a/src/gpu.sv b/src/gpu.sv
index e3d8fcd..8a647dc 100644
--- a/src/gpu.sv
+++ b/src/gpu.sv
@@ -2,11 +2,21 @@
`timescale 1ns/1ns
// GPU
-// > Built to use an external async memory with multi-channel read/write
-// > Assumes that the program is loaded into program memory, data into data memory, and threads into
-// the device control register before the start signal is triggered
-// > Has memory controllers to interface between external memory and its multiple cores
-// > Configurable number of cores and thread capacity per core
+// > Top-level wrapper that wires together launch control, dispatch, cores, and memory controllers.
+// > Assumes software/testbench has already:
+// 1. loaded program memory,
+// 2. loaded data memory,
+// 3. written thread_count into the device control register,
+// 4. pulsed start.
+// > Beginner mental model:
+// this file is mostly plumbing. It does not add new execution behavior so much as connect the
+// major subsystems into one device.
+// 新手导读:
+// 1. 这是顶层模块,主要职责是“把子模块连起来”,而不是自己实现很多算法。
+// 2. 如果你第一次看大型 Verilog,建议先看端口区:那里定义了芯片对外暴露的所有接口。
+// 3. 然后看内部信号区:那里把 dispatch、controller、core 之间需要的中间连线列出来。
+// 4. 最后看实例化区:`xxx instance (...)` 就是在顶层里放入一个子模块,并把信号一根根接上。
+// 5. 这个文件里最容易卡住的新手点是数组端口和 generate 桥接逻辑,我在下面相应位置补了中文说明。
module gpu #(
parameter DATA_MEM_ADDR_BITS = 8, // Number of bits in data memory address (256 rows)
parameter DATA_MEM_DATA_BITS = 8, // Number of bits in data memory value (8 bit data)
@@ -20,21 +30,21 @@ module gpu #(
input wire clk,
input wire reset,
- // Kernel Execution
+ // External kernel launch handshake.
input wire start,
output wire done,
- // Device Control Register
+ // Software-visible launch configuration write port.
input wire device_control_write_enable,
input wire [7:0] device_control_data,
- // Program Memory
+ // External program-memory interface.
output wire [PROGRAM_MEM_NUM_CHANNELS-1:0] program_mem_read_valid,
output wire [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address [PROGRAM_MEM_NUM_CHANNELS-1:0],
input wire [PROGRAM_MEM_NUM_CHANNELS-1:0] program_mem_read_ready,
input wire [PROGRAM_MEM_DATA_BITS-1:0] program_mem_read_data [PROGRAM_MEM_NUM_CHANNELS-1:0],
- // Data Memory
+ // External data-memory interface.
output wire [DATA_MEM_NUM_CHANNELS-1:0] data_mem_read_valid,
output wire [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [DATA_MEM_NUM_CHANNELS-1:0],
input wire [DATA_MEM_NUM_CHANNELS-1:0] data_mem_read_ready,
@@ -44,17 +54,20 @@ module gpu #(
output wire [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [DATA_MEM_NUM_CHANNELS-1:0],
input wire [DATA_MEM_NUM_CHANNELS-1:0] data_mem_write_ready
);
- // Control
+ // Launch metadata produced by the device control register.
+ // 从 DCR 读出来的 thread_count 会被 dispatch 拿去切分成若干 block。
wire [7:0] thread_count;
- // Compute Core State
+ // Dispatcher-managed per-core launch/status signals.
reg [NUM_CORES-1:0] core_start;
reg [NUM_CORES-1:0] core_reset;
reg [NUM_CORES-1:0] core_done;
reg [7:0] core_block_id [NUM_CORES-1:0];
reg [$clog2(THREADS_PER_BLOCK):0] core_thread_count [NUM_CORES-1:0];
- // LSU <> Data Memory Controller Channels
+ // Flattened LSU <-> data-memory-controller wiring.
+ // There is one LSU per thread slot per core, so total LSU count is NUM_CORES * THREADS_PER_BLOCK.
+ // `flattened` 的意思是:本来是“每个 core 里又有多个 lane”的二维结构,这里被摊平成一维数组方便 controller 统一仲裁。
localparam NUM_LSUS = NUM_CORES * THREADS_PER_BLOCK;
reg [NUM_LSUS-1:0] lsu_read_valid;
reg [DATA_MEM_ADDR_BITS-1:0] lsu_read_address [NUM_LSUS-1:0];
@@ -65,14 +78,15 @@ module gpu #(
reg [DATA_MEM_DATA_BITS-1:0] lsu_write_data [NUM_LSUS-1:0];
reg [NUM_LSUS-1:0] lsu_write_ready;
- // Fetcher <> Program Memory Controller Channels
+ // Flattened fetcher <-> program-memory-controller wiring.
+ // 每个 core 只有一个 fetcher,所以 program memory 这边只需要按 core 数量展开。
localparam NUM_FETCHERS = NUM_CORES;
reg [NUM_FETCHERS-1:0] fetcher_read_valid;
reg [PROGRAM_MEM_ADDR_BITS-1:0] fetcher_read_address [NUM_FETCHERS-1:0];
reg [NUM_FETCHERS-1:0] fetcher_read_ready;
reg [PROGRAM_MEM_DATA_BITS-1:0] fetcher_read_data [NUM_FETCHERS-1:0];
- // Device Control Register
+ // Stores the total thread_count for the next launch.
dcr dcr_instance (
.clk(clk),
.reset(reset),
@@ -82,7 +96,7 @@ module gpu #(
.thread_count(thread_count)
);
- // Data Memory Controller
+ // Arbitrates all LSU traffic onto the external data-memory channels.
controller #(
.ADDR_BITS(DATA_MEM_ADDR_BITS),
.DATA_BITS(DATA_MEM_DATA_BITS),
@@ -111,7 +125,7 @@ module gpu #(
.mem_write_ready(data_mem_write_ready)
);
- // Program Memory Controller
+ // Arbitrates per-core fetch traffic onto the external program-memory channels.
controller #(
.ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
.DATA_BITS(PROGRAM_MEM_DATA_BITS),
@@ -133,7 +147,7 @@ module gpu #(
.mem_read_data(program_mem_read_data),
);
- // Dispatcher
+ // Splits total thread_count into blocks and assigns them to free cores.
dispatch #(
.NUM_CORES(NUM_CORES),
.THREADS_PER_BLOCK(THREADS_PER_BLOCK)
@@ -150,12 +164,15 @@ module gpu #(
.done(done)
);
- // Compute Cores
+ // Instantiate the compute cores and bridge each core's local LSU bundle into the flattened
+ // global controller-facing LSU arrays.
+ // 这里的 generate 是顶层最关键的结构之一:它会生成 NUM_CORES 个 core,以及配套的桥接逻辑。
genvar i;
generate
for (i = 0; i < NUM_CORES; i = i + 1) begin : cores
- // EDA: We create separate signals here to pass to cores because of a requirement
- // by the OpenLane EDA flow (uses Verilog 2005) that prevents slicing the top-level signals
+ // OpenLane / Verilog-2005 compatibility note:
+ // separate local arrays are introduced here because that flow dislikes directly slicing
+ // some top-level packed/unpacked combinations when passing them into submodules.
reg [THREADS_PER_BLOCK-1:0] core_lsu_read_valid;
reg [DATA_MEM_ADDR_BITS-1:0] core_lsu_read_address [THREADS_PER_BLOCK-1:0];
reg [THREADS_PER_BLOCK-1:0] core_lsu_read_ready;
@@ -165,11 +182,15 @@ module gpu #(
reg [DATA_MEM_DATA_BITS-1:0] core_lsu_write_data [THREADS_PER_BLOCK-1:0];
reg [THREADS_PER_BLOCK-1:0] core_lsu_write_ready;
- // Pass through signals between LSUs and data memory controller
+ // Bridge this core's per-thread LSU ports into the flattened global LSU arrays.
+ // lsu_index computes the unique global LSU slot for core i, thread lane j.
+ // 公式 `i * THREADS_PER_BLOCK + j` 很重要:它把二维坐标 `(core, lane)` 映射到一维索引。
genvar j;
for (j = 0; j < THREADS_PER_BLOCK; j = j + 1) begin
localparam lsu_index = i * THREADS_PER_BLOCK + j;
always @(posedge clk) begin
+ // Core -> controller direction.
+ // 这几行是在做“扁平化转接”:把 core 内部第 j 个 lane 的访存信号搬到全局第 lsu_index 槽位。
lsu_read_valid[lsu_index] <= core_lsu_read_valid[j];
lsu_read_address[lsu_index] <= core_lsu_read_address[j];
@@ -177,13 +198,15 @@ module gpu #(
lsu_write_address[lsu_index] <= core_lsu_write_address[j];
lsu_write_data[lsu_index] <= core_lsu_write_data[j];
+ // Controller -> core direction.
+ // 返回路径同理:把 controller 的响应再送回该 core 的对应 lane。
core_lsu_read_ready[j] <= lsu_read_ready[lsu_index];
core_lsu_read_data[j] <= lsu_read_data[lsu_index];
core_lsu_write_ready[j] <= lsu_write_ready[lsu_index];
end
end
- // Compute Core
+ // One compute core instance.
core #(
.DATA_MEM_ADDR_BITS(DATA_MEM_ADDR_BITS),
.DATA_MEM_DATA_BITS(DATA_MEM_DATA_BITS),
diff --git a/src/lsu.sv b/src/lsu.sv
index 77b716b..ce9b2d5 100644
--- a/src/lsu.sv
+++ b/src/lsu.sv
@@ -2,26 +2,38 @@
`timescale 1ns/1ns
// LOAD-STORE UNIT
-// > Handles asynchronous memory load and store operations and waits for response
-// > Each thread in each core has it's own LSU
-// > LDR, STR instructions are executed here
+// > Handles asynchronous memory load and store operations for ONE thread slot.
+// > Each thread in each core has its own LSU instance.
+// > This is where LDR and STR turn into memory handshake signals.
+// > Beginner mental model:
+// - The decoder says whether the current instruction is a load or store.
+// - The LSU starts a request during REQUEST/WAIT stages.
+// - It waits for memory to answer, then reports completion back to the core.
+// 新手导读:
+// 1. LSU = Load Store Unit,负责把 LDR/STR 这样的指令翻译成真正的存储器握手信号。
+// 2. 这个模块内部有一个 4 状态小 FSM,所以一次 load/store 会跨多个时钟拍完成。
+// 3. 这里的 ready/valid 是和外部 memory controller 或 testbench memory model 对接的关键接口。
+// 4. `rs` 被约定为地址寄存器,`rt` 在 STR 指令里被当作要写出去的数据。
module lsu (
input wire clk,
input wire reset,
- input wire enable, // If current block has less threads then block size, some LSUs will be inactive
+ input wire enable, // If current block has fewer active threads than capacity, some LSUs stay idle.
- // State
+ // Shared core stage from the scheduler.
input reg [2:0] core_state,
- // Memory Control Sgiansl
+ // Decoder control signals telling us whether the current instruction is LDR or STR.
input reg decoded_mem_read_enable,
input reg decoded_mem_write_enable,
- // Registers
+ // Register values from this thread's register file.
+ // Convention in this design:
+ // rs = memory address
+ // rt = store data (for STR)
input reg [7:0] rs,
input reg [7:0] rt,
- // Data Memory
+ // External data-memory handshake interface for this thread.
output reg mem_read_valid,
output reg [7:0] mem_read_address,
input reg mem_read_ready,
@@ -31,14 +43,21 @@ module lsu (
output reg [7:0] mem_write_data,
input reg mem_write_ready,
- // LSU Outputs
+ // LSU outputs back into the core.
output reg [1:0] lsu_state,
output reg [7:0] lsu_out
);
+ // Small per-thread LSU FSM:
+ // IDLE -> nothing pending
+ // REQUESTING -> request is being issued
+ // WAITING -> waiting for memory ready/response
+ // DONE -> request completed, waiting for core UPDATE to reset state
+ // 学习这类模块时,先看状态,再看每个状态下拉高了哪些握手信号,会更容易把时序串起来。
localparam IDLE = 2'b00, REQUESTING = 2'b01, WAITING = 2'b10, DONE = 2'b11;
always @(posedge clk) begin
if (reset) begin
+ // Reset clears all pending memory activity.
lsu_state <= IDLE;
lsu_out <= 0;
mem_read_valid <= 0;
@@ -47,29 +66,36 @@ module lsu (
mem_write_address <= 0;
mem_write_data <= 0;
end else if (enable) begin
- // If memory read enable is triggered (LDR instruction)
+ // Handle LDR instruction flow.
if (decoded_mem_read_enable) begin
+ // 同一个 always 块里套 `case (lsu_state)`,就是标准“时序型状态机”的写法。
case (lsu_state)
IDLE: begin
- // Only read when core_state = REQUEST
+ // The scheduler has a dedicated REQUEST stage where the LSU is allowed
+ // to start a new memory operation.
if (core_state == 3'b011) begin
lsu_state <= REQUESTING;
end
end
REQUESTING: begin
+ // Raise the read request and present the address from rs.
+ // 请求发出去后进入 WAITING,表示这笔事务已经在路上了。
mem_read_valid <= 1;
mem_read_address <= rs;
lsu_state <= WAITING;
end
WAITING: begin
+ // Hold the request until memory says the read data is ready.
if (mem_read_ready == 1) begin
mem_read_valid <= 0;
+ // Capture the returned load data so the register file can write it back.
lsu_out <= mem_read_data;
lsu_state <= DONE;
end
end
DONE: begin
- // Reset when core_state = UPDATE
+ // Reset this LSU once the core reaches UPDATE and has had a chance
+ // to use lsu_out / acknowledge completion.
if (core_state == 3'b110) begin
lsu_state <= IDLE;
end
@@ -77,29 +103,32 @@ module lsu (
endcase
end
- // If memory write enable is triggered (STR instruction)
+ // Handle STR instruction flow.
if (decoded_mem_write_enable) begin
+ // store 和 load 复用同一套状态寄存器,只是握手信号方向不同。
case (lsu_state)
IDLE: begin
- // Only read when core_state = REQUEST
+ // Just like LDR, stores are launched from the REQUEST stage.
if (core_state == 3'b011) begin
lsu_state <= REQUESTING;
end
end
REQUESTING: begin
+ // Present write address/data and raise the write-valid handshake.
mem_write_valid <= 1;
mem_write_address <= rs;
mem_write_data <= rt;
lsu_state <= WAITING;
end
WAITING: begin
+ // Wait for memory/controller to acknowledge the store.
if (mem_write_ready) begin
mem_write_valid <= 0;
lsu_state <= DONE;
end
end
DONE: begin
- // Reset when core_state = UPDATE
+ // Return to IDLE during UPDATE, matching the core's per-instruction rhythm.
if (core_state == 3'b110) begin
lsu_state <= IDLE;
end
diff --git a/src/pc.sv b/src/pc.sv
index 04185ae..e95357f 100644
--- a/src/pc.sv
+++ b/src/pc.sv
@@ -2,11 +2,17 @@
`timescale 1ns/1ns
// PROGRAM COUNTER
-// > Calculates the next PC for each thread to update to (but currently we assume all threads
-// update to the same PC and don't support branch divergence)
-// > Currently, each thread in each core has it's own calculation for next PC
-// > The NZP register value is set by the CMP instruction (based on >/=/< comparison) to
-// initiate the BRnzp instruction for branching
+// > Calculates the next PC for each thread slot.
+// > This design gives each thread its own next_pc calculation and NZP register,
+// but the scheduler later assumes all threads converge back to the same PC.
+// > The NZP register stores the result of the previous CMP instruction and is used
+// by BRnzp to decide whether to branch.
+// 新手导读:
+// 1. `#(...)` 是参数列表,表示这个模块在实例化时可以被配置成不同位宽。
+// 2. 这个模块做两件事:一是在 EXECUTE 阶段算 next_pc,二是在 UPDATE 阶段更新 NZP 条件寄存器。
+// 3. `decoded_pc_mux` 可以把它理解为“下一条 PC 走顺序执行还是走分支跳转”的选择开关。
+// 4. `decoded_nzp` 是指令里写的分支条件掩码,`nzp` 是上一次 CMP 实际产生的条件标志。
+// 5. `&` 是按位与,不是逻辑与;这里用它判断“指令要求的条件位”和“当前已保存的条件位”是否有重合。
module pc #(
parameter DATA_MEM_DATA_BITS = 8,
parameter PROGRAM_MEM_ADDR_BITS = 8
@@ -15,49 +21,59 @@ module pc #(
input wire reset,
input wire enable, // If current block has less threads then block size, some PCs will be inactive
- // State
+ // Shared core execution stage.
input reg [2:0] core_state,
- // Control Signals
+ // Decoded branch / CMP-related control signals.
input reg [2:0] decoded_nzp,
input reg [DATA_MEM_DATA_BITS-1:0] decoded_immediate,
input reg decoded_nzp_write_enable,
input reg decoded_pc_mux,
- // ALU Output - used for alu_out[2:0] to compare with NZP register
+ // ALU output. During CMP, alu_out[2:0] carries the comparison condition bits.
input reg [DATA_MEM_DATA_BITS-1:0] alu_out,
- // Current & Next PCs
+ // Current converged PC from the scheduler, and this thread's locally computed next PC.
input reg [PROGRAM_MEM_ADDR_BITS-1:0] current_pc,
output reg [PROGRAM_MEM_ADDR_BITS-1:0] next_pc
);
+
+ // Internal NZP register for this thread.
+ // Bit usage in this design follows the ALU's comparison packing.
+ // 每个线程 lane 都有自己独立的 nzp,因此不同线程理论上可以保留不同的分支条件状态。
reg [2:0] nzp;
always @(posedge clk) begin
if (reset) begin
+ // Reset clears both branch condition state and next_pc.
nzp <= 3'b0;
next_pc <= 0;
end else if (enable) begin
- // Update PC when core_state = EXECUTE
+ // Compute next_pc during the EXECUTE stage.
if (core_state == 3'b101) begin
if (decoded_pc_mux == 1) begin
+ // BRnzp is selected. We branch if any requested NZP condition bit matches
+ // the stored NZP state from a previous CMP.
+ // `!= 3'b0` 的意思是“按位与结果不是全 0”,也就是至少有一个条件位命中。
if (((nzp & decoded_nzp) != 3'b0)) begin
- // On BRnzp instruction, branch to immediate if NZP case matches previous CMP
+ // Take the branch by jumping to the immediate program address.
next_pc <= decoded_immediate;
end else begin
- // Otherwise, just update to PC + 1 (next line)
+ // Branch not taken -> continue to the next instruction.
next_pc <= current_pc + 1;
end
end else begin
- // By default update to PC + 1 (next line)
+ // Non-branch instructions advance sequentially.
next_pc <= current_pc + 1;
end
end
- // Store NZP when core_state = UPDATE
+ // Update the NZP register during UPDATE, after ALU comparison results are available.
if (core_state == 3'b110) begin
- // Write to NZP register on CMP instruction
+ // Only CMP-like instructions request an NZP update.
if (decoded_nzp_write_enable) begin
+ // Copy the ALU's low 3 comparison bits into the NZP register.
+ // 这里逐位写入而不是整段赋值,只是为了把每一位语义写得更直观。
nzp[2] <= alu_out[2];
nzp[1] <= alu_out[1];
nzp[0] <= alu_out[0];
diff --git a/src/registers.sv b/src/registers.sv
index b33af22..86bfab2 100644
--- a/src/registers.sv
+++ b/src/registers.sv
@@ -2,8 +2,20 @@
`timescale 1ns/1ns
// REGISTER FILE
-// > Each thread within each core has it's own register file with 13 free registers and 3 read-only registers
-// > Read-only registers hold the familiar %blockIdx, %blockDim, and %threadIdx values critical to SIMD
+// > Each thread slot in a core owns a private 16-entry register file.
+// > Registers R0-R12 are normal read/write registers for kernel code.
+// > Registers R13-R15 are special read-only metadata registers:
+// - R13 = %blockIdx (which block this core is currently executing)
+// - R14 = %blockDim (threads per block in this hardware configuration)
+// - R15 = %threadIdx (this thread slot's local index inside the block)
+// > Beginner mental model:
+// each generated thread gets its own little bank of registers, so all threads can run
+// the same instruction at once but on different data.
+// 新手导读:
+// 1. 这个模块是“每个线程私有的一组寄存器”,所以同一个 core 里会实例化很多份。
+// 2. `reg [7:0] registers[15:0];` 要分两层看:左边 `[7:0]` 是每个元素的位宽,右边 `[15:0]` 是数组有 16 个元素。
+// 3. REQUEST 阶段负责读源操作数,UPDATE 阶段负责把 ALU/LSU/立即数写回目标寄存器。
+// 4. R13-R15 被设计成只读特殊寄存器,分别保存 blockIdx、blockDim、threadIdx。
module registers #(
parameter THREADS_PER_BLOCK = 4,
parameter THREAD_ID = 0,
@@ -13,43 +25,46 @@ module registers #(
input wire reset,
input wire enable, // If current block has less threads then block size, some registers will be inactive
- // Kernel Execution
+ // Block metadata currently assigned to this core.
input reg [7:0] block_id,
- // State
+ // Shared core state from the scheduler.
input reg [2:0] core_state,
- // Instruction Signals
+ // Register addresses extracted by the decoder from the current instruction.
input reg [3:0] decoded_rd_address,
input reg [3:0] decoded_rs_address,
input reg [3:0] decoded_rt_address,
- // Control Signals
+ // Register write-back control from the decoder.
input reg decoded_reg_write_enable,
input reg [1:0] decoded_reg_input_mux,
input reg [DATA_BITS-1:0] decoded_immediate,
- // Thread Unit Outputs
+ // Candidate write-back values produced by this thread's other execution units.
input reg [DATA_BITS-1:0] alu_out,
input reg [DATA_BITS-1:0] lsu_out,
- // Registers
+ // Source operands exposed to the ALU / LSU for the current instruction.
output reg [7:0] rs,
output reg [7:0] rt
);
+ // Selects which producer writes back into rd during UPDATE.
+ // 这里相当于给“寄存器写回来源选择器”定义三个枚举值。
localparam ARITHMETIC = 2'b00,
MEMORY = 2'b01,
CONSTANT = 2'b10;
- // 16 registers per thread (13 free registers and 3 read-only registers)
+ // Physical storage array for this one thread's architectural registers.
+ // 读法:16 个寄存器槽位,每个槽位 8 bit。
reg [7:0] registers[15:0];
always @(posedge clk) begin
if (reset) begin
- // Empty rs, rt
+ // Clear the currently presented source operands.
rs <= 0;
rt <= 0;
- // Initialize all free registers
+ // Initialize all general-purpose registers.
registers[0] <= 8'b0;
registers[1] <= 8'b0;
registers[2] <= 8'b0;
@@ -63,35 +78,42 @@ module registers #(
registers[10] <= 8'b0;
registers[11] <= 8'b0;
registers[12] <= 8'b0;
- // Initialize read-only registers
+ // Initialize the special metadata registers.
+ // These are read by kernel code but should never be overwritten by instructions.
registers[13] <= 8'b0; // %blockIdx
registers[14] <= THREADS_PER_BLOCK; // %blockDim
registers[15] <= THREAD_ID; // %threadIdx
end else if (enable) begin
- // [Bad Solution] Shouldn't need to set this every cycle
- registers[13] <= block_id; // Update the block_id when a new block is issued from dispatcher
+ // Keep %blockIdx synchronized with the block currently assigned to this core.
+ // The original author notes this is a simple-but-inelegant approach because it is
+ // rewritten every cycle instead of only when a new block arrives.
+ // 这也说明寄存器并不一定只在“写回阶段”更新,某些特殊寄存器可以由控制逻辑持续维护。
+ registers[13] <= block_id;
- // Fill rs/rt when core_state = REQUEST
+ // During REQUEST, snapshot the source operands named by the decoded instruction.
+ // Those values will then feed the ALU / LSU in later stages.
if (core_state == 3'b011) begin
+ // 读寄存器在这里表现成“按地址索引数组”。
rs <= registers[decoded_rs_address];
rt <= registers[decoded_rt_address];
end
- // Store rd when core_state = UPDATE
+ // During UPDATE, commit the chosen result into rd.
if (core_state == 3'b110) begin
- // Only allow writing to R0 - R12
+ // Protect the three metadata registers by only allowing writes to R0-R12.
if (decoded_reg_write_enable && decoded_rd_address < 13) begin
+ // `decoded_rd_address < 13` 用来保护 R13-R15,不允许普通指令把特殊寄存器覆盖掉。
case (decoded_reg_input_mux)
ARITHMETIC: begin
- // ADD, SUB, MUL, DIV
+ // Arithmetic result from this thread's ALU.
registers[decoded_rd_address] <= alu_out;
end
MEMORY: begin
- // LDR
+ // Load result returned by this thread's LSU.
registers[decoded_rd_address] <= lsu_out;
end
CONSTANT: begin
- // CONST
+ // Immediate constant embedded directly in the instruction.
registers[decoded_rd_address] <= decoded_immediate;
end
endcase
diff --git a/src/scheduler.sv b/src/scheduler.sv
index 6838f91..bc8ae90 100644
--- a/src/scheduler.sv
+++ b/src/scheduler.sv
@@ -2,17 +2,22 @@
`timescale 1ns/1ns
// SCHEDULER
-// > Manages the entire control flow of a single compute core processing 1 block
-// 1. FETCH - Retrieve instruction at current program counter (PC) from program memory
-// 2. DECODE - Decode the instruction into the relevant control signals
-// 3. REQUEST - If we have an instruction that accesses memory, trigger the async memory requests from LSUs
-// 4. WAIT - Wait for all async memory requests to resolve (if applicable)
-// 5. EXECUTE - Execute computations on retrieved data from registers / memory
-// 6. UPDATE - Update register values (including NZP register) and program counter
-// > Each core has it's own scheduler where multiple threads can be processed with
-// the same control flow at once.
-// > Technically, different instructions can branch to different PCs, requiring "branch divergence." In
-// this minimal implementation, we assume no branch divergence (naive approach for simplicity)
+// > Drives the per-instruction stage machine for one core processing one block.
+// > All active threads inside that core advance through the same high-level stages together.
+// > Stage sequence used by this simple design:
+// 1. FETCH - request instruction from program memory
+// 2. DECODE - turn instruction bits into control signals
+// 3. REQUEST - read source registers / launch LSU requests
+// 4. WAIT - stall until any memory operations complete
+// 5. EXECUTE - compute ALU results and tentative next PCs
+// 6. UPDATE - write registers/NZP and commit next PC
+// > Important simplification: the scheduler assumes all active threads reconverge to one PC.
+// Real GPUs must handle branch divergence much more carefully.
+// 新手导读:
+// 1. 这个模块是一个“核心级大状态机”,决定一个 core 当前在取指、译码、访存还是回写。
+// 2. 它不是线程调度器意义上的复杂 GPU warp scheduler,而是这个教学工程里的每指令节拍控制器。
+// 3. 其他模块大多会看 `core_state`,只在指定阶段做自己的工作,所以你可以把它当成全局节拍器。
+// 4. `next_pc` 是一个数组,表示每个线程 lane 各自算出来的下一条 PC;这里最后只选一个代表值。
module scheduler #(
parameter THREADS_PER_BLOCK = 4,
) (
@@ -20,23 +25,25 @@ module scheduler #(
input wire reset,
input wire start,
- // Control Signals
+ // A few decoded instruction properties the scheduler cares about.
input reg decoded_mem_read_enable,
input reg decoded_mem_write_enable,
input reg decoded_ret,
- // Memory Access State
+ // Progress signals from the fetcher and per-thread LSUs.
input reg [2:0] fetcher_state,
input reg [1:0] lsu_state [THREADS_PER_BLOCK-1:0],
- // Current & Next PC
+ // The scheduler holds the converged current PC for the core and later chooses one next PC.
output reg [7:0] current_pc,
input reg [7:0] next_pc [THREADS_PER_BLOCK-1:0],
- // Execution State
+ // Shared execution stage broadcast to the rest of the core.
output reg [2:0] core_state,
output reg done
);
+ // Core-wide stage encodings.
+ // 这些状态名帮助你把整个 core 的执行流程按时序串起来看。
localparam IDLE = 3'b000, // Waiting to start
FETCH = 3'b001, // Fetch instructions from program memory
DECODE = 3'b010, // Decode instructions into control signals
@@ -52,63 +59,69 @@ module scheduler #(
core_state <= IDLE;
done <= 0;
end else begin
+ // 典型有限状态机主干:当前状态不同,下一拍转移规则也不同。
case (core_state)
IDLE: begin
- // Here after reset (before kernel is launched, or after previous block has been processed)
+ // Reset entry point before a block begins.
if (start) begin
- // Start by fetching the next instruction for this block based on PC
+ // A new block starts at PC 0, so the first action is instruction fetch.
core_state <= FETCH;
end
end
FETCH: begin
- // Move on once fetcher_state = FETCHED
+ // Wait until the fetcher has latched the instruction.
if (fetcher_state == 3'b010) begin
core_state <= DECODE;
end
end
DECODE: begin
- // Decode is synchronous so we move on after one cycle
+ // Decoder updates its outputs on this cycle's clock edge.
core_state <= REQUEST;
end
REQUEST: begin
- // Request is synchronous so we move on after one cycle
+ // Register operands are sampled and any LSU operations are launched here.
core_state <= WAIT;
end
WAIT: begin
- // Wait for all LSUs to finish their request before continuing
+ // For non-memory instructions, the LSUs stay idle and this stage exits quickly.
+ // For loads/stores, wait until every active LSU has finished.
+ // 这里在 always 块内部声明局部变量,是 SystemVerilog 允许的写法。
reg any_lsu_waiting = 1'b0;
for (int i = 0; i < THREADS_PER_BLOCK; i++) begin
- // Make sure no lsu_state = REQUESTING or WAITING
+ // REQUESTING or WAITING means this thread still has an in-flight memory op.
+ // `for (...)` 在这里是“描述硬件中的重复检查逻辑”,不是软件里那种耗时循环概念。
if (lsu_state[i] == 2'b01 || lsu_state[i] == 2'b10) begin
any_lsu_waiting = 1'b1;
break;
end
end
- // If no LSU is waiting for a response, move onto the next stage
+ // Once all memory activity is settled, arithmetic / branch logic may proceed.
if (!any_lsu_waiting) begin
core_state <= EXECUTE;
end
end
EXECUTE: begin
- // Execute is synchronous so we move on after one cycle
+ // ALUs and PCs compute their outputs during this stage.
core_state <= UPDATE;
end
UPDATE: begin
if (decoded_ret) begin
- // If we reach a RET instruction, this block is done executing
+ // RET ends execution for the whole block in this simplified SIMD model.
done <= 1;
core_state <= DONE;
end else begin
- // TODO: Branch divergence. For now assume all next_pc converge
+ // Major simplification: just trust that all active threads computed the same
+ // next PC, and pick one representative value.
+ // 这里直接取 `next_pc[THREADS_PER_BLOCK-1]`,前提是所有活跃线程已经重新收敛到同一个控制流。
current_pc <= next_pc[THREADS_PER_BLOCK-1];
- // Update is synchronous so we move on after one cycle
+ // Begin the next instruction.
core_state <= FETCH;
end
end
DONE: begin
- // no-op
+ // Terminal state for this block until the dispatcher resets the core.
end
endcase
end
diff --git a/test/helpers/debug.py b/test/helpers/debug.py
new file mode 100644
index 0000000..0ccacef
--- /dev/null
+++ b/test/helpers/debug.py
@@ -0,0 +1,42 @@
+import os
+
+
+_DEBUGPY_INITIALIZED = False
+
+
+def _env_enabled(name: str) -> bool:
+ value = os.getenv(name, "").strip().lower()
+ return value in {"1", "true", "yes", "on"}
+
+
+def maybe_enable_debugpy() -> None:
+ global _DEBUGPY_INITIALIZED
+
+ if not _env_enabled("COCOTB_DEBUGPY"):
+ return
+
+ if _DEBUGPY_INITIALIZED:
+ return
+
+ try:
+ import debugpy
+ except ImportError as exc:
+ raise RuntimeError(
+ "COCOTB_DEBUGPY=1 but debugpy is not installed in the cocotb Python environment"
+ ) from exc
+
+ host = os.getenv("COCOTB_DEBUGPY_HOST", "127.0.0.1")
+ port = int(os.getenv("COCOTB_DEBUGPY_PORT", "5678"))
+ wait_for_attach = _env_enabled("COCOTB_DEBUGPY_WAIT")
+
+ debugpy.listen((host, port))
+ _DEBUGPY_INITIALIZED = True
+
+ print(f"[cocotb-debug] Listening on {host}:{port}", flush=True)
+
+ if wait_for_attach:
+ print(
+ f"[cocotb-debug] Waiting for debugger attach on {host}:{port}", flush=True
+ )
+ debugpy.wait_for_client()
+ print(f"[cocotb-debug] Debugger attached on {host}:{port}", flush=True)
diff --git a/test/helpers/format.py b/test/helpers/format.py
index 109130b..f004a6b 100644
--- a/test/helpers/format.py
+++ b/test/helpers/format.py
@@ -1,9 +1,15 @@
from typing import List, Optional
from .logger import logger
+
+# 这个文件不参与功能逻辑,它的职责是“把仿真过程中难读的二进制/状态值格式化成人能看懂的文本”。
+# 对调试 GPU 这种时序设计很有帮助,因为你可以直接从日志里看到:
+# 当前执行了哪条指令、每个线程寄存器是多少、LSU 在什么状态、这拍是否完成等。
def format_register(register: int) -> str:
+ # 普通寄存器 0~12 直接显示成 R0~R12。
if register < 13:
return f"R{register}"
+ # 13~15 是这个设计里约定的特殊寄存器。
if register == 13:
return f"%blockIdx"
if register == 14:
@@ -11,16 +17,24 @@ def format_register(register: int) -> str:
if register == 15:
return f"%threadIdx"
+
+# 这个函数把一条 16 bit 指令的二进制字符串,翻译成类似汇编的人类可读文本。
+# 学习时可以把它和 decoder.sv 对照着看,会很容易理解指令编码格式。
def format_instruction(instruction: str) -> str:
+ # Python 切片 `instruction[0:4]` 表示取前 4 个字符;这里假设输入是 16 位二进制字符串。
opcode = instruction[0:4]
rd = format_register(int(instruction[4:8], 2))
rs = format_register(int(instruction[8:12], 2))
rt = format_register(int(instruction[12:16], 2))
+
+ # 下面三行把 BRnzp 指令里附带的 N/Z/P 条件位翻译成字母(三个位都打开时会得到 "NZP")。
+ # NOTE(review): 下方代码把字符 instruction[4](取值是 "0"/"1")和整数 1 比较,结果恒为 False,应写成 == "1" 才能按预期输出条件字母 —— 建议后续单独修复。
n = "N" if instruction[4] == 1 else ""
z = "Z" if instruction[5] == 1 else ""
p = "P" if instruction[6] == 1 else ""
imm = f"#{int(instruction[8:16], 2)}"
+ # 一长串 if/elif 相当于手写一个软件版 decoder。
if opcode == "0000":
return "NOP"
elif opcode == "0001":
@@ -45,6 +59,8 @@ def format_instruction(instruction: str) -> str:
return "RET"
return "UNKNOWN"
+
+# 下面几个函数都是“状态码 -> 字符串名字”的查表工具。
def format_core_state(core_state: str) -> str:
core_state_map = {
"000": "IDLE",
@@ -85,18 +101,26 @@ def format_memory_controller_state(controller_state: str) -> str:
}
return controller_state_map[controller_state]
+
+# 这个函数把寄存器数组打印成一串“寄存器名 = 数值”的文本。
def format_registers(registers: List[str]) -> str:
formatted_registers = []
for i, reg_value in enumerate(registers):
- decimal_value = int(reg_value, 2) # Convert binary string to decimal
- reg_idx = 15 - i # Register data is provided in reverse order
+ # cocotb 读出来通常是二进制字符串,这里先转成十进制方便看。
+ decimal_value = int(reg_value, 2)
+ # 这里减 15 的原因是:当前 register 数组输出顺序和逻辑寄存器编号顺序相反。
+ reg_idx = 15 - i
formatted_registers.append(f"{format_register(reg_idx)} = {decimal_value}")
+ # reverse() 就地反转列表,让最终输出顺序重新变成 R0...R15。
formatted_registers.reverse()
return ', '.join(formatted_registers)
+
+# 这个函数是调试核心:它在每个仿真周期把 core / thread 的关键内部状态打印到日志里。
def format_cycle(dut, cycle_id: int, thread_id: Optional[int] = None):
logger.debug(f"\n================================== Cycle {cycle_id} ==================================")
+ # dut.cores 来自 gpu.sv 里的 generate 块名字 `cores`,cocotb 会把它暴露成可遍历层级。
for core in dut.cores:
# Not exactly accurate, but good enough for now
if int(str(dut.thread_count.value), 2) <= core.i.value * dut.THREADS_PER_BLOCK.value:
@@ -104,12 +128,15 @@ def format_cycle(dut, cycle_id: int, thread_id: Optional[int] = None):
logger.debug(f"\n+--------------------- Core {core.i.value} ---------------------+")
+ # 取当前 core 正在执行的那条共享指令。
instruction = str(core.core_instance.instruction.value)
for thread in core.core_instance.threads:
- if int(thread.i.value) < int(str(core.core_instance.thread_count.value), 2): # if enabled
+ # 只显示当前 block 中真正启用的线程 lane。
+ if int(thread.i.value) < int(str(core.core_instance.thread_count.value), 2):
block_idx = core.core_instance.block_id.value
block_dim = int(core.core_instance.THREADS_PER_BLOCK)
thread_idx = thread.register_instance.THREAD_ID.value
+ # 全局线程索引 = blockIdx * blockDim + threadIdx。
idx = block_idx * block_dim + thread_idx
rs = int(str(thread.register_instance.rs.value), 2)
@@ -120,6 +147,7 @@ def format_cycle(dut, cycle_id: int, thread_id: Optional[int] = None):
lsu_out = int(str(thread.lsu_instance.lsu_out.value), 2)
constant = int(str(core.core_instance.decoded_immediate.value), 2)
+ # 如果 thread_id 是 None,就打印所有线程;否则只打印指定线程,方便聚焦调试。
if (thread_id is None or thread_id == idx):
logger.debug(f"\n+-------- Thread {idx} --------+")
diff --git a/test/helpers/logger.py b/test/helpers/logger.py
index 31de5a0..3505639 100644
--- a/test/helpers/logger.py
+++ b/test/helpers/logger.py
@@ -1,17 +1,27 @@
import datetime
+
+# 这是一个非常轻量的日志工具。
+# 它没有依赖 Python 标准 logging 模块,而是直接把字符串追加写进 test/logs 目录下的文件。
+# 好处是简单直接,测试脚本后面还能重新打开同一个日志文件做正则匹配检查。
class Logger:
def __init__(self, level="debug"):
+ # 用当前时间生成日志文件名,避免每次测试都覆盖上一次结果。
self.filename = f"test/logs/log_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.txt"
self.level = level
def debug(self, *messages):
+ # `*messages` 表示可变参数,调用时可以一次传很多段消息进来。
if self.level == "debug":
self.info(*messages)
def info(self, *messages):
+ # 把所有参数先转成字符串,再用空格拼成一整行。
full_message = ' '.join(str(message) for message in messages)
+ # 以追加模式打开文件,这样每次写日志都会接在文件末尾。
with open(self.filename, "a") as log_file:
log_file.write(full_message + "\n")
+
+# 模块级单例,其他文件 `from .logger import logger` 后都共用这一份 logger。
logger = Logger(level="debug")
\ No newline at end of file
diff --git a/test/helpers/memory.py b/test/helpers/memory.py
index 4d5cc0b..934b3bc 100644
--- a/test/helpers/memory.py
+++ b/test/helpers/memory.py
@@ -1,99 +1,233 @@
from typing import List
from .logger import logger
+
+# 这个类不是 RTL 里的 RAM 宏,而是 cocotb 测试平台里的“软件内存模型”。
+# 你可以把它理解成:Python 代码在 testbench 外面假装自己是一块存储器,
+# 然后通过 dut 上的读写握手信号,像真正的外设一样和 Verilog 顶层交互。
class Memory:
+ # Python 里 def 用来定义函数;写在类里面时,这个函数就是“方法”。
+ # __init__ 是构造函数,含义类似“创建对象时先执行的初始化逻辑”。
+ # self 表示“当前这个 Memory 对象本身”,和很多语言里的 this 类似。
def __init__(self, dut, addr_bits, data_bits, channels, name):
+ # 保存 cocotb 传进来的 DUT 句柄,后面就能通过 self.dut 访问顶层信号。
self.dut = dut
+ # 保存地址位宽,例如 8 表示地址总线宽度是 8 bit,可寻址 2^8 个单元。
self.addr_bits = addr_bits
+ # 保存数据位宽,例如 8 表示每个存储单元存 8 bit 数据。
self.data_bits = data_bits
+ # Python 里的 [0] * N 表示生成一个长度为 N 的列表,并且每个元素初始值都是 0。
+ # 这里用它来模拟一整块内存;2**addr_bits 就是 2 的 addr_bits 次方。
self.memory = [0] * (2**addr_bits)
+ # channels 表示并行通道数,对应 Verilog 里一次可以同时服务多少个 lane 的访存请求。
self.channels = channels
+ # name 用来区分 program memory 和 data memory,也用来拼接信号名。
self.name = name
+ # getattr(obj, "attr") 的意思是“按字符串名字,从对象 obj 身上取属性 attr”。
+ # 这里配合 f"{name}_..." 这种 f-string,把 name 动态插进信号名中。
+ # 例如 name="data" 时,最终会取到 dut.data_mem_read_valid 这个句柄。
self.mem_read_valid = getattr(dut, f"{name}_mem_read_valid")
+ # 读地址总线,DUT 会把想读的地址放在这个信号上。
self.mem_read_address = getattr(dut, f"{name}_mem_read_address")
+ # 读 ready 总线,由这个 Python 内存模型回填给 DUT,表示“这次读我接住了”。
self.mem_read_ready = getattr(dut, f"{name}_mem_read_ready")
+ # 读 data 总线,由这个 Python 内存模型回填给 DUT,表示“这是你读到的数据”。
self.mem_read_data = getattr(dut, f"{name}_mem_read_data")
+ # program memory 在这个工程里只读不写,所以只有 data memory 需要接写口。
if name != "program":
+ # 写 valid:DUT 置 1 表示某个 lane 正在发起写请求。
self.mem_write_valid = getattr(dut, f"{name}_mem_write_valid")
+ # 写地址:DUT 想把数据写到哪里。
self.mem_write_address = getattr(dut, f"{name}_mem_write_address")
+ # 写数据:DUT 想写进去的值。
self.mem_write_data = getattr(dut, f"{name}_mem_write_data")
+ # 写 ready:这个 Python 内存模型回给 DUT,表示“这次写我接受了”。
self.mem_write_ready = getattr(dut, f"{name}_mem_write_ready")
- def run(self):
- mem_read_valid = [
- int(str(self.mem_read_valid.value)[i:i+1], 2)
- for i in range(0, len(str(self.mem_read_valid.value)), 1)
- ]
+ # run() 可以理解成“把这块软件内存跑一个仿真周期的组合/握手逻辑”。
+ # cycle 参数不是功能必须项,只是为了写日志时能打印当前周期号。
+ def run(self, cycle=None):
+ # 把 cocotb 句柄里的值转成字符串,便于后面按位切片。
+ # 例如多通道 valid 可能会变成 "1010" 这样的二进制字符串。
+ mem_read_valid_bits = str(self.mem_read_valid.value)
+ # 先创建一个空列表,后面逐个 lane 填入解析后的 valid 位。
+ mem_read_valid = []
+ # range(起点, 终点, 步长) 会生成一个整数序列;这里步长是 1,表示逐位扫描。
+ for i in range(0, len(mem_read_valid_bits), 1):
+ # Python 切片 s[a:b] 表示取下标 a 到 b 之前的内容,也就是左闭右开区间。
+ bit_slice = mem_read_valid_bits[i : i + 1]
+ # int(字符串, 2) 表示把二进制字符串转换成十进制整数。
+ mem_read_valid.append(int(bit_slice, 2))
+
+ # 同样先把读地址总线整体转成字符串。
+ mem_read_address_bits = str(self.mem_read_address.value)
+ # 创建一个空列表,准备保存每个通道各自的读地址。
+ mem_read_address = []
+ # 地址总线是按 addr_bits 一组拼接起来的,所以这里每次跳一个地址宽度。
+ for i in range(0, len(mem_read_address_bits), self.addr_bits):
+ # 取出当前 lane 对应的一段地址比特。
+ address_slice = mem_read_address_bits[i : i + self.addr_bits]
+ # 把二进制地址字符串转成 Python 整数,后面才能拿来做列表下标。
+ mem_read_address.append(int(address_slice, 2))
- mem_read_address = [
- int(str(self.mem_read_address.value)[i:i+self.addr_bits], 2)
- for i in range(0, len(str(self.mem_read_address.value)), self.addr_bits)
- ]
+ # 先默认所有 lane 都不 ready;后面谁真的请求了读,谁再被置成 1。
mem_read_ready = [0] * self.channels
+ # 先给每个 lane 准备一个返回数据槽位,默认值都是 0。
mem_read_data = [0] * self.channels
+ # 逐个通道处理读请求,这很像硬件里“for each lane”的行为。
for i in range(self.channels):
+ # 如果 valid 为 1,说明 DUT 当前周期确实发起了读请求。
if mem_read_valid[i] == 1:
+ # 用请求地址作为 Python 列表下标,从软件内存里取出对应数据。
mem_read_data[i] = self.memory[mem_read_address[i]]
+ # 返回 ready=1,告诉 DUT 这次读已经被响应。
mem_read_ready[i] = 1
else:
+ # 如果这个 lane 没请求,就显式返回 ready=0。
mem_read_ready[i] = 0
- self.mem_read_data.value = int(''.join(format(d, '0' + str(self.data_bits) + 'b') for d in mem_read_data), 2)
- self.mem_read_ready.value = int(''.join(format(r, '01b') for r in mem_read_ready), 2)
+ # 创建一个空列表,用来存每个 lane 的数据字符串。
+ mem_read_data_fields = []
+ # 逐个 lane 把整数格式化成固定宽度的二进制字符串。
+ for data in mem_read_data:
+ # format(value, "08b") 这类写法表示输出二进制并且左侧补 0 到固定宽度。
+ # 这里宽度不是写死,而是用 "0" + str(self.data_bits) + "b" 动态拼出来。
+ mem_read_data_fields.append(format(data, "0" + str(self.data_bits) + "b"))
+ # "".join(list) 表示把字符串列表无分隔符拼接成一个完整的大总线字符串。
+ mem_read_data_bus = "".join(mem_read_data_fields)
+ # 再把整个二进制字符串转回整数,赋给 cocotb 信号句柄的 .value。
+ self.mem_read_data.value = int(mem_read_data_bus, 2)
+ # 创建一个空列表,用来存 ready 位拼接后的字符串。
+ mem_read_ready_fields = []
+ # 每个 ready 只占 1 bit,所以格式字符串固定写成 01b。
+ for ready in mem_read_ready:
+ mem_read_ready_fields.append(format(ready, "01b"))
+ # 把所有 ready 位拼成一个多通道 ready 总线。
+ mem_read_ready_bus = "".join(mem_read_ready_fields)
+ # 把 ready 总线写回 DUT,让 Verilog 逻辑在这个周期看到握手结果。
+ self.mem_read_ready.value = int(mem_read_ready_bus, 2)
+
+ # 只有 data memory 需要处理写请求;program memory 是只读的,不会走这段。
if self.name != "program":
- mem_write_valid = [
- int(str(self.mem_write_valid.value)[i:i+1], 2)
- for i in range(0, len(str(self.mem_write_valid.value)), 1)
- ]
- mem_write_address = [
- int(str(self.mem_write_address.value)[i:i+self.addr_bits], 2)
- for i in range(0, len(str(self.mem_write_address.value)), self.addr_bits)
- ]
- mem_write_data = [
- int(str(self.mem_write_data.value)[i:i+self.data_bits], 2)
- for i in range(0, len(str(self.mem_write_data.value)), self.data_bits)
- ]
+ # 先读取整条写 valid 总线,并转成字符串方便切片。
+ mem_write_valid_bits = str(self.mem_write_valid.value)
+ # 创建列表,准备保存每个通道的写 valid。
+ mem_write_valid = []
+ # 逐 bit 切开 valid 总线,因为每个通道只有 1 bit valid。
+ for i in range(0, len(mem_write_valid_bits), 1):
+ # 取出当前通道对应的 1 个 bit。
+ bit_slice = mem_write_valid_bits[i : i + 1]
+ # 转成整数 0 或 1,便于后面判断。
+ mem_write_valid.append(int(bit_slice, 2))
+
+ # 读取整条写地址总线。
+ mem_write_address_bits = str(self.mem_write_address.value)
+ # 创建列表,准备保存每个通道的目标地址。
+ mem_write_address = []
+ # 地址总线按 addr_bits 位一组切分。
+ for i in range(0, len(mem_write_address_bits), self.addr_bits):
+ # 取出当前通道的地址字段。
+ address_slice = mem_write_address_bits[i : i + self.addr_bits]
+ # 二进制字符串转整数,得到真实的地址索引。
+ mem_write_address.append(int(address_slice, 2))
+
+ # 读取整条写数据总线。
+ mem_write_data_bits = str(self.mem_write_data.value)
+ # 创建列表,准备保存每个通道要写入的数据。
+ mem_write_data = []
+ # 数据总线按 data_bits 位一组切分。
+ for i in range(0, len(mem_write_data_bits), self.data_bits):
+ # 取出当前通道的数据字段。
+ data_slice = mem_write_data_bits[i : i + self.data_bits]
+ # 把数据字段转成整数,方便写入 Python 列表。
+ mem_write_data.append(int(data_slice, 2))
+
+ # 默认所有写通道都还没有被接受。
mem_write_ready = [0] * self.channels
+ # 逐个通道处理写请求。
for i in range(self.channels):
+ # 只有 valid=1 的 lane 才真的执行写入。
if mem_write_valid[i] == 1:
+ # 先读出旧值,后面打印日志时可以看出这次写改了什么。
+ old_data = self.memory[mem_write_address[i]]
+ # 用新值覆盖对应地址,等价于 testbench 中这块 RAM 完成一次写操作。
self.memory[mem_write_address[i]] = mem_write_data[i]
+ # 这里用 logger.debug 记录一次写事务,方便测试脚本回头检查写轨迹。
+ logger.debug(
+ f"[memwrite] {self.name} cycle={cycle if cycle is not None else -1} "
+ f"lane={i} addr={mem_write_address[i]} old={old_data} new={mem_write_data[i]}"
+ )
+ # 把当前 lane 的 ready 置 1,告诉 DUT 这次写已经完成握手。
mem_write_ready[i] = 1
else:
+ # 没有写请求的 lane 返回 ready=0。
mem_write_ready[i] = 0
- self.mem_write_ready.value = int(''.join(format(w, '01b') for w in mem_write_ready), 2)
+ # 创建列表,用来收集每个写 ready 位的字符串表示。
+ mem_write_ready_fields = []
+ # 每个写 ready 也只有 1 bit,所以还是格式化成 01b。
+ for ready in mem_write_ready:
+ mem_write_ready_fields.append(format(ready, "01b"))
+ # 拼成整条写 ready 总线。
+ mem_write_ready_bus = "".join(mem_write_ready_fields)
+ # 把 ready 总线写回 DUT,完成本周期的写响应。
+ self.mem_write_ready.value = int(mem_write_ready_bus, 2)
+ # write() 是一个更底层的小工具函数,作用是“直接往软件内存某个地址写值”。
def write(self, address, data):
+ # 先检查地址是否越界,避免 Python 列表访问报错。
if address < len(self.memory):
+ # 如果地址合法,就把对应位置的数据改成新值。
self.memory[address] = data
+ # load() 用于把一串初始数据批量装进内存,常用于测试开始前预装程序或输入数据。
def load(self, rows: List[int]):
+ # enumerate(rows) 会返回 (下标, 元素) 二元组,非常适合“地址=序号,数据=内容”这种场景。
for address, data in enumerate(rows):
+ # 复用前面的 write(),把每个元素依次写到对应地址。
self.write(address, data)
+ # display() 用日志把内存内容打印成表格,便于人工查看。
+ # rows 表示想打印前多少行,decimal=True 表示默认按十进制显示。
def display(self, rows, decimal=True):
+ # 先打印一个空行,让日志视觉上和前后内容隔开。
logger.info("\n")
+ # self.name.upper() 会把名字转成大写,例如 data 变成 DATA。
logger.info(f"{self.name.upper()} MEMORY")
-
+
+ # 这里估算表格宽度;8*2 表示给两列各预留 8 个字符,再额外补 3 个边框字符。
table_size = (8 * 2) + 3
+ # 打印表格顶边;字符串乘法 "-" * N 表示把横线重复 N 次。
logger.info("+" + "-" * (table_size - 3) + "+")
+ # 表头固定是地址列和数据列。
header = "| Addr | Data "
+ # 用空格把表头补齐到统一宽度,再补上右边框。
logger.info(header + " " * (table_size - len(header) - 1) + "|")
+ # 再打印一条分隔线,把表头和数据区隔开。
logger.info("+" + "-" * (table_size - 3) + "+")
+ # enumerate(self.memory) 表示按“地址 + 数据值”的形式遍历整块软件内存。
for i, data in enumerate(self.memory):
+ # 只打印用户要求的前 rows 行,避免日志太长。
if i < rows:
+ # 如果 decimal 为 True,就按十进制打印,更适合看普通数值。
if decimal:
+ # f-string 里的 :<4 表示左对齐并占 4 个字符宽度。
row = f"| {i:<4} | {data:<4}"
+ # 补足右侧空格并补上边框,让每一行长度一致。
logger.info(row + " " * (table_size - len(row) - 1) + "|")
else:
- data_bin = format(data, f'0{16}b')
+ # 如果要看二进制,就把数据格式化成固定 16 bit 宽度。
+ data_bin = format(data, f"0{16}b")
+ # 组成二进制显示行;这里直接把右边框也拼进字符串里。
row = f"| {i:<4} | {data_bin} |"
+ # 同样做右侧补空格,保持表格整齐。
logger.info(row + " " * (table_size - len(row) - 1) + "|")
- logger.info("+" + "-" * (table_size - 3) + "+")
\ No newline at end of file
+ # 打印表格底边,表示这次显示结束。
+ logger.info("+" + "-" * (table_size - 3) + "+")
diff --git a/test/helpers/setup.py b/test/helpers/setup.py
index 5370eb2..9acca72 100644
--- a/test/helpers/setup.py
+++ b/test/helpers/setup.py
@@ -2,36 +2,59 @@
import cocotb
from cocotb.clock import Clock
from cocotb.triggers import RisingEdge
+from .debug import maybe_enable_debugpy
from .memory import Memory
+
+# 这个文件负责把“每个测试都要做的初始化步骤”收口到一个公共函数里。
+# 对新手来说,可以把它理解成:
+# 1. 启动时钟,
+# 2. 拉一次 reset,
+# 3. 预装程序和数据内存,
+# 4. 写 device control register,
+# 5. 拉起 start,正式开始跑 kernel。
async def setup(
- dut,
- program_memory: Memory,
+ dut,
+ program_memory: Memory,
program: List[int],
data_memory: Memory,
data: List[int],
- threads: int
+ threads: int,
):
+ # `async def` 表示这是一个协程函数。
+ # 在 cocotb 里,协程非常常见,因为仿真需要“等一个时钟边沿再继续往下执行”。
+
+ maybe_enable_debugpy()
+
# Setup Clock
+ # `Clock(dut.clk, 25, units="us")` 的意思是给 dut.clk 这个信号挂一个周期为 25 微秒的时钟源。
clock = Clock(dut.clk, 25, units="us")
+ # `cocotb.start_soon(...)` 会把这个时钟协程在后台启动,让它持续翻转 clk。
cocotb.start_soon(clock.start())
# Reset
+ # cocotb 里给信号赋值通常用 `.value = ...`。
+ # 这里先把 reset 拉高,再等一个上升沿,模拟硬件复位过程。
dut.reset.value = 1
+ # `await RisingEdge(dut.clk)` 的意思是“暂停当前协程,直到 dut.clk 出现下一个上升沿”。
await RisingEdge(dut.clk)
dut.reset.value = 0
# Load Program Memory
+ # 这一步不是通过总线一拍拍写进去,而是直接调用 Python 内存模型的辅助函数预装程序。
program_memory.load(program)
# Load Data Memory
data_memory.load(data)
# Device Control Register
+ # 把线程总数写进 DCR,对应 RTL 里的 dcr.sv 模块。
dut.device_control_write_enable.value = 1
dut.device_control_data.value = threads
+ # 等一个时钟上升沿,确保 DCR 在时序逻辑里真正采样到这次写入。
await RisingEdge(dut.clk)
dut.device_control_write_enable.value = 0
# Start
+ # 把 start 拉高后,顶层 dispatch 就会开始把 block 分发到各个 core。
dut.start.value = 1
diff --git a/test/test_matadd.py b/test/test_matadd.py
index d79f516..41e4619 100644
--- a/test/test_matadd.py
+++ b/test/test_matadd.py
@@ -1,3 +1,5 @@
+import re
+
import cocotb
from cocotb.triggers import RisingEdge
from .helpers.setup import setup
@@ -5,62 +7,160 @@
from .helpers.format import format_cycle
from .helpers.logger import logger
+
+# 这个测试文件验证的是“向量加法”内核是否正确执行。
+# 你可以把整个流程理解成:
+# 1. 准备 program memory 和 data memory,
+# 2. 启动 GPU 仿真,
+# 3. 每拍驱动软件内存模型并打印内部状态,
+# 4. 等待 dut.done 置位,
+# 5. 最后同时检查日志里的写事务轨迹和内存里的最终结果。
+def parse_memwrite_records(log_contents: str):
+ # `re.compile(...)` 会预编译一个正则表达式,用来从日志文本中提取 `[memwrite]` 记录。
+ pattern = re.compile(
+ r"^\[memwrite\] data cycle=(\d+) lane=(\d+) addr=(\d+) old=(\d+) new=(\d+)$"
+ )
+ records = []
+ # splitlines() 会把整个日志按行拆开,便于逐条匹配。
+ for line in log_contents.splitlines():
+ # strip() 去掉首尾空白后再匹配,减少格式噪声影响。
+ match = pattern.match(line.strip())
+ if match:
+ # groups() 会把正则里每个 `(...)` 捕获到的字段按顺序取出来。
+ cycle, lane, addr, old, new = match.groups()
+ records.append(
+ {
+ "cycle": int(cycle),
+ "lane": int(lane),
+ "addr": int(addr),
+ "old": int(old),
+ "new": int(new),
+ }
+ )
+ return records
+
+
@cocotb.test()
async def test_matadd(dut):
+ # `@cocotb.test()` 是装饰器,告诉 cocotb:下面这个协程就是一个测试入口。
+
# Program Memory
- program_memory = Memory(dut=dut, addr_bits=8, data_bits=16, channels=1, name="program")
+ # 这里创建的是 program memory 的 Python 模型,不是 RTL 里的真正 SRAM。
+ program_memory = Memory(
+ dut=dut, addr_bits=8, data_bits=16, channels=1, name="program"
+ )
+ # program 列表里的每个元素都是 16 bit 指令,和 decoder.sv 的指令格式对应。
program = [
- 0b0101000011011110, # MUL R0, %blockIdx, %blockDim
- 0b0011000000001111, # ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx
- 0b1001000100000000, # CONST R1, #0 ; baseA (matrix A base address)
- 0b1001001000001000, # CONST R2, #8 ; baseB (matrix B base address)
- 0b1001001100010000, # CONST R3, #16 ; baseC (matrix C base address)
- 0b0011010000010000, # ADD R4, R1, R0 ; addr(A[i]) = baseA + i
- 0b0111010001000000, # LDR R4, R4 ; load A[i] from global memory
- 0b0011010100100000, # ADD R5, R2, R0 ; addr(B[i]) = baseB + i
- 0b0111010101010000, # LDR R5, R5 ; load B[i] from global memory
- 0b0011011001000101, # ADD R6, R4, R5 ; C[i] = A[i] + B[i]
- 0b0011011100110000, # ADD R7, R3, R0 ; addr(C[i]) = baseC + i
- 0b1000000001110110, # STR R7, R6 ; store C[i] in global memory
- 0b1111000000000000, # RET ; end of kernel
+ 0b0101000011011110, # MUL R0, %blockIdx, %blockDim
+ 0b0011000000001111, # ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx
+ 0b1001000100000000, # CONST R1, #0 ; baseA (matrix A base address)
+ 0b1001001000001000, # CONST R2, #8 ; baseB (matrix B base address)
+ 0b1001001100010000, # CONST R3, #16 ; baseC (matrix C base address)
+ 0b0011010000010000, # ADD R4, R1, R0 ; addr(A[i]) = baseA + i
+ 0b0111010001000000, # LDR R4, R4 ; load A[i] from global memory
+ 0b0011010100100000, # ADD R5, R2, R0 ; addr(B[i]) = baseB + i
+ 0b0111010101010000, # LDR R5, R5 ; load B[i] from global memory
+ 0b0011011001000101, # ADD R6, R4, R5 ; C[i] = A[i] + B[i]
+ 0b0011011100110000, # ADD R7, R3, R0 ; addr(C[i]) = baseC + i
+ 0b1000000001110110, # STR R7, R6 ; store C[i] in global memory
+ 0b1111000000000000, # RET ; end of kernel
]
# Data Memory
+ # data memory 是 8 bit 宽,支持 4 个并行访存通道,对应 GPU 顶层参数。
data_memory = Memory(dut=dut, addr_bits=8, data_bits=8, channels=4, name="data")
data = [
- 0, 1, 2, 3, 4, 5, 6, 7, # Matrix A (1 x 8)
- 0, 1, 2, 3, 4, 5, 6, 7 # Matrix B (1 x 8)
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7, # Matrix A (1 x 8)
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7, # Matrix B (1 x 8)
]
# Device Control
+ # 向量加法一共要启动 8 个线程,每个线程负责一个元素。
threads = 8
+ # setup() 会统一完成时钟、复位、预装内存、写 DCR 和拉起 start。
await setup(
dut=dut,
program_memory=program_memory,
program=program,
data_memory=data_memory,
data=data,
- threads=threads
+ threads=threads,
)
+ # 先打印一次初始内存,便于和最终结果对比。
data_memory.display(24)
cycles = 0
+ # `dut.done` 来自 GPU 顶层输出,表示整个 kernel 已经执行完成。
while dut.done.value != 1:
- data_memory.run()
+ # 每拍都先让 Python 版 data/program memory 处理当前周期的读写请求。
+ data_memory.run(cycle=cycles)
program_memory.run()
+ # `ReadOnly()` 表示等到当前仿真时刻所有 RTL 更新都稳定后,再去读取内部信号做日志。
await cocotb.triggers.ReadOnly()
format_cycle(dut, cycles)
-
+
+ # 最后再等时钟上升沿,推进到下一拍。
await RisingEdge(dut.clk)
cycles += 1
logger.info(f"Completed in {cycles} cycles")
data_memory.display(24)
+ # 打开当前测试生成的日志文件,准备做事后分析。
+ with open(logger.filename, "r") as log_file:
+ log_contents = log_file.read()
+
+ memwrite_records = parse_memwrite_records(log_contents)
+ # zip(data[0:8], data[8:16]) 会把 A 和 B 的对应元素两两配对,然后逐项相加得到期望结果。
expected_results = [a + b for a, b in zip(data[0:8], data[8:16])]
+ expected_addresses = set(range(16, 24))
+
+ # 只保留写到结果区地址范围内的写事务。
+ matching_records = [
+ record for record in memwrite_records if record["addr"] in expected_addresses
+ ]
+
+ # assert 是 Python 里的断言;条件不成立时,测试会立刻失败并打印后面的错误信息。
+ assert matching_records, "Expected [memwrite] data records for matadd writes"
+
+ # 集合推导式 `{... for ... in ...}` 用于提取“实际被写到的地址集合”。
+ addresses_seen = {record["addr"] for record in matching_records}
+ assert addresses_seen == expected_addresses, (
+ "Expected memory write records for addresses 16..23"
+ )
+
+ # 逐个地址检查:日志里至少出现过一次 old=0 -> new=expected 的写入。
+ for i, expected in enumerate(expected_results):
+ relevant_records = [
+ record for record in matching_records if record["addr"] == i + 16
+ ]
+ assert any(
+ record["old"] == 0 and record["new"] == expected
+ for record in relevant_records
+ ), (
+ f"Expected at least one memory write record old=0 new={expected} at address {i + 16}"
+ )
+
+ # 最后再直接检查软件内存模型里的最终值,确保结果确实留在 data memory 中。
for i, expected in enumerate(expected_results):
result = data_memory.memory[i + 16]
- assert result == expected, f"Result mismatch at index {i}: expected {expected}, got {result}"
\ No newline at end of file
+ assert result == expected, (
+ f"Result mismatch at index {i}: expected {expected}, got {result}"
+ )
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 4cc14f7..aef522f 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -1,3 +1,5 @@
+import re
+
import cocotb
from cocotb.triggers import RisingEdge
from .helpers.setup import setup
@@ -5,50 +7,89 @@
from .helpers.format import format_cycle
from .helpers.logger import logger
+
+# 这个测试文件验证的是“2x2 矩阵乘法”内核。
+# 和 matadd 相比,它更能覆盖分支、循环、乘法和多次 load/store 的组合行为。
+def parse_memwrite_records(log_contents: str):
+ # 和 matadd 里同名函数作用相同:把日志中的内存写事务解析成结构化记录。
+ pattern = re.compile(
+ r"^\[memwrite\] data cycle=(\d+) lane=(\d+) addr=(\d+) old=(\d+) new=(\d+)$"
+ )
+ records = []
+ for line in log_contents.splitlines():
+ match = pattern.match(line.strip())
+ if match:
+ cycle, lane, addr, old, new = match.groups()
+ records.append(
+ {
+ "cycle": int(cycle),
+ "lane": int(lane),
+ "addr": int(addr),
+ "old": int(old),
+ "new": int(new),
+ }
+ )
+ return records
+
+
@cocotb.test()
async def test_matadd(dut):
+ # 函数名虽然写成了 test_matadd,但因为有 `@cocotb.test()` 装饰器,它仍会被当作一个独立测试执行。
+ # NOTE(review): 建议后续把函数名改成 test_matmul,避免与 test_matadd.py 里的同名测试混淆;本次改动只补注释、不改行为。
+
# Program Memory
- program_memory = Memory(dut=dut, addr_bits=8, data_bits=16, channels=1, name="program")
+ program_memory = Memory(
+ dut=dut, addr_bits=8, data_bits=16, channels=1, name="program"
+ )
+ # 这段程序实现的是 2x2 矩阵乘法内核:
+ # 先根据线程号算出 row/col,再在 LOOP 里遍历 k,累加 A[row,k] * B[k,col]。
program = [
- 0b0101000011011110, # MUL R0, %blockIdx, %blockDim
- 0b0011000000001111, # ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx
- 0b1001000100000001, # CONST R1, #1 ; increment
- 0b1001001000000010, # CONST R2, #2 ; N (matrix inner dimension)
- 0b1001001100000000, # CONST R3, #0 ; baseA (matrix A base address)
- 0b1001010000000100, # CONST R4, #4 ; baseB (matrix B base address)
- 0b1001010100001000, # CONST R5, #8 ; baseC (matrix C base address)
- 0b0110011000000010, # DIV R6, R0, R2 ; row = i // N
- 0b0101011101100010, # MUL R7, R6, R2
- 0b0100011100000111, # SUB R7, R0, R7 ; col = i % N
- 0b1001100000000000, # CONST R8, #0 ; acc = 0
- 0b1001100100000000, # CONST R9, #0 ; k = 0
- # LOOP:
- 0b0101101001100010, # MUL R10, R6, R2
- 0b0011101010101001, # ADD R10, R10, R9
- 0b0011101010100011, # ADD R10, R10, R3 ; addr(A[i]) = row * N + k + baseA
- 0b0111101010100000, # LDR R10, R10 ; load A[i] from global memory
- 0b0101101110010010, # MUL R11, R9, R2
- 0b0011101110110111, # ADD R11, R11, R7
- 0b0011101110110100, # ADD R11, R11, R4 ; addr(B[i]) = k * N + col + baseB
- 0b0111101110110000, # LDR R11, R11 ; load B[i] from global memory
- 0b0101110010101011, # MUL R12, R10, R11
- 0b0011100010001100, # ADD R8, R8, R12 ; acc = acc + A[i] * B[i]
- 0b0011100110010001, # ADD R9, R9, R1 ; increment k
- 0b0010000010010010, # CMP R9, R2
- 0b0001100000001100, # BRn LOOP ; loop while k < N
- 0b0011100101010000, # ADD R9, R5, R0 ; addr(C[i]) = baseC + i
- 0b1000000010011000, # STR R9, R8 ; store C[i] in global memory
- 0b1111000000000000 # RET ; end of kernel
+ 0b0101000011011110, # MUL R0, %blockIdx, %blockDim
+ 0b0011000000001111, # ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx
+ 0b1001000100000001, # CONST R1, #1 ; increment
+ 0b1001001000000010, # CONST R2, #2 ; N (matrix inner dimension)
+ 0b1001001100000000, # CONST R3, #0 ; baseA (matrix A base address)
+ 0b1001010000000100, # CONST R4, #4 ; baseB (matrix B base address)
+ 0b1001010100001000, # CONST R5, #8 ; baseC (matrix C base address)
+ 0b0110011000000010, # DIV R6, R0, R2 ; row = i // N
+ 0b0101011101100010, # MUL R7, R6, R2
+ 0b0100011100000111, # SUB R7, R0, R7 ; col = i % N
+ 0b1001100000000000, # CONST R8, #0 ; acc = 0
+ 0b1001100100000000, # CONST R9, #0 ; k = 0
+ # LOOP:
+ 0b0101101001100010, # MUL R10, R6, R2
+ 0b0011101010101001, # ADD R10, R10, R9
+ 0b0011101010100011, # ADD R10, R10, R3 ; addr(A[i]) = row * N + k + baseA
+ 0b0111101010100000, # LDR R10, R10 ; load A[i] from global memory
+ 0b0101101110010010, # MUL R11, R9, R2
+ 0b0011101110110111, # ADD R11, R11, R7
+ 0b0011101110110100, # ADD R11, R11, R4 ; addr(B[i]) = k * N + col + baseB
+ 0b0111101110110000, # LDR R11, R11 ; load B[i] from global memory
+ 0b0101110010101011, # MUL R12, R10, R11
+ 0b0011100010001100, # ADD R8, R8, R12 ; acc = acc + A[i] * B[i]
+ 0b0011100110010001, # ADD R9, R9, R1 ; increment k
+ 0b0010000010010010, # CMP R9, R2
+ 0b0001100000001100, # BRn LOOP ; loop while k < N
+ 0b0011100101010000, # ADD R9, R5, R0 ; addr(C[i]) = baseC + i
+ 0b1000000010011000, # STR R9, R8 ; store C[i] in global memory
+ 0b1111000000000000, # RET ; end of kernel
]
# Data Memory
data_memory = Memory(dut=dut, addr_bits=8, data_bits=8, channels=4, name="data")
data = [
- 1, 2, 3, 4, # Matrix A (2 x 2)
- 1, 2, 3, 4, # Matrix B (2 x 2)
+ 1,
+ 2,
+ 3,
+ 4, # Matrix A (2 x 2)
+ 1,
+ 2,
+ 3,
+ 4, # Matrix B (2 x 2)
]
# Device Control
+ # 2x2 结果矩阵有 4 个元素,因此这里启动 4 个线程,每个线程负责一个输出位置。
threads = 4
await setup(
@@ -57,27 +98,47 @@ async def test_matadd(dut):
program=program,
data_memory=data_memory,
data=data,
- threads=threads
+ threads=threads,
)
data_memory.display(12)
cycles = 0
while dut.done.value != 1:
- data_memory.run()
+ # 仿真主循环的结构和 matadd 相同:先驱动 memory model,再读稳定信号,最后推进时钟。
+ data_memory.run(cycle=cycles)
program_memory.run()
await cocotb.triggers.ReadOnly()
+ # 这里把 thread_id=1 传进去,表示只重点打印一个线程的详细内部状态,避免日志过大。
format_cycle(dut, cycles, thread_id=1)
-
+
await RisingEdge(dut.clk)
cycles += 1
logger.info(f"Completed in {cycles} cycles")
data_memory.display(12)
+ with open(logger.filename, "r") as log_file:
+ log_contents = log_file.read()
+
+ memwrite_records = parse_memwrite_records(log_contents)
+ expected_addresses = set(range(8, 12))
+
+ # 结果矩阵被约定写回到 data memory 的 8..11 地址范围。
+ matching_records = [
+ record for record in memwrite_records if record["addr"] in expected_addresses
+ ]
+
+ assert matching_records, "Expected [memwrite] data records for matmul writes"
+
+ addresses_seen = {record["addr"] for record in matching_records}
+ assert addresses_seen == expected_addresses, (
+ "Expected memory write records for addresses 8..11"
+ )
# Assuming the matrices are 2x2 and the result is stored starting at address 9
+ # 先把一维 data 列表重新切成两个 2x2 矩阵,便于直接写出数学期望值。NOTE(review): 上一行旧注释写的 "address 9" 有误,结果实际从地址 8 开始(代码用的是 i + 8)。
matrix_a = [data[0:2], data[2:4]] # First matrix (2x2)
matrix_b = [data[4:6], data[6:8]] # Second matrix (2x2)
expected_results = [
@@ -87,5 +148,18 @@ async def test_matadd(dut):
matrix_a[1][0] * matrix_b[0][1] + matrix_a[1][1] * matrix_b[1][1], # C[1,1]
]
for i, expected in enumerate(expected_results):
+ relevant_records = [
+ record for record in matching_records if record["addr"] == i + 8
+ ]
+ assert any(
+ record["old"] == 0 and record["new"] == expected
+ for record in relevant_records
+ ), (
+ f"Expected at least one memory write record old=0 new={expected} at address {i + 8}"
+ )
+
+ # 和 matadd 一样,再直接检查最终 data memory 的实际落值。NOTE(review): 下一行旧注释 "Results start at address 9" 同样有误,实际起始地址是 8。
result = data_memory.memory[i + 8] # Results start at address 9
- assert result == expected, f"Result mismatch at index {i}: expected {expected}, got {result}"
+ assert result == expected, (
+ f"Result mismatch at index {i}: expected {expected}, got {result}"
+ )