From 7b2c64108770e6a56bfd8b5caca02120f35779eb Mon Sep 17 00:00:00 2001
From: rockdu <kangruidu@gmail.com>
Date: Tue, 9 Jun 2026 23:47:08 -0700
Subject: [PATCH 1/2] ci: bootstrap CPU CI ported from radixark/miles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port the AST-discovered test framework, reusable workflow, pre-commit
config, and seed tests from the miles main repo, trimmed to CPU-only.

Framework (tests/ci/, direct copy):
- ci_register.py / ci_utils.py / run_suite.py — AST-parsed register_cpu_ci
  markers + LPT partitioning + pytest invocation
- cpu_stubs/sgl_kernel — MagicMock stub so sglang's import chain succeeds
  on ubuntu-latest where sgl_kernel (GPU-only) cannot install

Adaptations for miles-D (vs miles main):
- labels.py: rewritten KNOWN_LABELS for miles-D domains
- run_suite.py: PER_COMMIT_SUITES[HWBackend.CUDA] = [] (no GPU runners)
- _run-ci.yml: stripped GPU run: job + Megatron checkout/install
- pr-test.yml: removed all GPU stages
- .pre-commit-config.yaml: dropped ban-mpu-get local hook; added
  exclude: ^flow_grpo/ on all 4 formatter hooks
- pyproject.toml: asyncio_mode = "auto" + ruff extend-exclude flow_grpo

Seed test:
- tests/fast/utils/test_misc.py exercises FunctionRegistry, load_function,
  and should_run_periodic_action

Support helper:
- miles/utils/misc.py: add FunctionRegistry class used by the seed test
  for registering callables under test names

requirements.txt: add pytest + pytest-asyncio for the test runner.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_run-ci.yml              | 137 ++++++++
 .github/workflows/bot-slash-lint.yaml      | 110 ++++++
 .github/workflows/pr-test.yml              |  69 ++++
 .github/workflows/pre-commit.yml           |  41 +++
 .pre-commit-config.yaml                    |  50 +++
 miles/utils/misc.py                        |  38 ++-
 pyproject.toml                             |   4 +
 requirements.txt                           |   4 +-
 tests/__init__.py                          |   0
 tests/ci/__init__.py                       |   0
 tests/ci/ci_register.py                    | 240 +++++++++++++
 tests/ci/ci_utils.py                       | 372 +++++++++++++++++++++
 tests/ci/cpu_stubs/pyproject.toml          |  11 +
 tests/ci/cpu_stubs/sgl_kernel/__init__.py  |  18 +
 tests/ci/cpu_stubs/sgl_kernel/kvcacheio.py |   9 +
 tests/ci/labels.py                         |  27 ++
 tests/ci/run_suite.py                      | 357 ++++++++++++++++++++
 tests/fast/__init__.py                     |   0
 tests/fast/utils/__init__.py               |   0
 tests/fast/utils/test_misc.py              |  84 +++++
 20 files changed, 1569 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/_run-ci.yml
 create mode 100644 .github/workflows/bot-slash-lint.yaml
 create mode 100644 .github/workflows/pr-test.yml
 create mode 100644 .github/workflows/pre-commit.yml
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/ci/__init__.py
 create mode 100644 tests/ci/ci_register.py
 create mode 100644 tests/ci/ci_utils.py
 create mode 100644 tests/ci/cpu_stubs/pyproject.toml
 create mode 100644 tests/ci/cpu_stubs/sgl_kernel/__init__.py
 create mode 100644 tests/ci/cpu_stubs/sgl_kernel/kvcacheio.py
 create mode 100644 tests/ci/labels.py
 create mode 100644 tests/ci/run_suite.py
 create mode 100644 tests/fast/__init__.py
 create mode 100644 tests/fast/utils/__init__.py
 create mode 100644 tests/fast/utils/test_misc.py

diff --git a/.github/workflows/_run-ci.yml b/.github/workflows/_run-ci.yml
new file mode 100644
index 00000000..af0ca236
--- /dev/null
+++ b/.github/workflows/_run-ci.yml
@@ -0,0 +1,137 @@
+name: CI Job
+
+# Reusable workflow ported from radixark/miles (.github/workflows/_run-ci.yml).
+#
+# CPU-only path: GitHub-hosted ubuntu-latest. Trimmed from miles main:
+#   * GPU `run:` job removed entirely — miles-D has no self-hosted GPU
+#     runners. To re-add: copy the `run:` job block back from miles main,
+#     reintroduce `runs_on / container_image / skip_dependency_install /
+#     cpu_runner` inputs, and add a matching `if: !inputs.cpu_runner` gate.
+#   * Megatron-LM checkout/install removed: miles-D source has zero
+#     `from megatron` imports (verified via grep).
+#   * PR-body magic for `ci-megatron-pr:` removed for the same reason.
+#   * sglang checkout/install preserved (miles-D imports
+#     sglang.multimodal_gen.runtime.* and sglang.srt.*).
+
+on:
+  workflow_call:
+    inputs:
+      execute_command:
+        type: string
+        required: true
+
+# TODO: run gpu
+jobs:
+  run-cpu:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    env:
+      PYTHONPATH: ${{ github.workspace }}
+    steps:
+      - name: Free disk space
+        shell: bash
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
+          df -h
+
+      - name: Checkout miles-diffusion
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          # 3.10 matches miles-D's setup.py / pyproject (python_requires=">=3.10").
+          python-version: '3.10'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install system dependencies
+        shell: bash
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y protobuf-compiler
+          protoc --version
+
+      # TODO: sglang-diffusion-rollout-test to be switched back to sgl main
+      - name: Resolve dependency refs
+        id: resolve-refs
+        shell: bash
+        env:
+          PR_BODY: ${{ github.event.pull_request.body || '' }}
+          INPUT_SGLANG_PR: ${{ github.event.inputs.ci_sglang_pr || '' }}
+          INPUT_SGLANG_REPO: ${{ github.event.inputs.ci_sglang_repo || '' }}
+        run: |
+          SGLANG_PR="${INPUT_SGLANG_PR}"
+          SGLANG_REPO="${INPUT_SGLANG_REPO}"
+          if [ -n "$PR_BODY" ]; then
+            PR_SGLANG_PR=$(echo "$PR_BODY" | grep -m1 -oP '^ci-sglang-pr:\s+\K\S+' || true)
+            [ -z "$SGLANG_PR" ] && [ -n "$PR_SGLANG_PR" ] && SGLANG_PR="$PR_SGLANG_PR"
+            PR_SGLANG_REPO=$(echo "$PR_BODY" | grep -m1 -oP '^ci-sglang-repo:\s+\K\S+' || true)
+            [ -z "$SGLANG_REPO" ] && [ -n "$PR_SGLANG_REPO" ] && SGLANG_REPO="$PR_SGLANG_REPO"
+          fi
+          [ -z "$SGLANG_PR" ] && SGLANG_PR="sglang-diffusion-rollout-test"
+          # TODO: default repo Rockdu/sglang to be switched back to sgl-project/sglang
+          [ -z "$SGLANG_REPO" ] && SGLANG_REPO="Rockdu/sglang"
+          resolve_fetch_ref() {
+            local ref="$1"
+            if [[ "$ref" =~ ^#([0-9]+)$ ]]; then
+              echo "refs/pull/${BASH_REMATCH[1]}/head"
+            else
+              echo "$ref"
+            fi
+          }
+          SGLANG_FETCH=$(resolve_fetch_ref "$SGLANG_PR")
+          echo "ci_sglang_pr=$SGLANG_FETCH" >> $GITHUB_OUTPUT
+          echo "sglang_repo=$SGLANG_REPO" >> $GITHUB_OUTPUT
+          echo "Resolved: sglang repo=$SGLANG_REPO ref=$SGLANG_PR -> fetch=$SGLANG_FETCH"
+
+      # TODO: default sglang repo (Rockdu/sglang) to be switched back to sgl main
+      - name: Checkout temporary sglang
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ steps.resolve-refs.outputs.sglang_repo }}
+          ref: ${{ steps.resolve-refs.outputs.ci_sglang_pr }}
+          path: sglang
+      
+      # - name: Checkout sglang
+      #   uses: actions/checkout@v4
+      #   with:
+      #     repository: sgl-project/sglang
+      #     ref: ${{ steps.resolve-refs.outputs.ci_sglang_pr }}
+      #     path: sglang
+
+      - name: Install dependencies
+        shell: bash
+        env:
+          UV_SYSTEM_PYTHON: "1"
+        run: |
+          uv pip install -e sglang/python --no-deps
+          uv pip install -r requirements.txt
+          uv pip install -e . --no-deps
+          # sglang is installed --no-deps above, but miles-D's import chain
+          # loads many sglang modules. Install sglang's pure-python runtime
+          # deps upfront; skip GPU-only ones (cuda-python, flashinfer,
+          # flash-attn-4, sglang-kernel, torchao, torchcodec,
+          # torch_memory_saver, quack-kernels, nvidia-cutlass-dsl,
+          # apache-tvm-ffi, kernels, decord2, av). torch itself comes via
+          # accelerate (in requirements.txt).
+          uv pip install \
+            IPython aiohttp anthropic build compressed-tensors einops fastapi \
+            gguf interegular jsonschema llguidance mistral_common modelscope \
+            msgspec ninja nvidia-ml-py openai openai-harmony orjson outlines \
+            partial_json_parser prometheus-client psutil py-spy pydantic \
+            python-multipart pyzmq scipy sentencepiece setproctitle soundfile \
+            tiktoken timm torchvision uvicorn uvloop watchfiles xgrammar
+          # sglang's memory_pool_host.py unconditionally imports sgl_kernel
+          # on non-NPU/XPU/MPS hardware. sgl_kernel is GPU-only — install a
+          # local stub so imports succeed at module load.
+          uv pip install tests/ci/cpu_stubs
+
+      - name: Resolve suite plan
+        shell: bash
+        run: ${{ inputs.execute_command }} --list-only
+
+      - name: Run tests
+        shell: bash
+        run: ${{ inputs.execute_command }}
diff --git a/.github/workflows/bot-slash-lint.yaml b/.github/workflows/bot-slash-lint.yaml
new file mode 100644
index 00000000..ea234876
--- /dev/null
+++ b/.github/workflows/bot-slash-lint.yaml
@@ -0,0 +1,110 @@
+name: Slash Command Handler
+
+on:
+  issue_comment:
+    types: [created, edited]
+
+permissions:
+  contents: write      # Required to push commits back to PR branch
+  actions: write       # Required to rerun workflows
+  issues: write        # Required for comment reactions in some contexts
+
+jobs:
+  slash_lint_codebase:
+    # Only run if it is a PR comment with a recognized command
+    if: >
+      github.event_name == 'issue_comment' &&
+      github.event.issue.pull_request &&
+      (
+        contains(github.event.comment.body, '/tag-run-lint') ||
+        contains(github.event.comment.body, '/run-lint')
+      )
+    runs-on: ubuntu-latest
+    steps:
+      - name: React to command comment (ack)
+        if: always()
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const commentId = context.payload.comment.id;
+            // Add an eyes reaction to acknowledge the command
+            await github.request('POST /repos/{owner}/{repo}/issues/comments/{comment_id}/reactions', {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: commentId,
+              content: 'eyes'
+            });
+
+      - name: Check out Git repository
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ github.repository }}
+          ref: refs/pull/${{ github.event.issue.number }}/head    
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Run pre-commit hooks
+        continue-on-error: true
+        uses: pre-commit/action@v3.0.1
+
+      - name: Get PR branch name
+        id: get_branch
+        run: |
+            BRANCH_NAME=$(gh pr view ${{ github.event.issue.number }} --json headRefName --jq '.headRefName')
+            echo "branch_name=$BRANCH_NAME" >> $GITHUB_OUTPUT
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Check if there are any changes
+        id: verify_diff
+        run: |
+            git diff --quiet . || echo "changed=true" >> $GITHUB_OUTPUT
+
+      - name: Commit files
+        if: steps.verify_diff.outputs.changed == 'true'
+        run: |
+            git config --local user.email "action@github.com"
+            git config --local user.name "GitHub Action"
+            git add .
+            git commit -m "[CI-Lint] Fix code style issues with pre-commit ${{ github.sha }}" -a
+            git push origin HEAD:refs/heads/${{ steps.get_branch.outputs.branch_name }}
+
+  cleanup_reaction:
+    # Always run after the main job completes (success, failure, or cancelled)
+    if: always()
+    needs: slash_lint_codebase
+    runs-on: ubuntu-latest
+    steps:
+      - name: Remove initial ack reaction
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const commentId = context.payload.comment.id;
+            // List reactions on the comment
+            const reactions = await github.rest.reactions.listForIssueComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: commentId
+            }).then(r => r.data);
+            // Find the 'eyes' reaction added by this workflow bot
+            const target = reactions.find(r => r.content === 'eyes' && r.user && r.user.login === 'github-actions[bot]');
+            if (target) {
+              try {
+                await github.rest.reactions.deleteForIssueComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  comment_id: commentId,
+                  reaction_id: target.id
+                });
+                core.info(`Successfully deleted eyes reaction (${target.id})`);
+              } catch (err) {
+                // Non-fatal: reaction may already be gone or inaccessible
+                core.info(`Could not delete eyes reaction (${target.id}): ${err.message || err.status || 'unknown error'}`);
+              }
+            } else {
+              core.info('No eyes reaction from github-actions[bot] found to remove.');
+            }
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
new file mode 100644
index 00000000..5dd3d236
--- /dev/null
+++ b/.github/workflows/pr-test.yml
@@ -0,0 +1,69 @@
+name: PR Test
+
+# Ported from radixark/miles (.github/workflows/pr-test.yml).
+#
+# Differences vs miles main:
+#   * All GPU stages (stage-b-2-gpu-h200, stage-c-8-gpu-h100,
+#     stage-c-4-gpu-h200, stage-c-2-gpu-h200) removed — miles-D has no
+#     self-hosted GPU runner fleet yet. To re-enable, copy the job blocks
+#     back from miles main and provision matching runners.
+#   * `resolve-ci-image` job removed — only relevant for GPU stages that
+#     pull a `radixark/miles:<tag>` container.
+#   * `ci_megatron_pr` workflow_dispatch input removed — miles-D source has
+#     zero `from megatron` imports.
+
+on:
+  pull_request:
+    types: [synchronize, labeled, opened, reopened]
+  workflow_dispatch:
+    inputs:
+      ci_sglang_pr:
+        description: 'SGLang branch/commit (default: sglang-miles-diffusion)'
+        required: false
+        type: string
+        default: 'sglang-miles-diffusion'
+      ci_sglang_repo:
+        description: 'SGLang repository owner/name (default: Rockdu/sglang)'
+        required: false
+        type: string
+        default: 'Rockdu/sglang'
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+# PR labels are passed through to run_suite.py as raw `run-ci-<X>` strings via
+# `--labels`. run_suite.py strips the `run-ci-` prefix internally and ignores
+# any label that does not start with `run-ci-` (see tests/ci/run_suite.py
+# `strip_run_ci_prefix`). For `workflow_dispatch`, no PR labels exist, so the
+# `--labels` list collapses to empty. `--match-all-labels` is added for every
+# `workflow_dispatch` run and for PRs labeled `run-ci-image` or `run-ci-all`;
+# it bypasses the labels predicate so every enabled test in the suite runs.
+
+jobs:
+  # Stage A: CPU-only fast tests (always runs on PR)
+  stage-a-cpu:
+    if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request)
+    uses: ./.github/workflows/_run-ci.yml
+    with:
+      execute_command: >-
+        python tests/ci/run_suite.py --hw cpu --suite stage-a-cpu
+        --labels ${{ join(github.event.pull_request.labels.*.name, ' ') }}
+        ${{ (github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'run-ci-image') || contains(github.event.pull_request.labels.*.name, 'run-ci-all')) && '--match-all-labels' || '' }}
+    secrets: inherit
+
+  # Stage B: CPU-only slower bucket (currently empty; reserved for future
+  # CPU tests that don't fit stage-a-cpu's fast budget). Always runs on PR.
+  # Empty result exits 0 in run_suite.py.
+  stage-b-cpu:
+    if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request)
+    uses: ./.github/workflows/_run-ci.yml
+    with:
+      execute_command: >-
+        python tests/ci/run_suite.py --hw cpu --suite stage-b-cpu
+        --labels ${{ join(github.event.pull_request.labels.*.name, ' ') }}
+        ${{ (github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'run-ci-image') || contains(github.event.pull_request.labels.*.name, 'run-ci-all')) && '--match-all-labels' || '' }}
+    secrets: inherit
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 00000000..d0e05b27
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,41 @@
+name: pre-commit
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+
+permissions:
+  contents: read
+
+jobs:
+  run-pre-commit:
+    name: Run pre-commit
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+
+      - name: Install pre-commit
+        run: pip install --upgrade pip pre-commit
+
+      - name: Cache pre-commit environments
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pre-commit
+          key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}
+          restore-keys: |
+            pre-commit-${{ runner.os }}-
+
+      - name: Run pre-commit on all files
+        run: pre-commit run --all-files --show-diff-on-failure --color=always
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..c7ccbdcb
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,50 @@
+default_language_version:
+  python: python3
+
+ci:
+  autofix_prs: true
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+  autoupdate_schedule: quarterly
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-yaml
+      - id: check-case-conflict
+      - id: detect-private-key
+      - id: check-added-large-files
+        args: ['--maxkb=1000']
+      - id: requirements-txt-fixer
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.14.7
+    hooks:
+      - id: ruff-check
+        args: [ --fix ]
+        exclude: ^flow_grpo/
+
+  - repo: https://github.com/PyCQA/autoflake
+    rev: v2.0.2
+    hooks:
+      - id: autoflake
+        args: [--remove-all-unused-imports, --in-place]
+        exclude: ^flow_grpo/
+
+  - repo: https://github.com/pycqa/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        args:
+          - "--profile=black"
+          - "--filter-files"
+        additional_dependencies: []
+        exclude: ^flow_grpo/
+
+  - repo: https://github.com/psf/black
+    rev: 24.3.0
+    hooks:
+      - id: black
+        name: Format code
+        additional_dependencies: ['click==8.0.2']
+        exclude: ^flow_grpo/
diff --git a/miles/utils/misc.py b/miles/utils/misc.py
index 94407cb2..3afb6833 100644
--- a/miles/utils/misc.py
+++ b/miles/utils/misc.py
@@ -1,16 +1,52 @@
 import importlib
+from contextlib import contextmanager
 
 import ray
 
 from miles.utils.http_utils import is_port_available
 
 
+# Mainly used in tests, where `load_function` needs to resolve many functions generated on the fly.
+class FunctionRegistry:
+    def __init__(self):
+        self._registry: dict[str, object] = {}
+
+    @contextmanager
+    def temporary(self, name: str, fn: object):
+        self._register(name, fn)
+        try:
+            yield
+        finally:
+            self._unregister(name)
+
+    def get(self, name: str) -> object | None:
+        return self._registry.get(name)
+
+    def _register(self, name: str, fn: object) -> None:
+        assert name not in self._registry
+        self._registry[name] = fn
+
+    def _unregister(self, name: str) -> None:
+        assert name in self._registry
+        self._registry.pop(name)
+
+
+function_registry = FunctionRegistry()
+
+
 def load_function(path):
     """
-    Load a function from a module.
+    Load a function from registry or module.
     :param path: The path to the function, e.g. "module.submodule.function".
     :return: The function object.
     """
+    if path is None:
+        return None
+
+    registered = function_registry.get(path)
+    if registered is not None:
+        return registered
+
     module_path, _, attr = path.rpartition(".")
     module = importlib.import_module(module_path)
     return getattr(module, attr)
diff --git a/pyproject.toml b/pyproject.toml
index 7d9ca6f6..b0f7628e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,9 @@ line_length = 119
 
 [tool.ruff]
 line-length = 320  # TODO
+extend-exclude = ["flow_grpo"]
+
+[tool.ruff.lint]
 select = [
     "E",      # Pycodestyle Errors (Structural/Fundamental Errors like bad indentation)
     "F",      # Pyflakes (Core Errors: Unused imports, undefined names)
@@ -40,6 +43,7 @@ ignore = [
 # -vv will also display tests with duration = 0.00s
 addopts = "--verbose --pyargs --durations=0 --strict-markers"  # always add these arguments to pytest
 testpaths = ["./tests"]  # must be an explicit path to avoid importing another "tests" module
+asyncio_mode = "auto"  # async tests work without @pytest.mark.asyncio
 # directories to ignore when discovering tests
 norecursedirs = [
     "external",
diff --git a/requirements.txt b/requirements.txt
index af922654..e0114142 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,13 +17,15 @@ peft==0.18.1
 pillow==11.3.0
 pydantic==2.12.5
 pylatexenc==2.10
+pytest>=7.0.0
+pytest-asyncio
 python-Levenshtein==0.27.3
 pyyaml==6.0.1
 qwen_vl_utils==0.0.14
 ray[default]==2.53.0
 requests==2.32.5
-safetensors==0.7.0
 ring_flash_attn==0.1.8
+safetensors==0.7.0
 sglang-router==0.3.0
 tensorboard==2.20.0
 transformers==5.5.4
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/ci/__init__.py b/tests/ci/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/ci/ci_register.py b/tests/ci/ci_register.py
new file mode 100644
index 00000000..d2cd7f73
--- /dev/null
+++ b/tests/ci/ci_register.py
@@ -0,0 +1,240 @@
+import ast
+import warnings
+from dataclasses import dataclass, field
+from enum import Enum, auto
+
+from tests.ci.labels import KNOWN_LABELS
+
+__all__ = [
+    "HWBackend",
+    "CIRegistry",
+    "collect_tests",
+    "register_cpu_ci",
+    "register_cuda_ci",
+    "ut_parse_one_file",
+]
+
+# Only these two parameters may be passed positionally; everything else
+# (labels, nightly, disabled) is keyword-only.
+_POSITIONAL_PARAMS = ("est_time", "suite")
+
+# All accepted keyword arguments; the positional pair may also be passed by keyword.
+_VALID_KWARGS = frozenset({"est_time", "suite", "labels", "nightly", "disabled"})
+
+_REGISTER_NAMES = frozenset({"register_cpu_ci", "register_cuda_ci"})
+
+_UNSET = object()
+
+
+class HWBackend(Enum):
+    CPU = auto()
+    CUDA = auto()
+
+
+@dataclass
+class CIRegistry:
+    backend: HWBackend
+    filename: str
+    est_time: float
+    suite: str
+    labels: list[str] = field(default_factory=list)
+    nightly: bool = False
+    disabled: str | None = None  # None = enabled, string = disabled reason
+
+
+def register_cpu_ci(
+    est_time: float,
+    suite: str,
+    *,
+    labels: list[str] | None = None,
+    nightly: bool = False,
+    disabled: str | None = None,
+):
+    """Marker for CPU CI registration (parsed via AST; runtime no-op).
+
+    `labels=None` and `labels=[]` are equivalent: the test runs on every PR
+    regardless of `run-ci-*` labels. A non-empty `labels` list gates the test
+    on PR labels — the test runs when the PR carries `run-ci-<x>` for any
+    `<x>` in `labels`.
+    """
+    return None
+
+
+def register_cuda_ci(
+    est_time: float,
+    suite: str,
+    *,
+    labels: list[str] | None = None,
+    nightly: bool = False,
+    disabled: str | None = None,
+):
+    """Marker for CUDA CI registration (parsed via AST; runtime no-op).
+
+    See `register_cpu_ci` for label semantics.
+    """
+    return None
+
+
+_REGISTER_BACKEND_MAP = {
+    "register_cpu_ci": HWBackend.CPU,
+    "register_cuda_ci": HWBackend.CUDA,
+}
+
+
+def _extract_constant(node: ast.AST) -> object:
+    """Return the literal value of an ast.Constant; otherwise return _UNSET.
+
+    Sentinel return (instead of raising) lets callers compose richer error
+    messages with parameter names and file paths.
+    """
+    if isinstance(node, ast.Constant):
+        return node.value
+    return _UNSET
+
+
+def _extract_list_constant(node: ast.AST, *, context: str = "value") -> list:
+    """Return a list of literal string constants from `ast.List`.
+
+    Accepts `None` (as `ast.Constant(None)`) and treats it as an empty list,
+    so callers may write `labels=None` interchangeably with `labels=[]`.
+
+    Raises ValueError when the node is neither a list literal of string
+    constants nor a literal `None`.
+    """
+    if isinstance(node, ast.Constant) and node.value is None:
+        return []
+    if not isinstance(node, ast.List):
+        raise ValueError(f"{context} must be a list of string literals or None (got {type(node).__name__})")
+    out: list = []
+    for elt in node.elts:
+        v = _extract_constant(elt)
+        if v is _UNSET:
+            raise ValueError(f"{context} must be a list of string literals (non-literal element)")
+        if not isinstance(v, str):
+            raise ValueError(f"{context} must be a list of string literals (got {type(v).__name__} element)")
+        out.append(v)
+    return out
+
+
+class RegistryVisitor(ast.NodeVisitor):
+    def __init__(self, filename: str):
+        self.filename = filename
+        self.registries: list[CIRegistry] = []
+
+    def _parse_call_args(self, func_call: ast.Call, func_name: str) -> CIRegistry:
+        if any(isinstance(arg, ast.Starred) for arg in func_call.args):
+            raise ValueError(f"{self.filename}: starred arguments are not supported in {func_name}()")
+
+        if len(func_call.args) > len(_POSITIONAL_PARAMS):
+            raise ValueError(
+                f"{self.filename}: too many positional arguments in {func_name}(); "
+                f"only {list(_POSITIONAL_PARAMS)} may be positional "
+                f"(labels and later are keyword-only)"
+            )
+
+        parsed: dict[str, object] = {}
+
+        for name, arg in zip(_POSITIONAL_PARAMS, func_call.args, strict=False):
+            v = _extract_constant(arg)
+            if v is _UNSET:
+                raise ValueError(f"{self.filename}: {name} in {func_name}() must be a literal constant")
+            parsed[name] = v
+
+        for kw in func_call.keywords:
+            if kw.arg is None:
+                raise ValueError(f"{self.filename}: **kwargs are not supported in {func_name}()")
+            if kw.arg in parsed:
+                raise ValueError(f"{self.filename}: duplicated argument '{kw.arg}' in {func_name}()")
+            if kw.arg not in _VALID_KWARGS:
+                raise ValueError(f"{self.filename}: unknown argument '{kw.arg}' in {func_name}()")
+            if kw.arg == "labels":
+                parsed["labels"] = _extract_list_constant(
+                    kw.value, context=f"{self.filename}: labels in {func_name}()"
+                )
+            else:
+                v = _extract_constant(kw.value)
+                if v is _UNSET:
+                    raise ValueError(f"{self.filename}: {kw.arg} in {func_name}() must be a literal constant")
+                parsed[kw.arg] = v
+
+        if "est_time" not in parsed:
+            raise ValueError(f"{self.filename}: est_time is required in {func_name}()")
+        if "suite" not in parsed:
+            raise ValueError(f"{self.filename}: suite is required in {func_name}()")
+
+        if not isinstance(parsed["est_time"], (int, float)):
+            raise ValueError(f"{self.filename}: est_time must be a number in {func_name}()")
+        if not isinstance(parsed["suite"], str):
+            raise ValueError(f"{self.filename}: suite must be a string in {func_name}()")
+
+        # `labels` is optional. Missing / None / [] all mean "always run on
+        # every PR"; only a non-empty list gates the test on PR labels.
+        labels = parsed.get("labels", [])
+        if not isinstance(labels, list):
+            raise ValueError(f"{self.filename}: labels must be a list or None in {func_name}()")
+
+        nightly = parsed.get("nightly", False)
+        if not isinstance(nightly, bool):
+            raise ValueError(f"{self.filename}: nightly must be a boolean in {func_name}()")
+
+        disabled = parsed.get("disabled", None)
+        if disabled is not None and not isinstance(disabled, str):
+            raise ValueError(f"{self.filename}: disabled must be a string or None in {func_name}()")
+
+        unknown = [label for label in labels if label not in KNOWN_LABELS]
+        if unknown:
+            valid_list = ", ".join(sorted(KNOWN_LABELS))
+            raise ValueError(
+                f"{self.filename}: unknown labels {unknown} in {func_name}(); "
+                f"valid labels: [{valid_list}]. "
+                f"To add a new label: edit tests/ci/labels.py + create matching "
+                f"`run-ci-<label>` in GitHub repo Settings -> Labels."
+            )
+
+        return CIRegistry(
+            backend=_REGISTER_BACKEND_MAP[func_name],
+            filename=self.filename,
+            est_time=float(parsed["est_time"]),
+            suite=parsed["suite"],
+            labels=list(labels),
+            nightly=nightly,
+            disabled=disabled,
+        )
+
+    def _collect_ci_registry(self, func_call: ast.Call):
+        if not isinstance(func_call.func, ast.Name):
+            return None
+        if func_call.func.id not in _REGISTER_NAMES:
+            return None
+        return self._parse_call_args(func_call, func_call.func.id)
+
+    def visit_Module(self, node):
+        for stmt in node.body:
+            if not isinstance(stmt, ast.Expr) or not isinstance(stmt.value, ast.Call):
+                continue
+            cr = self._collect_ci_registry(stmt.value)
+            if cr is not None:
+                self.registries.append(cr)
+
+
+def ut_parse_one_file(filename: str) -> list[CIRegistry]:
+    with open(filename) as f:
+        file_content = f.read()
+    tree = ast.parse(file_content, filename=filename)
+    visitor = RegistryVisitor(filename=filename)
+    visitor.visit(tree)
+    return visitor.registries
+
+
+def collect_tests(files: list[str], sanity_check: bool = True) -> list[CIRegistry]:
+    ci_tests: list[CIRegistry] = []
+    for file in files:
+        registries = ut_parse_one_file(file)
+        if len(registries) == 0:
+            msg = f"No CI registry found in {file}"
+            if sanity_check:
+                raise ValueError(msg)
+            warnings.warn(msg, stacklevel=2)
+            continue
+        ci_tests.extend(registries)
+    return ci_tests
diff --git a/tests/ci/ci_utils.py b/tests/ci/ci_utils.py
new file mode 100644
index 00000000..c2e7459a
--- /dev/null
+++ b/tests/ci/ci_utils.py
@@ -0,0 +1,372 @@
+import logging
+import os
+import re
+import signal
+import subprocess
+import threading
+import time
+from collections.abc import Callable
+from dataclasses import dataclass
+
+from tests.ci.ci_register import CIRegistry
+
+# Configure logger to output to stdout
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TestFile:
+    name: str
+    estimated_time: float = 60
+
+
+# Patterns that indicate retriable accuracy/performance failures
+RETRIABLE_PATTERNS = [
+    r"AssertionError:.*not greater than",
+    r"AssertionError:.*not less than",
+    r"AssertionError:.*not equal to",
+    r"AssertionError:.*!=.*expected",
+    r"accuracy",
+    r"score",
+    r"latency",
+    r"throughput",
+    r"timeout",
+]
+
+# Patterns that indicate non-retriable failures (real code errors)
+NON_RETRIABLE_PATTERNS = [
+    r"SyntaxError",
+    r"ImportError",
+    r"ModuleNotFoundError",
+    r"NameError",
+    r"TypeError",
+    r"AttributeError",
+    r"RuntimeError",
+    r"CUDA out of memory",
+    r"OOM",
+    r"Segmentation fault",
+    r"core dumped",
+    r"ConnectionRefusedError",
+    r"FileNotFoundError",
+]
+
+
+def is_retriable_failure(output: str) -> tuple[bool, str]:
+    """
+    Determine if a test failure is retriable based on output patterns.
+
+    Returns:
+        tuple: (is_retriable, reason)
+    """
+    # Check for non-retriable patterns first
+    for pattern in NON_RETRIABLE_PATTERNS:
+        if re.search(pattern, output, re.IGNORECASE):
+            return False, f"non-retriable error: {pattern}"
+
+    # Check for retriable patterns
+    for pattern in RETRIABLE_PATTERNS:
+        if re.search(pattern, output, re.IGNORECASE):
+            return True, f"retriable pattern: {pattern}"
+
+    # If we have an AssertionError but didn't match non-retriable, assume retriable
+    if re.search(r"AssertionError", output):
+        return True, "AssertionError (assuming retriable)"
+
+    # Default: not retriable
+    return False, "unknown failure type"
+
+
+def _kill_process_tree(pgid: int):
+    """Kill a process group by its PGID."""
+    try:
+        os.killpg(pgid, signal.SIGKILL)
+    except ProcessLookupError:
+        pass
+    except Exception as e:
+        logger.warning(f"Error killing process group {pgid}: {e}")
+
+
+def run_with_timeout(
+    func: Callable,
+    args: tuple = (),
+    kwargs: dict | None = None,
+    timeout: float = None,
+):
+    """Run a function with timeout."""
+    ret_value = []
+    exception_holder = []
+
+    def _target_func():
+        try:
+            ret_value.append(func(*args, **(kwargs or {})))
+        except Exception as e:
+            exception_holder.append(e)
+
+    t = threading.Thread(target=_target_func)
+    t.start()
+    t.join(timeout=timeout)
+    if t.is_alive():
+        raise TimeoutError()
+
+    if exception_holder:
+        raise exception_holder[0]
+
+    if not ret_value:
+        raise RuntimeError("Thread completed but no return value or exception was captured.")
+
+    return ret_value[0]
+
+
+def write_github_step_summary(content: str):
+    """Write content to GitHub Step Summary if available."""
+    summary_file = os.environ.get("GITHUB_STEP_SUMMARY")
+    if summary_file:
+        with open(summary_file, "a") as f:
+            f.write(content)
+
+
+def _gha_emit_group(title: str) -> None:
+    if os.environ.get("GITHUB_ACTIONS") != "true":
+        return
+    safe = title.replace("%", "%25").replace("\r", "%0D").replace("\n", "%0A")
+    print(f"::group::{safe}", flush=True)
+
+
+def _gha_emit_endgroup() -> None:
+    if os.environ.get("GITHUB_ACTIONS") != "true":
+        return
+    print("::endgroup::", flush=True)
+
+
+def _gha_emit_summary(
+    i: int,
+    n: int,
+    filename: str,
+    status: str,
+    elapsed: float,
+    exit_code: int | None = None,
+    timeout_after: float | None = None,
+    retry_of: int | None = None,
+) -> None:
+    if os.environ.get("GITHUB_ACTIONS") != "true":
+        return
+    safe_name = filename.replace("\r", "\\r").replace("\n", "\\n")
+    line = f"[{i}/{n}] {safe_name}  {status}  elapsed={int(elapsed)}s"
+    if exit_code is not None:
+        line += f" exit={int(exit_code)}"
+    if timeout_after is not None:
+        line += f" timeout_after={int(timeout_after)}s"
+    if retry_of is not None:
+        line += f" retry_of=attempt={int(retry_of)}"
+    print(line, flush=True)
+
+
+def run_unittest_files(
+    files: list[TestFile] | list[CIRegistry],
+    timeout_per_file: float,
+    continue_on_error: bool = False,
+    enable_retry: bool = False,
+    max_attempts: int = 2,
+    retry_wait_seconds: int = 60,
+):
+    """
+    Run a list of test files.
+
+    Args:
+        files: List of TestFile or CIRegistry objects to run
+        timeout_per_file: Timeout in seconds for each test file
+        continue_on_error: If True, continue running remaining tests even if one fails.
+                          If False, stop at first failure (default behavior for PR tests).
+        enable_retry: If True, retry failed tests that appear to be accuracy/performance
+                     assertion failures (not code errors).
+        max_attempts: Maximum number of attempts per file including initial run (default: 2).
+        retry_wait_seconds: Seconds to wait between retries (default: 60).
+    """
+    tic = time.perf_counter()
+    success = True
+    passed_tests = []
+    failed_tests = []
+    retried_tests = []  # Track which tests were retried
+
+    for i, file in enumerate(files):
+        if isinstance(file, CIRegistry):
+            filename, estimated_time = file.filename, file.est_time
+        else:
+            filename, estimated_time = file.name, file.estimated_time
+
+        effective_timeout = max(timeout_per_file, int(estimated_time * 1.25))
+
+        process = None
+        output_lines = []
+
+        def run_one_file(filename, capture_output=False, _i=i, _estimated_time=estimated_time):
+            nonlocal process, output_lines
+
+            full_path = os.path.join(os.getcwd(), filename)
+            logger.info(f".\n.\nBegin ({_i}/{len(files) - 1}):\npython3 {full_path}\n.\n.\n")
+            file_tic = time.perf_counter()
+
+            if capture_output:
+                # Capture output for retry decision
+                process = subprocess.Popen(
+                    ["python3", full_path],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                    errors="ignore",
+                    start_new_session=True,
+                )
+                output_lines = []
+                for line in process.stdout:
+                    logger.info(line.rstrip())
+                    output_lines.append(line)
+                process.wait()
+            else:
+                process = subprocess.Popen(
+                    ["python3", full_path],
+                    stdout=None,
+                    stderr=None,
+                    start_new_session=True,
+                )
+                process.wait()
+
+            elapsed = time.perf_counter() - file_tic
+
+            logger.info(f".\n.\nEnd ({_i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {_estimated_time=}\n.\n.\n")
+            return process.returncode
+
+        # Retry loop for each file
+        attempt = 1
+        file_passed = False
+        was_retried = False
+
+        while attempt <= (max_attempts if enable_retry else 1):
+            if attempt > 1:
+                logger.info(f"\n[CI Retry] Attempt {attempt}/{max_attempts} for {filename}\n")
+                was_retried = True
+
+            attempt_tic = time.perf_counter()
+            current_attempt = attempt
+            group_title = f"{filename}  ({i + 1}/{len(files)} est={int(estimated_time)}s attempt={current_attempt})"
+            _gha_emit_group(group_title)
+            attempt_status: str | None = None
+            attempt_exit_code: int | None = None
+            attempt_timeout_after: float | None = None
+            attempt_elapsed: float = 0.0
+
+            try:
+                try:
+                    ret_code = run_with_timeout(
+                        run_one_file,
+                        args=(filename,),
+                        kwargs={"capture_output": enable_retry},
+                        timeout=effective_timeout,
+                    )
+                    attempt_elapsed = time.perf_counter() - attempt_tic
+
+                    if ret_code == 0:
+                        attempt_status = "PASS"
+                        file_passed = True
+                        if was_retried:
+                            logger.info(f"\nPASSED on retry (attempt {attempt}): {filename}\n")
+                            retried_tests.append((filename, attempt, "passed"))
+                        passed_tests.append(filename)
+                        break
+                    else:
+                        attempt_status = "FAIL"
+                        attempt_exit_code = ret_code
+                        # Check if we should retry
+                        if enable_retry and attempt < max_attempts:
+                            output = "".join(output_lines)
+                            is_retriable, reason = is_retriable_failure(output)
+
+                            if is_retriable:
+                                logger.info(f"\n[CI Retry] {filename} failed with {reason}")
+                                logger.info(f"[CI Retry] Waiting {retry_wait_seconds}s before retry...\n")
+                                time.sleep(retry_wait_seconds)
+                                attempt += 1
+                                continue
+                            else:
+                                logger.info(f"\n[CI Retry] {filename} failed with {reason} - not retrying\n")
+
+                        # No retry or not retriable
+                        logger.info(f"\nFAILED: {filename} returned exit code {ret_code}\n")
+                        if was_retried:
+                            retried_tests.append((filename, attempt, "failed"))
+                        failed_tests.append((filename, f"exit code {ret_code}"))
+                        break
+
+                except TimeoutError:
+                    attempt_elapsed = time.perf_counter() - attempt_tic
+                    attempt_status = "TIMEOUT"
+                    attempt_timeout_after = effective_timeout
+                    _kill_process_tree(process.pid)
+                    time.sleep(5)
+                    logger.info(f"\nTIMEOUT: {filename} after {effective_timeout} seconds\n")
+                    if was_retried:
+                        retried_tests.append((filename, attempt, "timeout"))
+                    failed_tests.append((filename, f"timeout after {effective_timeout}s"))
+                    break
+                except Exception:
+                    attempt_elapsed = time.perf_counter() - attempt_tic
+                    attempt_status = "FAIL"
+                    raise
+            finally:
+                _gha_emit_endgroup()
+                if attempt_status is not None:
+                    _gha_emit_summary(
+                        i + 1,
+                        len(files),
+                        filename,
+                        attempt_status,
+                        elapsed=attempt_elapsed,
+                        exit_code=attempt_exit_code,
+                        timeout_after=attempt_timeout_after,
+                        retry_of=(current_attempt - 1) if current_attempt >= 2 else None,
+                    )
+
+        if not file_passed:
+            success = False
+            if not continue_on_error:
+                break
+
+    elapsed_total = time.perf_counter() - tic
+
+    if success:
+        logger.info(f"Success. Time elapsed: {elapsed_total:.2f}s")
+    else:
+        logger.info(f"Fail. Time elapsed: {elapsed_total:.2f}s")
+
+    # Print summary
+    logger.info(f"\n{'='*60}")
+    logger.info(f"Test Summary: {len(passed_tests)}/{len(files)} passed")
+    if enable_retry and retried_tests:
+        logger.info(f"Retries: {len(retried_tests)} test(s) were retried")
+    logger.info(f"{'='*60}")
+    if passed_tests:
+        logger.info("PASSED:")
+        for test in passed_tests:
+            logger.info(f"  {test}")
+    if failed_tests:
+        logger.info("\nFAILED:")
+        for test, reason in failed_tests:
+            logger.info(f"  {test} ({reason})")
+    if retried_tests:
+        logger.info("\nRETRIED:")
+        for test, attempts, result in retried_tests:
+            logger.info(f"  {test} ({attempts} attempts, {result})")
+    logger.info(f"{'='*60}\n")
+
+    # Write GitHub Step Summary only if retries occurred
+    if retried_tests:
+        passed_on_retry = [t for t, _, r in retried_tests if r == "passed"]
+        failed_after_retry = [t for t, _, r in retried_tests if r != "passed"]
+        summary = f"**Retried {len(retried_tests)} test(s):**\n"
+        if passed_on_retry:
+            summary += f"- Passed on retry: {', '.join(passed_on_retry)}\n"
+        if failed_after_retry:
+            summary += f"- Still failed: {', '.join(failed_after_retry)}\n"
+        write_github_step_summary(summary)
+
+    return 0 if success else -1
diff --git a/tests/ci/cpu_stubs/pyproject.toml b/tests/ci/cpu_stubs/pyproject.toml
new file mode 100644
index 00000000..0efc1aea
--- /dev/null
+++ b/tests/ci/cpu_stubs/pyproject.toml
@@ -0,0 +1,11 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "sgl-kernel-stub"
+version = "0.0.0"
+description = "Stub sgl_kernel package for CPU-only CI (real sgl_kernel is GPU-only)"
+
+[tool.setuptools.packages.find]
+include = ["sgl_kernel*"]
diff --git a/tests/ci/cpu_stubs/sgl_kernel/__init__.py b/tests/ci/cpu_stubs/sgl_kernel/__init__.py
new file mode 100644
index 00000000..ffe79759
--- /dev/null
+++ b/tests/ci/cpu_stubs/sgl_kernel/__init__.py
@@ -0,0 +1,18 @@
+"""Stub sgl_kernel for CPU-only CI.
+
+The real sgl_kernel ships CUDA kernels and cannot be installed on an
+ubuntu-latest CPU runner. miles' import chain loads sglang modules that
+unconditionally `from sgl_kernel... import ...` at module load time (gated
+only on NPU/XPU/MPS, not CPU). Any attribute access on this stub returns a
+MagicMock so imports succeed; if stub symbols are actually *called* at test
+runtime, the MagicMock will return further MagicMocks — and if a test
+depends on real kernel behavior it will fail loudly, which is correct.
+"""
+
+from unittest.mock import MagicMock
+
+
+def __getattr__(name: str):
+    if name.startswith("__"):
+        raise AttributeError(name)
+    return MagicMock(name=f"sgl_kernel.{name}")
diff --git a/tests/ci/cpu_stubs/sgl_kernel/kvcacheio.py b/tests/ci/cpu_stubs/sgl_kernel/kvcacheio.py
new file mode 100644
index 00000000..21020752
--- /dev/null
+++ b/tests/ci/cpu_stubs/sgl_kernel/kvcacheio.py
@@ -0,0 +1,9 @@
+"""Stub sgl_kernel.kvcacheio — see sgl_kernel/__init__.py for rationale."""
+
+from unittest.mock import MagicMock
+
+
+def __getattr__(name: str):
+    if name.startswith("__"):
+        raise AttributeError(name)
+    return MagicMock(name=f"sgl_kernel.kvcacheio.{name}")
diff --git a/tests/ci/labels.py b/tests/ci/labels.py
new file mode 100644
index 00000000..5af31dc4
--- /dev/null
+++ b/tests/ci/labels.py
@@ -0,0 +1,27 @@
+"""Canonical CI label registry.
+
+Tests declare a domain label set in `register_cuda_ci(..., labels=[...])` and
+`register_cpu_ci(..., labels=[...])`. The PR-side trigger for each label is
+`run-ci-<key>`: each entry below MUST have a matching `run-ci-<key>` label in
+the GitHub repo (maintainer-managed).
+
+Adding a new label:
+1) Add an entry below.
+2) Create the matching `run-ci-<key>` label in GitHub repo Settings -> Labels.
+   The workflow does not need editing -- the generic stage job filters tests
+   by labels at runtime.
+
+The meta-labels `run-ci-image` / `run-ci-all` are intentionally NOT listed
+here: they bypass the per-test labels filter and run the full suite via the
+`--match-all-labels` flag (handled in run_suite.py).
+"""
+
+KNOWN_LABELS: dict[str, str] = {
+    "sglang-diffusion": "sglang_diffusion_utils engine / monkey patch tests",
+    "fsdp": "FSDP backend + config tests",
+    "rollout": "Rollout sampling / filter / strategy tests",
+    "ray": "Ray actor / placement_group tests",
+    "router": "Router routing decision tests",
+    "arguments": "Top-level argparse / validate_args tests",
+    "model-scripts": "train_diffusion.py + scripts/*.sh smoke tests",
+}
diff --git a/tests/ci/run_suite.py b/tests/ci/run_suite.py
new file mode 100644
index 00000000..cffb0dbf
--- /dev/null
+++ b/tests/ci/run_suite.py
@@ -0,0 +1,357 @@
+import argparse
+import glob
+import subprocess
+import sys
+import warnings
+from collections.abc import Iterable
+from pathlib import Path
+
+from tests.ci.ci_register import CIRegistry, HWBackend, collect_tests
+from tests.ci.ci_utils import run_unittest_files
+
+HW_MAPPING = {
+    "cpu": HWBackend.CPU,
+    "cuda": HWBackend.CUDA,
+}
+
+# PR-side label prefix the workflow attaches to every domain label and passes
+# verbatim to `--labels`. Stripping is done here (not in YAML) so the filter
+# is unit-testable and the workflow stays a thin pass-through.
+_RUN_CI_PREFIX = "run-ci-"
+
+# Per-commit test suites (run on every PR; per-domain selection is done at
+# runtime by `filter_tests` via the `--labels` arg, not via per-suite jobs).
+#
+# CUDA suites: empty for now — miles-D does not yet have self-hosted GPU
+# runners. Add stage-c-* entries here when the GPU fleet is provisioned and
+# matching jobs are added to .github/workflows/pr-test.yml.
+PER_COMMIT_SUITES = {
+    HWBackend.CPU: [
+        "stage-a-cpu",
+        "stage-b-cpu",
+    ],
+    HWBackend.CUDA: [],
+}
+
+# Nightly test suites (placeholder for future use)
+NIGHTLY_SUITES = {
+    HWBackend.CUDA: [],
+}
+
+
+def strip_run_ci_prefix(raw_labels: Iterable[str]) -> set[str]:
+    """Strip the `run-ci-` prefix from each PR-side label.
+
+    Inputs come straight from the workflow (e.g. `["run-ci-megatron",
+    "run-ci-fsdp"]`). Empty input yields an empty set. Items missing the
+    `run-ci-` prefix are skipped after emitting a `warnings.warn(...)` --
+    the workflow contract requires every passed label to be a raw
+    `run-ci-<X>` string, and silently including a non-prefixed item would
+    risk matching the wrong domain label (e.g. bare `"megatron"` colliding
+    with a test's domain label by accident).
+    """
+    stripped: set[str] = set()
+    for raw in raw_labels:
+        if not raw:
+            continue
+        if raw.startswith(_RUN_CI_PREFIX):
+            stripped.add(raw[len(_RUN_CI_PREFIX) :])
+        else:
+            warnings.warn(
+                f"--labels entry {raw!r} is missing the expected {_RUN_CI_PREFIX!r} "
+                f"prefix; ignoring. The workflow must pass raw `run-ci-<X>` labels.",
+                stacklevel=2,
+            )
+    return stripped
+
+
+def filter_tests(
+    ci_tests: list[CIRegistry],
+    hw: HWBackend,
+    suite: str,
+    nightly: bool = False,
+    labels: set[str] | None = None,
+    match_all_labels: bool = False,
+) -> tuple[list[CIRegistry], list[CIRegistry]]:
+    """Filter registered tests down to the set that should run.
+
+    The base predicate (hw / suite / nightly / disabled) is applied first.
+    Label selection then narrows further, with two modes:
+
+    * `match_all_labels=True`: ignore labels entirely -- every enabled test
+      that matches hw/suite/nightly runs. Used for the `run-ci-image` /
+      `run-ci-all` meta-labels and for `workflow_dispatch`. Precedence: this
+      mode wins even when `labels` is also passed.
+    * `match_all_labels=False` (default): include only tests where
+      `not test.labels or (set(test.labels) & labels)`. `labels` here is
+      the already-stripped domain-label set produced by
+      `strip_run_ci_prefix`. A test registered with `labels=[]` (or
+      omitted) is treated as always-run: it survives an empty PR-label
+      set; a test with non-empty `labels` survives only when its labels
+      intersect the PR-supplied set.
+    """
+    ci_tests = [t for t in ci_tests if t.backend == hw and t.suite == suite and t.nightly == nightly]
+
+    valid_suites = NIGHTLY_SUITES.get(hw, []) if nightly else PER_COMMIT_SUITES.get(hw, [])
+
+    if suite not in valid_suites:
+        print(f"Warning: Unknown suite {suite} for backend {hw.name}, nightly={nightly}")
+
+    if not match_all_labels:
+        label_set: set[str] = labels or set()
+        ci_tests = [t for t in ci_tests if not t.labels or (set(t.labels) & label_set)]
+
+    enabled_tests = [t for t in ci_tests if t.disabled is None]
+    skipped_tests = [t for t in ci_tests if t.disabled is not None]
+
+    return enabled_tests, skipped_tests
+
+
+def auto_partition(files: list[CIRegistry], rank, size):
+    """
+    Partition files into size sublists with approximately equal sums of estimated times
+    using a greedy algorithm (LPT heuristic), and return the partition for the specified rank.
+    """
+    if not files or size <= 0:
+        return []
+
+    # Sort files by estimated_time in descending order (LPT heuristic).
+    # Use filename as tie-breaker to ensure deterministic partitioning
+    # regardless of glob ordering.
+    sorted_files = sorted(files, key=lambda f: (-f.est_time, f.filename))
+
+    partitions = [[] for _ in range(size)]
+    partition_sums = [0.0] * size
+
+    # Greedily assign each file to the partition with the smallest current total time
+    for file in sorted_files:
+        min_sum_idx = min(range(size), key=partition_sums.__getitem__)
+        partitions[min_sum_idx].append(file)
+        partition_sums[min_sum_idx] += file.est_time
+
+    if rank < size:
+        return partitions[rank]
+    return []
+
+
+def _is_e2e_discovery_file(filename: str) -> bool:
+    basename = Path(filename).name
+    return (
+        basename != "conftest.py"
+        and basename != "__init__.py"
+        and not basename.startswith("_")
+        and not filename.endswith(".gitkeep")
+        # Exclude helper modules that aren't test files
+        and "/sglang_patch/sglang_server.py" not in filename
+        and "/sglang/utils/" not in filename
+        and "short/test_dumper.py" not in filename
+    )
+
+
+def pretty_print_tests(args, ci_tests: list[CIRegistry], skipped_tests: list[CIRegistry]):
+    hw = HW_MAPPING[args.hw]
+    suite = args.suite
+    nightly = args.nightly
+    if args.auto_partition_size:
+        partition_info = (
+            f"{args.auto_partition_id + 1}/{args.auto_partition_size} " f"(0-based id={args.auto_partition_id})"
+        )
+    else:
+        partition_info = "full"
+
+    msg = f"\n{'='*60}\n"
+    msg += f"Hardware: {hw.name}  Suite: {suite}  Nightly: {nightly}  Partition: {partition_info}\n"
+    msg += f"{'='*60}\n"
+
+    if skipped_tests:
+        msg += f"Skipped {len(skipped_tests)} test(s):\n"
+        for t in skipped_tests:
+            reason = t.disabled or "disabled"
+            msg += f"  - {t.filename} (reason: {reason})\n"
+        msg += "\n"
+
+    if len(ci_tests) == 0:
+        msg += f"No tests found for hw={hw.name}, suite={suite}, nightly={nightly}\n"
+        msg += "This is expected during incremental migration. Skipping.\n"
+    else:
+        total_est_time = sum(t.est_time for t in ci_tests)
+        msg += f"Enabled {len(ci_tests)} test(s) (est total {total_est_time:.0f}s):\n"
+        for t in ci_tests:
+            msg += f"  - {t.filename} (est_time={t.est_time}s)\n"
+
+    print(msg, flush=True)
+
+
+def run_a_suite(args):
+    hw = HW_MAPPING[args.hw]
+    suite = args.suite
+    nightly = args.nightly
+    auto_partition_id = args.auto_partition_id
+    auto_partition_size = args.auto_partition_size
+
+    # Discover test files: e2e/ for CUDA, fast/ for CPU
+    e2e_files = [f for f in glob.glob("tests/e2e/**/*.py", recursive=True) if _is_e2e_discovery_file(f)]
+    fast_files = [
+        f
+        for f in glob.glob("tests/fast/**/*.py", recursive=True)
+        if "/test_" in f
+        and not f.endswith("/conftest.py")
+        and not f.endswith("/__init__.py")
+        and not f.endswith("/utils.py")
+    ] + glob.glob("tests/utils/test_*.py")
+    files = e2e_files + fast_files
+
+    all_tests = collect_tests(files, sanity_check=False)
+    stripped_labels = strip_run_ci_prefix(args.labels or [])
+    ci_tests, skipped_tests = filter_tests(
+        all_tests,
+        hw,
+        suite,
+        nightly,
+        labels=stripped_labels,
+        match_all_labels=args.match_all_labels,
+    )
+
+    if auto_partition_size:
+        ci_tests = auto_partition(ci_tests, auto_partition_id, auto_partition_size)
+
+    pretty_print_tests(args, ci_tests, skipped_tests)
+
+    if len(ci_tests) == 0:
+        print("No tests to run. Exiting with success.", flush=True)
+        return 0
+
+    if args.list_only:
+        return 0
+
+    # CPU tests (fast/) use pytest; CUDA tests use python3 per-file
+    if hw == HWBackend.CPU:
+        cmd = ["pytest"] + [t.filename for t in ci_tests] + ["-x", "-v"]
+        print(f"Running: {' '.join(cmd)}", flush=True)
+        return subprocess.call(cmd)
+
+    # Add extra timeout when retry is enabled
+    timeout = args.timeout_per_file
+    if args.enable_retry:
+        timeout += args.retry_timeout_increase
+
+    return run_unittest_files(
+        ci_tests,
+        timeout_per_file=timeout,
+        continue_on_error=args.continue_on_error,
+        enable_retry=args.enable_retry,
+        max_attempts=args.max_attempts,
+        retry_wait_seconds=args.retry_wait_seconds,
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run CI test suites from tests/e2e/")
+    parser.add_argument(
+        "--hw",
+        type=str,
+        choices=HW_MAPPING.keys(),
+        required=True,
+        help="Hardware backend to run tests on.",
+    )
+    parser.add_argument("--suite", type=str, required=True, help="Test suite to run.")
+    parser.add_argument(
+        "--nightly",
+        action="store_true",
+        help="Run nightly tests instead of per-commit tests.",
+    )
+    parser.add_argument(
+        "--timeout-per-file",
+        type=int,
+        default=1800,
+        help="The time limit for running one file in seconds (default: 1800).",
+    )
+    parser.add_argument(
+        "--continue-on-error",
+        action="store_true",
+        default=False,
+        help="Continue running remaining tests even if one fails.",
+    )
+    parser.add_argument(
+        "--auto-partition-id",
+        type=int,
+        help="Use auto load balancing. The part id.",
+    )
+    parser.add_argument(
+        "--auto-partition-size",
+        type=int,
+        help="Use auto load balancing. The number of parts.",
+    )
+    parser.add_argument(
+        "--enable-retry",
+        action="store_true",
+        default=False,
+        help="Enable smart retry for accuracy/performance assertion failures.",
+    )
+    parser.add_argument(
+        "--max-attempts",
+        type=int,
+        default=2,
+        help="Maximum number of attempts per file including initial run (default: 2).",
+    )
+    parser.add_argument(
+        "--retry-wait-seconds",
+        type=int,
+        default=60,
+        help="Seconds to wait between retries (default: 60).",
+    )
+    parser.add_argument(
+        "--retry-timeout-increase",
+        type=int,
+        default=600,
+        help="Additional timeout in seconds when retry is enabled (default: 600).",
+    )
+    parser.add_argument(
+        "--list-only",
+        action="store_true",
+        default=False,
+        help="Only list tests that would be run, do not execute them.",
+    )
+    parser.add_argument(
+        "--labels",
+        nargs="*",
+        default=[],
+        help=(
+            "Raw PR-side labels (e.g. `run-ci-megatron run-ci-fsdp`). The "
+            "`run-ci-` prefix is stripped on the Python side; the resulting "
+            "domain-label set is intersected with each test's `labels` to "
+            "decide what runs. An empty list keeps only `always_on=True` "
+            "tests for the suite."
+        ),
+    )
+    parser.add_argument(
+        "--match-all-labels",
+        action="store_true",
+        default=False,
+        help=(
+            "Bypass the labels filter and run every enabled test in the "
+            "suite (subject to hw/suite/nightly/disabled). Set by the "
+            "workflow when the PR carries `run-ci-image` or `run-ci-all`, "
+            "and equivalently on `workflow_dispatch`."
+        ),
+    )
+    args = parser.parse_args()
+
+    # Validate auto-partition arguments
+    if (args.auto_partition_id is not None) != (args.auto_partition_size is not None):
+        parser.error("--auto-partition-id and --auto-partition-size must be specified together.")
+    if args.auto_partition_size is not None:
+        if args.auto_partition_size <= 0:
+            parser.error("--auto-partition-size must be positive.")
+        if not 0 <= args.auto_partition_id < args.auto_partition_size:
+            parser.error(
+                f"--auto-partition-id must be in range [0, {args.auto_partition_size}), "
+                f"but got {args.auto_partition_id}"
+            )
+
+    exit_code = run_a_suite(args)
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/fast/__init__.py b/tests/fast/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/fast/utils/__init__.py b/tests/fast/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/fast/utils/test_misc.py b/tests/fast/utils/test_misc.py
new file mode 100644
index 00000000..d23ef7c7
--- /dev/null
+++ b/tests/fast/utils/test_misc.py
@@ -0,0 +1,84 @@
+from tests.ci.ci_register import register_cpu_ci
+
+register_cpu_ci(est_time=60, suite="stage-a-cpu", labels=[])  # CPU CI marker; tests/ci finds it via AST parsing
+
+import os
+
+import pytest
+
+from miles.utils.misc import FunctionRegistry, function_registry, load_function, should_run_periodic_action
+
+
+def _fn_a() -> str:  # sentinel callable; the tests below compare it by identity
+    return "a"
+
+
+def _fn_b() -> str:  # second, distinct sentinel used for duplicate/override cases
+    return "b"
+
+
+class TestFunctionRegistry:  # every test builds its own FunctionRegistry instance
+    def test_register_and_get(self) -> None:
+        registry = FunctionRegistry()
+        with registry.temporary("my_fn", _fn_a):
+            assert registry.get("my_fn") is _fn_a  # lookup must return the registered object itself
+
+    def test_register_duplicate_raises(self) -> None:
+        registry = FunctionRegistry()
+        with registry.temporary("my_fn", _fn_a):
+            with pytest.raises(AssertionError):  # re-registering a live name is expected to assert
+                with registry.temporary("my_fn", _fn_b):
+                    pass
+
+    def test_unregister(self) -> None:
+        registry = FunctionRegistry()
+        with registry.temporary("my_fn", _fn_a):
+            assert registry.get("my_fn") is _fn_a
+        assert registry.get("my_fn") is None  # leaving the with-block must remove the entry
+
+    def test_temporary_cleanup_on_exception(self) -> None:
+        registry = FunctionRegistry()
+        with pytest.raises(RuntimeError):
+            with registry.temporary("temp_fn", _fn_a):
+                raise RuntimeError("test")
+        assert registry.get("temp_fn") is None  # entry must be removed even when the body raises
+
+
+class TestLoadFunction:  # registry cases go through the imported `function_registry` object
+    def test_load_from_module(self) -> None:
+        import os.path
+
+        assert load_function("os.path.join") is os.path.join  # dotted name yields the real attribute
+
+    def test_load_none_returns_none(self) -> None:
+        assert load_function(None) is None
+
+    def test_load_from_registry(self) -> None:
+        with function_registry.temporary("test:my_fn", _fn_a):
+            assert load_function("test:my_fn") is _fn_a  # presumably resolvable only via the registry
+
+    def test_registry_takes_precedence(self) -> None:
+        with function_registry.temporary("os.path.join", _fn_b):
+            assert load_function("os.path.join") is _fn_b  # registry entry shadows the importable name
+        assert load_function("os.path.join") is os.path.join  # shadowing ends with the with-block
+
+
+class TestShouldRunPeriodicAction:  # black-box checks on should_run_periodic_action outputs
+    def test_interval_none_never_runs(self) -> None:
+        for rid in (0, 1, 5, 99):
+            assert should_run_periodic_action(rid, interval=None) is False
+
+    def test_last_rollout_always_runs(self) -> None:
+        assert should_run_periodic_action(rollout_id=9, interval=4, num_rollout=10) is True  # 9 == num_rollout - 1
+
+    def test_interval_boundary(self) -> None:
+        assert should_run_periodic_action(rollout_id=3, interval=4) is True  # (3 + 1) % 4 == 0
+        assert should_run_periodic_action(rollout_id=2, interval=4) is False  # (2 + 1) % 4 != 0
+        assert should_run_periodic_action(rollout_id=7, interval=4) is True  # (7 + 1) % 4 == 0
+
+    def test_epoch_boundary_triggers(self) -> None:
+        assert should_run_periodic_action(rollout_id=4, interval=10, num_rollout_per_epoch=5) is True
+
+    @pytest.mark.parametrize("rid", [0, 1, 2])
+    def test_no_trigger_at_small_steps(self, rid: int) -> None:
+        assert should_run_periodic_action(rollout_id=rid, interval=100, num_rollout=1000) is False

From a255158da0f9cc72575ae73a0bd1da3f635da26b Mon Sep 17 00:00:00 2001
From: rockdu <kangruidu@gmail.com>
Date: Tue, 9 Jun 2026 23:50:35 -0700
Subject: [PATCH 2/2] chore: apply pre-commit lint to upstream miles/ +
 train_diffusion.py

Pure lint output from pre-commit on upstream/main miles/ files. Hooks
applied: ruff (E/F/B/UP with --unsafe-fixes for the 6 F841 cases),
autoflake, isort (black profile), black (line-length=119).

Plus 2 manual fixes for issues that ruff flagged but couldn't auto-resolve:
- qwen_image.py B023: bound `theta` via default arg `theta: float = theta`
  on the closure `_params` to capture the current loop iteration value
- qwen_image.py B007: renamed unused loop var `L` to `_L`

Plus 1 manual fix for local py3.9 / py3.10 black gap:
- loss.py: added blank line after import block (CI py3.10 black would
  apply this; local py3.9 black --safe can't parse the match-case at
  line 824 and skips the file)

flow_grpo/ excluded by all formatter hooks (see preceding ci commit).
Verified pre-commit run --all-files is now idempotent (0 modifications
on a second run).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 miles/backends/fsdp_utils/actor.py            | 99 +++++++++----------
 .../backends/fsdp_utils/configs/qwen_image.py | 55 ++++++-----
 .../configs/train_pipeline_config.py          | 12 +--
 .../diffusion_update_weight_utils.py          | 34 ++-----
 .../patch_layernorm_scale_shift.py            |  6 +-
 .../monkey_patches/patch_mul_add.py           |  4 +-
 .../monkey_patches/patch_rmsnorm.py           | 10 +-
 .../patch_scale_residual_layernorm.py         |  9 +-
 .../sglang_diffusion_engine.py                | 18 ++--
 miles/backends/training_utils/data.py         |  2 +-
 miles/backends/training_utils/log_utils.py    |  2 +-
 miles/backends/training_utils/loss.py         |  1 +
 miles/ray/actor_group.py                      |  2 -
 miles/ray/placement_group.py                  |  4 +-
 miles/ray/rollout.py                          | 12 +--
 miles/rollout/rm_hub/ocr.py                   | 44 +++++----
 miles/rollout/sglang_diffusion_rollout.py     | 45 ++++-----
 miles/rollout/step_strategy_hub.py            |  1 +
 miles/utils/arguments.py                      | 43 ++++----
 miles/utils/diffusion_protocol.py             | 10 +-
 miles/utils/diffusion_rollout_response.py     |  8 +-
 miles/utils/sde_log_prob.py                   |  3 +-
 miles/utils/types.py                          |  5 +-
 train_diffusion.py                            |  2 +-
 24 files changed, 203 insertions(+), 228 deletions(-)

diff --git a/miles/backends/fsdp_utils/actor.py b/miles/backends/fsdp_utils/actor.py
index f04b3525..103a5b2e 100644
--- a/miles/backends/fsdp_utils/actor.py
+++ b/miles/backends/fsdp_utils/actor.py
@@ -1,5 +1,4 @@
 import logging
-import os
 from argparse import Namespace
 from collections import defaultdict
 from contextlib import nullcontext
@@ -9,29 +8,28 @@
 import torch.distributed as dist
 from diffusers import DiffusionPipeline
 
+import miles.backends.fsdp_utils.configs.qwen_image  # noqa: F401 — register pipeline config
+import miles.backends.fsdp_utils.configs.sd3  # noqa: F401 — register pipeline config
 from miles.ray.train_actor import TrainRayActor
+from miles.utils import tracking_utils, train_metric_utils
 from miles.utils.context_utils import with_defer
-from miles.utils import train_metric_utils
 from miles.utils.distributed_utils import get_gloo_group
 from miles.utils.memory_utils import clear_memory, print_memory
 from miles.utils.metric_utils import compute_rollout_step
+from miles.utils.profile_utils import TrainProfiler
 from miles.utils.sde_log_prob import sde_step_with_logprob
 from miles.utils.timer import Timer, inverse_timer, timer
 from miles.utils.tracking_utils import init_tracking
-from miles.utils import tracking_utils
-from miles.utils.profile_utils import TrainProfiler
-
-from .configs.train_pipeline_config import get_train_pipeline_config
-import miles.backends.fsdp_utils.configs.qwen_image  # noqa: F401 — register pipeline config
-import miles.backends.fsdp_utils.configs.sd3  # noqa: F401 — register pipeline config
 
 from . import checkpoint
+from .configs.train_pipeline_config import get_train_pipeline_config
+from .diffusion_update_weight_utils import DiffusionUpdateWeightFromTensor, DiffusionUpdateWeightFromTensorLoRA
 from .lr_scheduler import get_lr_scheduler
 from .parallel import create_fsdp_parallel_state
-from .diffusion_update_weight_utils import DiffusionUpdateWeightFromTensor, DiffusionUpdateWeightFromTensorLoRA
 
 logger = logging.getLogger(__name__)
 
+
 class FSDPTrainRayActor(TrainRayActor):
     """FSDP training actor for diffusion GRPO.
 
@@ -238,7 +236,11 @@ def _gather_and_log_metrics(self, rollout_id: int, log_dict: dict[str, float], s
 
             logger.info(
                 f"[train step {int(step)}] rollout={rollout_id} "
-                + " ".join(f"{k}={v:.6e}" for k, v in sorted(reduced.items()) if k not in ("train/epoch", "rollout/step", "train/step"))
+                + " ".join(
+                    f"{k}={v:.6e}"
+                    for k, v in sorted(reduced.items())
+                    if k not in ("train/epoch", "rollout/step", "train/step")
+                )
             )
         else:
             dist.gather_object(
@@ -302,7 +304,9 @@ def _train_core(self, rollout_id: int, rollout_data) -> None:
         # ------------- KL loss -------------
         kl_beta = float(self.args.diffusion_kl_beta)
         if kl_beta > 0 and not self.args.use_lora:
-            raise ValueError("--diffusion-kl-beta currently requires --use-lora so the base model can be used as reference.")
+            raise ValueError(
+                "--diffusion-kl-beta currently requires --use-lora so the base model can be used as reference."
+            )
         if kl_beta > 0 and not hasattr(self.model, "disable_adapter"):
             raise RuntimeError("Diffusion KL requires a PEFT model exposing disable_adapter() after FSDP wrapping.")
 
@@ -365,15 +369,15 @@ def _train_core(self, rollout_id: int, rollout_data) -> None:
                 num_samples_in_window = grids["num_samples_in_window"]
                 sde_window_size = grids["sde_window_size"]
                 sample_microbatch = min(
-                    self.args.micro_batch_size_sample
-                    if self.args.micro_batch_size_sample is not None
-                    else num_samples_in_window,
+                    (
+                        self.args.micro_batch_size_sample
+                        if self.args.micro_batch_size_sample is not None
+                        else num_samples_in_window
+                    ),
                     num_samples_in_window,
                 )
                 tstep_microbatch = min(
-                    self.args.micro_batch_size_tstep
-                    if self.args.micro_batch_size_tstep is not None
-                    else 1,
+                    self.args.micro_batch_size_tstep if self.args.micro_batch_size_tstep is not None else 1,
                     sde_window_size,
                 )
 
@@ -452,7 +456,6 @@ def _build_train_grids(
             log_prob_old = rollout_log_probs_list[traj_idx].to(device, dtype=torch.float32)
             advantage = advantages[traj_idx]
 
-            
             rollout_dbg = rollout_debug_tensors_list[traj_idx] if rollout_debug_tensors_list else None
             rollout_model_output = (
                 rollout_dbg.rollout_model_outputs.to(device, dtype=torch.float32)
@@ -505,8 +508,7 @@ def _build_train_grids(
         # then doesn't need to override collate_cond_for_sample_batch.
         num_samples_in_window = int(traj_end - traj_start)
         needs_multi_sample_tile = (
-            self.args.micro_batch_size_sample is None
-            or self.args.micro_batch_size_sample > 1
+            self.args.micro_batch_size_sample is None or self.args.micro_batch_size_sample > 1
         ) and num_samples_in_window > 1
 
         if not needs_multi_sample_tile:
@@ -516,9 +518,7 @@ def _build_train_grids(
                 positive_cond_kwargs_list + negative_cond_kwargs_list, device
             )
         else:
-            cond_collated = train_pipeline_config.collate_cond_for_sample_batch(
-                positive_cond_kwargs_list, device
-            )
+            cond_collated = train_pipeline_config.collate_cond_for_sample_batch(positive_cond_kwargs_list, device)
 
         return {
             "latents": latents_window,
@@ -619,9 +619,7 @@ def _forward_tile(
         log_prob_old_tile = grids["log_prob_old"][sample_indices][:, tstep_indices]
         advantage_tile = grids["advantage"][sample_indices][:, tstep_indices]
 
-        latents_flat = latents_tile.reshape(
-            tile_sample_count * tile_tstep_count, *latents_tile.shape[2:]
-        )
+        latents_flat = latents_tile.reshape(tile_sample_count * tile_tstep_count, *latents_tile.shape[2:])
         timesteps_flat = timesteps_tile.reshape(tile_sample_count * tile_tstep_count)
 
         # sgl-d's Qwen DiT divides timestep by num_train_timesteps inside
@@ -639,9 +637,7 @@ def _forward_tile(
                 grids["per_sample_pos_cond"][s], tile_tstep_count
             )
             neg_cond_tile = (
-                train_pipeline_config.expand_cond_for_timestep_batch(
-                    grids["per_sample_neg_cond"][s], tile_tstep_count
-                )
+                train_pipeline_config.expand_cond_for_timestep_batch(grids["per_sample_neg_cond"][s], tile_tstep_count)
                 if use_cfg
                 else None
             )
@@ -697,7 +693,10 @@ def _compute_noise_pred(disable_adapter: bool = False) -> torch.Tensor:
                     noise_pred_pos = _forward(pos_cond_tile)
                     noise_pred_neg = _forward(neg_cond_tile)
                 return train_pipeline_config.cfg_combine(
-                    noise_pred_pos, noise_pred_neg, guidance_scale, true_cfg_scale=true_cfg_scale,
+                    noise_pred_pos,
+                    noise_pred_neg,
+                    guidance_scale,
+                    true_cfg_scale=true_cfg_scale,
                 )
 
         noise_pred_flat = _compute_noise_pred()
@@ -749,12 +748,8 @@ def _compute_noise_pred(disable_adapter: bool = False) -> torch.Tensor:
             log_stats["loss_abs_mean"].append(per_cell_loss.abs().mean().detach())
             log_stats["adv_abs_mean"].append(advantage_tile.abs().mean().detach())
             log_stats["ratio_abs_minus_1"].append((ratio - 1.0).abs().mean().detach())
-            log_stats["approx_kl"].append(
-                0.5 * torch.mean((log_prob_new - log_prob_old_tile) ** 2).detach()
-            )
-            log_stats["clipfrac"].append(
-                torch.mean((torch.abs(ratio - 1.0) > clip_range).float()).detach()
-            )
+            log_stats["approx_kl"].append(0.5 * torch.mean((log_prob_new - log_prob_old_tile) ** 2).detach())
+            log_stats["clipfrac"].append(torch.mean((torch.abs(ratio - 1.0) > clip_range).float()).detach())
             log_stats["log_prob_new_idx_0"].append(log_prob_new[0, 0].detach())
             log_stats["log_prob_old_idx_0"].append(log_prob_old_tile[0, 0].detach())
             log_stats["log_prob_mean_abs_diff"].append(
@@ -802,6 +797,7 @@ def _tile_collated_cond(
     For CFG the input packs [pos_M | neg_M] along batch=2*num_samples_in_window;
     pos and neg halves are extracted separately, the latter via offset
     `+ num_samples_in_window`. Returns (pos, None) when use_cfg is False."""
+
     def _tile_value(value, rows: torch.Tensor):
         if isinstance(value, torch.Tensor):
             return value.index_select(0, rows).repeat_interleave(tile_tstep_count, dim=0)
@@ -835,7 +831,7 @@ def _pack_cond_for_joint_cfg(pos: dict, neg: dict) -> dict:
 def _cast_cond_to_dtype(cond: dict, dtype: torch.dtype) -> dict:
     """Cast floating-point tensors to the model's compute dtype; leave bool
     masks / int / list / scalar values untouched. The bool
-    encoder_hidden_states_mask must NOT be cast. 
+    encoder_hidden_states_mask must NOT be cast.
     """
     out: dict = {}
     for k, v in cond.items():
@@ -865,6 +861,7 @@ def move_torch_optimizer(optimizer, device):
 def _resolve_dtype(name: str) -> torch.dtype:
     return {"fp32": torch.float32, "bf16": torch.bfloat16, "fp16": torch.float16}[name]
 
+
 def apply_lora(model: torch.nn.Module, args: Namespace, train_pipeline_config) -> None:
     """Apply PEFT LoRA to the model.
 
@@ -879,17 +876,21 @@ def apply_lora(model: torch.nn.Module, args: Namespace, train_pipeline_config) -
     targets = args.lora_target_modules or train_pipeline_config.lora_target_modules
     init_lora_weight = args.diffusion_init_lora_weight
     if init_lora_weight == "kaiming-uniform":
-        init_lora_weight = True # namely kaiming-uniform
-    model = get_peft_model(model, LoraConfig(
-        r=args.lora_rank,
-        lora_alpha=args.lora_alpha,
-        target_modules=targets,
-        init_lora_weights=init_lora_weight,
-    ))
+        init_lora_weight = True  # namely kaiming-uniform
+    model = get_peft_model(
+        model,
+        LoraConfig(
+            r=args.lora_rank,
+            lora_alpha=args.lora_alpha,
+            target_modules=targets,
+            init_lora_weights=init_lora_weight,
+        ),
+    )
     if dist.get_rank() == 0:
         model.print_trainable_parameters()
     return model
 
+
 def apply_fsdp2(model, mesh=None, cpu_offload=False, args=None):
     from torch.distributed.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy, fully_shard
 
@@ -898,15 +899,13 @@ def apply_fsdp2(model, mesh=None, cpu_offload=False, args=None):
     layer_cls_to_wrap = model._no_split_modules
     assert len(layer_cls_to_wrap) > 0 and layer_cls_to_wrap[0] is not None
 
-    modules = [
-        module
-        for name, module in model.named_modules()
-        if module.__class__.__name__ in layer_cls_to_wrap
-    ]
+    modules = [module for name, module in model.named_modules() if module.__class__.__name__ in layer_cls_to_wrap]
 
     param_dtype = _resolve_dtype(args.diffusion_forward_dtype)
     reduce_dtype = _resolve_dtype(args.fsdp_reduce_dtype)
-    logger.info(f"FSDP: wrapping {len(modules)} modules of type {layer_cls_to_wrap}, param_dtype={param_dtype}, reduce_dtype={reduce_dtype}")
+    logger.info(
+        f"FSDP: wrapping {len(modules)} modules of type {layer_cls_to_wrap}, param_dtype={param_dtype}, reduce_dtype={reduce_dtype}"
+    )
 
     fsdp_kwargs = {
         "mp_policy": MixedPrecisionPolicy(
diff --git a/miles/backends/fsdp_utils/configs/qwen_image.py b/miles/backends/fsdp_utils/configs/qwen_image.py
index 40a81895..aa92bae4 100644
--- a/miles/backends/fsdp_utils/configs/qwen_image.py
+++ b/miles/backends/fsdp_utils/configs/qwen_image.py
@@ -8,6 +8,7 @@
 
 from .train_pipeline_config import TrainPipelineConfig, register_train_pipeline_config
 
+
 def _rebuild_pos_embed_freqs_on_cuda(model) -> None:
     """Rebuild QwenEmbedRope ``pos_freqs`` / ``neg_freqs`` on the model's
     CUDA device so train-side (diffusers) matches rollout-side (sglang-d)
@@ -40,7 +41,7 @@ def _rebuild_pos_embed_freqs_on_cuda(model) -> None:
             continue
         theta = submod.theta
 
-        def _params(index: torch.Tensor, dim: int) -> torch.Tensor:
+        def _params(index: torch.Tensor, dim: int, theta: float = theta) -> torch.Tensor:
             inv = 1.0 / torch.pow(
                 theta,
                 torch.arange(0, dim, 2, device=device).to(torch.float32).div(dim),
@@ -50,12 +51,8 @@ def _params(index: torch.Tensor, dim: int) -> torch.Tensor:
 
         pos_idx = torch.arange(4096, device=device)
         neg_idx = torch.arange(4096, device=device).flip(0) * -1 - 1
-        submod.pos_freqs = torch.cat(
-            [_params(pos_idx, d) for d in submod.axes_dim], dim=1
-        )
-        submod.neg_freqs = torch.cat(
-            [_params(neg_idx, d) for d in submod.axes_dim], dim=1
-        )
+        submod.pos_freqs = torch.cat([_params(pos_idx, d) for d in submod.axes_dim], dim=1)
+        submod.neg_freqs = torch.cat([_params(neg_idx, d) for d in submod.axes_dim], dim=1)
         # clear @lru_cache function
         cvf = getattr(submod, "_compute_video_freqs", None)
         if cvf is not None and hasattr(cvf, "cache_clear"):
@@ -66,10 +63,18 @@ def _params(index: torch.Tensor, dim: int) -> torch.Tensor:
 class QwenImageTrainPipelineConfig(TrainPipelineConfig):
 
     lora_target_modules = [
-        "to_q", "to_k", "to_v", "to_out.0",
-        "add_q_proj", "add_k_proj", "add_v_proj", "to_add_out",
-        "img_mlp.net.0.proj", "img_mlp.net.2",
-        "txt_mlp.net.0.proj", "txt_mlp.net.2",
+        "to_q",
+        "to_k",
+        "to_v",
+        "to_out.0",
+        "add_q_proj",
+        "add_k_proj",
+        "add_v_proj",
+        "to_add_out",
+        "img_mlp.net.0.proj",
+        "img_mlp.net.2",
+        "txt_mlp.net.0.proj",
+        "txt_mlp.net.2",
     ]
 
     # Last-block text-branch outputs are discarded by parent transformer.forward.
@@ -111,25 +116,22 @@ def collate_cond_for_sample_batch(
         for kw in per_sample_cond_kwargs:
             lens = kw.get("txt_seq_lens") or []
             assert len(lens) == 1, (
-                f"collate expects per-sample cond_kwargs with txt_seq_lens of length 1, "
-                f"got {lens}"
+                f"collate expects per-sample cond_kwargs with txt_seq_lens of length 1, " f"got {lens}"
             )
             L = int(lens[0])
             seq_lens.append(L)
-            enc = kw["encoder_hidden_states"]   # (1, L_i, D) — L_i may equal L or be padded already
-            assert enc.shape[0] == 1, (
-                f"collate expects per-sample encoder_hidden_states with batch=1, got {tuple(enc.shape)}"
-            )
+            enc = kw["encoder_hidden_states"]  # (1, L_i, D) — L_i may equal L or be padded already
+            assert (
+                enc.shape[0] == 1
+            ), f"collate expects per-sample encoder_hidden_states with batch=1, got {tuple(enc.shape)}"
             encs.append(enc)
             shapes = kw.get("img_shapes") or []
-            assert len(shapes) == 1, (
-                f"collate expects per-sample img_shapes of length 1, got {shapes}"
-            )
+            assert len(shapes) == 1, f"collate expects per-sample img_shapes of length 1, got {shapes}"
             img_shapes.append(shapes[0])
 
         max_len = max(seq_lens)
         padded = []
-        for enc, L in zip(encs, seq_lens):
+        for enc, _L in zip(encs, seq_lens, strict=False):
             cur_len = enc.shape[1]
             if cur_len < max_len:
                 # pad seq dim on the right; F.pad with 4-tuple pads the last 2 dims
@@ -138,12 +140,11 @@ def collate_cond_for_sample_batch(
             elif cur_len > max_len:
                 enc = enc[:, :max_len, :]
             padded.append(enc)
-        encoder_hidden_states = torch.cat(padded, dim=0).to(device)   # (M, max_len, D)
+        encoder_hidden_states = torch.cat(padded, dim=0).to(device)  # (M, max_len, D)
 
-        mask = (
-            torch.arange(max_len, device=device).unsqueeze(0)
-            < torch.tensor(seq_lens, device=device).unsqueeze(1)
-        )                                                              # (M, max_len) bool
+        mask = torch.arange(max_len, device=device).unsqueeze(0) < torch.tensor(seq_lens, device=device).unsqueeze(
+            1
+        )  # (M, max_len) bool
 
         return {
             "encoder_hidden_states": encoder_hidden_states,
@@ -176,4 +177,4 @@ def cfg_combine(
 
     def preprocess_model_before_fsdp(self, model: torch.nn.Module) -> None:
         """Preprocess the model before FSDP."""
-        _rebuild_pos_embed_freqs_on_cuda(model)
\ No newline at end of file
+        _rebuild_pos_embed_freqs_on_cuda(model)
diff --git a/miles/backends/fsdp_utils/configs/train_pipeline_config.py b/miles/backends/fsdp_utils/configs/train_pipeline_config.py
index 9efbfa62..622559ed 100644
--- a/miles/backends/fsdp_utils/configs/train_pipeline_config.py
+++ b/miles/backends/fsdp_utils/configs/train_pipeline_config.py
@@ -19,27 +19,28 @@
 from miles.utils.types import CondKwargs, DiTTrajectory
 
 
-_REGISTRY: dict[str, type["TrainPipelineConfig"]] = {}
+_REGISTRY: dict[str, type[TrainPipelineConfig]] = {}
 
 
 def register_train_pipeline_config(*model_name_patterns: str):
     """Decorator: register a TrainPipelineConfig subclass for one or more model name patterns."""
+
     def wrapper(cls):
         for pat in model_name_patterns:
             _REGISTRY[pat.lower()] = cls
         return cls
+
     return wrapper
 
 
-def get_train_pipeline_config(model_name: str) -> "TrainPipelineConfig":
+def get_train_pipeline_config(model_name: str) -> TrainPipelineConfig:
     """Look up and instantiate a TrainPipelineConfig by matching model_name against registered patterns."""
     name_lower = model_name.lower()
     for pattern, cls in _REGISTRY.items():
         if pattern in name_lower:
             return cls()
     raise ValueError(
-        f"No TrainPipelineConfig registered for model '{model_name}'. "
-        f"Known patterns: {list(_REGISTRY.keys())}"
+        f"No TrainPipelineConfig registered for model '{model_name}'. " f"Known patterns: {list(_REGISTRY.keys())}"
     )
 
 
@@ -102,7 +103,7 @@ def collate_cond_for_sample_batch(
         Default: naive concat along batch dim, only valid when shapes match.
         """
         raise NotImplementedError(
-            f"Must implement collate_cond_for_sample_batch to enable --micro-batch-size-sample in fsdp training"
+            "Must implement collate_cond_for_sample_batch to enable --micro-batch-size-sample in fsdp training"
         )
 
     @abc.abstractmethod
@@ -118,4 +119,3 @@ def cfg_combine(
     @abc.abstractmethod
     def preprocess_model_before_fsdp(self, model: torch.nn.Module) -> None:
         """Preprocess the model before FSDP."""
-        pass
\ No newline at end of file
diff --git a/miles/backends/fsdp_utils/diffusion_update_weight_utils.py b/miles/backends/fsdp_utils/diffusion_update_weight_utils.py
index 8141da26..a72f8966 100644
--- a/miles/backends/fsdp_utils/diffusion_update_weight_utils.py
+++ b/miles/backends/fsdp_utils/diffusion_update_weight_utils.py
@@ -141,9 +141,7 @@ def update_bucket_weights(self, named_tensors, weight_version=None) -> None:
                     "metadata": metadata,
                 }
             }
-            serialized_tensors.append(
-                MultiprocessingSerializer.serialize(flattened_tensor_data, output_str=True)
-            )
+            serialized_tensors.append(MultiprocessingSerializer.serialize(flattened_tensor_data, output_str=True))
 
         if self._ipc_gather_src == dist.get_rank():
             gathered_serialized_batches = [None for _ in range(dist.get_world_size(self._ipc_gather_group))]
@@ -171,6 +169,7 @@ def update_bucket_weights(self, named_tensors, weight_version=None) -> None:
                 ref = self._ipc_engine.update_weights_from_tensor.remote(**kwargs)
                 ray.get(ref)
 
+
 # TODO: update weights only for sgl-d LoRA params
 class DiffusionUpdateWeightFromTensorLoRA(DiffusionUpdateWeightFromTensor):
     """LoRA-aware updater: merges adapters into base before pushing to rollout.
@@ -240,8 +239,7 @@ def update_weights(self):
             # ``base_model.model.`` is PeftModel.base_model (=LoraModel) .model.
             sglang_d_param_name = name.replace(".base_layer", "")
             if sglang_d_param_name.startswith("base_model.model."):
-                sglang_d_param_name = sglang_d_param_name[len("base_model.model."):]
-
+                sglang_d_param_name = sglang_d_param_name[len("base_model.model.") :]
 
             sz = param.numel() * param.element_size()
             if bucket and bucket_size + sz >= self.args.update_weight_buffer_size:
@@ -272,9 +270,7 @@ def _verify_weight_sync(self, pairs: list[tuple[str, torch.Tensor]]) -> None:
         expected = self._sha256_named_tensors(pairs)
 
         try:
-            remote = ray.get(
-                self._ipc_engine.get_weights_checksum.remote([self.target_module])
-            )
+            remote = ray.get(self._ipc_engine.get_weights_checksum.remote([self.target_module]))
         except Exception as e:
             logger.error(f"[weight_sync verify] failed to fetch remote checksum: {e}")
             return
@@ -295,27 +291,15 @@ def _verify_weight_sync(self, pairs: list[tuple[str, torch.Tensor]]) -> None:
         if dist.get_rank() != 0:
             return
         try:
-            per_engine = ray.get([
-                e.get_weights_checksum.remote([self.target_module])
-                for e in self.rollout_engines
-            ])
+            per_engine = ray.get([e.get_weights_checksum.remote([self.target_module]) for e in self.rollout_engines])
         except Exception as e:
             logger.error(f"[weight_sync verify cross-engine] failed: {e}")
             return
-        engine_sums = [
-            (idx, (r or {}).get(self.target_module))
-            for idx, r in enumerate(per_engine)
-        ]
+        engine_sums = [(idx, (r or {}).get(self.target_module)) for idx, r in enumerate(per_engine)]
         first_sum = engine_sums[0][1]
         all_equal = all(s == first_sum for _, s in engine_sums)
-        pretty = "  ".join(
-            f"eng{idx}={s[:16] if isinstance(s, str) else s}"
-            for idx, s in engine_sums
-        )
-        logger.warning(
-            f"[weight_sync verify v{self.weight_version} cross-engine] "
-            f"all_equal={all_equal}  {pretty}"
-        )
+        pretty = "  ".join(f"eng{idx}={s[:16] if isinstance(s, str) else s}" for idx, s in engine_sums)
+        logger.warning(f"[weight_sync verify v{self.weight_version} cross-engine] " f"all_equal={all_equal}  {pretty}")
 
     @staticmethod
     def _sha256_named_tensors(pairs: list[tuple[str, torch.Tensor]]) -> str:
@@ -327,4 +311,4 @@ def _sha256_named_tensors(pairs: list[tuple[str, torch.Tensor]]) -> str:
             if isinstance(t, DTensor):
                 t = t._local_tensor
             hasher.update(t.cpu().contiguous().reshape(-1).view(torch.uint8).numpy().data)
-        return hasher.hexdigest()
\ No newline at end of file
+        return hasher.hexdigest()
diff --git a/miles/backends/sglang_diffusion_utils/monkey_patches/patch_layernorm_scale_shift.py b/miles/backends/sglang_diffusion_utils/monkey_patches/patch_layernorm_scale_shift.py
index 123829be..e34ecfdc 100644
--- a/miles/backends/sglang_diffusion_utils/monkey_patches/patch_layernorm_scale_shift.py
+++ b/miles/backends/sglang_diffusion_utils/monkey_patches/patch_layernorm_scale_shift.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import torch
 
 from sglang.multimodal_gen.runtime.layers.layernorm import LayerNormScaleShift
@@ -10,8 +8,8 @@
 def _patched_forward(
     self,
     x: torch.Tensor,
-    shift: Optional[torch.Tensor] = None,
-    scale: Optional[torch.Tensor] = None,
+    shift: torch.Tensor | None = None,
+    scale: torch.Tensor | None = None,
 ):
     # diffusers sequence: LayerNorm(x) then (1+scale)*x + shift in bf16 eager.
     normed = self.norm(x)
diff --git a/miles/backends/sglang_diffusion_utils/monkey_patches/patch_mul_add.py b/miles/backends/sglang_diffusion_utils/monkey_patches/patch_mul_add.py
index 25381ff6..3ef8650e 100644
--- a/miles/backends/sglang_diffusion_utils/monkey_patches/patch_mul_add.py
+++ b/miles/backends/sglang_diffusion_utils/monkey_patches/patch_mul_add.py
@@ -14,9 +14,7 @@ def _patched_forward(
     if b.dim() == 4:
         num_frames = b.shape[1]
         frame_seqlen = a.shape[1] // num_frames
-        return c + (
-            a.unflatten(dim=1, sizes=(num_frames, frame_seqlen)) * (k + b)
-        ).flatten(1, 2)
+        return c + (a.unflatten(dim=1, sizes=(num_frames, frame_seqlen)) * (k + b)).flatten(1, 2)
     return c + a * (k + b)
 
 
diff --git a/miles/backends/sglang_diffusion_utils/monkey_patches/patch_rmsnorm.py b/miles/backends/sglang_diffusion_utils/monkey_patches/patch_rmsnorm.py
index 466319a2..86a67f69 100644
--- a/miles/backends/sglang_diffusion_utils/monkey_patches/patch_rmsnorm.py
+++ b/miles/backends/sglang_diffusion_utils/monkey_patches/patch_rmsnorm.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import torch
 
 from sglang.multimodal_gen.runtime.layers.layernorm import RMSNorm
@@ -8,7 +6,7 @@
 def _patched_forward(
     self,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
 ):
     # diffusers' RMSNorm rounds to weight dtype BEFORE the weight mul, so the
     # mul runs bf16*bf16. sgl-d's default keeps fp32 through the weight mul.
@@ -22,11 +20,7 @@ def _patched_forward(
         residual = x_fp32.to(orig_dtype)
 
     variance_size_override = getattr(self, "variance_size_override", None)
-    x_var = (
-        x_fp32
-        if variance_size_override is None
-        else x_fp32[..., :variance_size_override]
-    )
+    x_var = x_fp32 if variance_size_override is None else x_fp32[..., :variance_size_override]
     variance = x_var.pow(2).mean(dim=-1, keepdim=True)
     x_fp32 = x_fp32 * torch.rsqrt(variance + self.variance_epsilon)
 
diff --git a/miles/backends/sglang_diffusion_utils/monkey_patches/patch_scale_residual_layernorm.py b/miles/backends/sglang_diffusion_utils/monkey_patches/patch_scale_residual_layernorm.py
index ddc70a1f..dcb45694 100644
--- a/miles/backends/sglang_diffusion_utils/monkey_patches/patch_scale_residual_layernorm.py
+++ b/miles/backends/sglang_diffusion_utils/monkey_patches/patch_scale_residual_layernorm.py
@@ -1,8 +1,5 @@
 import torch
-
-from sglang.multimodal_gen.runtime.layers.layernorm import (
-    ScaleResidualLayerNormScaleShift,
-)
+from sglang.multimodal_gen.runtime.layers.layernorm import ScaleResidualLayerNormScaleShift
 
 from miles.backends.sglang_diffusion_utils.monkey_patches._common import ensure_broadcast
 
@@ -23,9 +20,7 @@ def _patched_forward(
     elif gate.dim() == 4:
         num_frames = gate.shape[1]
         frame_seqlen = x.shape[1] // num_frames
-        residual_out = residual + (
-            x.unflatten(dim=1, sizes=(num_frames, frame_seqlen)) * gate
-        ).flatten(1, 2)
+        residual_out = residual + (x.unflatten(dim=1, sizes=(num_frames, frame_seqlen)) * gate).flatten(1, 2)
     else:
         residual_out = residual + x * gate
 
diff --git a/miles/backends/sglang_diffusion_utils/sglang_diffusion_engine.py b/miles/backends/sglang_diffusion_utils/sglang_diffusion_engine.py
index 3b99865d..c16e2f3f 100644
--- a/miles/backends/sglang_diffusion_utils/sglang_diffusion_engine.py
+++ b/miles/backends/sglang_diffusion_utils/sglang_diffusion_engine.py
@@ -6,8 +6,8 @@
 import time
 
 import requests
-from sglang.multimodal_gen.runtime.server_args import ServerArgs
 from sglang.multimodal_gen.runtime.launch_server import kill_process_tree
+from sglang.multimodal_gen.runtime.server_args import ServerArgs
 
 from miles.ray.ray_actor import RayActor
 from miles.utils.http_utils import get_host_info
@@ -40,8 +40,10 @@ def _scheduler_process_with_sgld_monkey_patches(*args, **kwargs):
     # before calling the real run_scheduler_process, so the DiT that's
     # constructed inside the grandchild sees the patched classes.
     from miles.backends.sglang_diffusion_utils.monkey_patches import apply_sgld_monkey_patches
+
     apply_sgld_monkey_patches()
     from sglang.multimodal_gen.runtime.managers.gpu_worker import run_scheduler_process
+
     return run_scheduler_process(*args, **kwargs)
 
 
@@ -61,9 +63,11 @@ def _launch_server_target(server_args, apply_sgld_monkey_patches: bool = False):
         # the miles qualname across to the grandchild, which applies the patch before
         # calling the real scheduler entrypoint.
         import sglang.multimodal_gen.runtime.launch_server as _ls_mod
+
         _ls_mod.run_scheduler_process = _scheduler_process_with_sgld_monkey_patches
 
     from sglang.multimodal_gen.runtime.launch_server import launch_server
+
     launch_server(server_args)
 
 
@@ -108,6 +112,7 @@ def _wait_server_healthy(base_url, is_process_alive):
 
             time.sleep(2)
 
+
 class SGLangDiffusionEngine(RayActor):
     def __init__(self, args, rank: int, base_gpu_id: int | None = None):
         self.args = args
@@ -154,10 +159,7 @@ def _init_normal(self, server_args_dict):
         self._pin_to_assigned_gpu()
         apply_sgld_monkey_patches = self.args.apply_sgld_monkey_patches
         if apply_sgld_monkey_patches:
-            logger.info(
-                "Launching sglang-d with sgl-d → diffusers monkey patches "
-                "(--apply-sgld-monkey-patches)"
-            )
+            logger.info("Launching sglang-d with sgl-d → diffusers monkey patches (--apply-sgld-monkey-patches)")
         self.process = launch_server_process(
             ServerArgs.from_kwargs(**server_args_dict),
             apply_sgld_monkey_patches=apply_sgld_monkey_patches,
@@ -275,12 +277,10 @@ def shutdown(self):
             worker_url = f"http://{self.server_host}:{self.server_port}"
             response = None
             if self.args.use_miles_router:
-                response = requests.post(
-                    f"http://{self.router_ip}:{self.router_port}/remove_worker?url={worker_url}"
-                )
+                response = requests.post(f"http://{self.router_ip}:{self.router_port}/remove_worker?url={worker_url}")
             else:
                 # SGL-D router TODO: shutdown for sglang-diffusion router
-                logger.warning(f"Failed to fetch workers list or remove worker: now only support miles_router")
+                logger.warning("Failed to fetch workers list or remove worker: now only support miles_router")
 
             if response is not None:
                 response.raise_for_status()
diff --git a/miles/backends/training_utils/data.py b/miles/backends/training_utils/data.py
index c6f88327..3c069b34 100644
--- a/miles/backends/training_utils/data.py
+++ b/miles/backends/training_utils/data.py
@@ -1,6 +1,7 @@
 import logging
 from argparse import Namespace
 from collections.abc import Sequence
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -8,7 +9,6 @@
 
 from miles.utils.data import get_minimum_num_micro_batch_size
 from miles.utils.seqlen_balancing import get_seqlen_balanced_partitions
-from typing import Any
 
 # Type alias – rollout data is a plain dict of lists/tensors.
 RolloutBatch = dict[str, Any]
diff --git a/miles/backends/training_utils/log_utils.py b/miles/backends/training_utils/log_utils.py
index f18db839..379e529f 100644
--- a/miles/backends/training_utils/log_utils.py
+++ b/miles/backends/training_utils/log_utils.py
@@ -1,6 +1,7 @@
 import logging
 from argparse import Namespace
 from math import isclose
+from typing import Any
 
 import numpy as np
 import torch
@@ -9,7 +10,6 @@
 from miles.utils import train_metric_utils
 from miles.utils.flops_utils import calculate_fwd_flops
 from miles.utils.metric_utils import compute_pass_rate, compute_rollout_step
-from typing import Any
 
 RolloutBatch = dict[str, Any]
 
diff --git a/miles/backends/training_utils/loss.py b/miles/backends/training_utils/loss.py
index 945e8fe4..721e9d81 100644
--- a/miles/backends/training_utils/loss.py
+++ b/miles/backends/training_utils/loss.py
@@ -18,6 +18,7 @@
     get_reinforce_plus_plus_baseline_advantages,
     get_reinforce_plus_plus_returns,
 )
+
 RolloutBatch = dict[str, Any]
 
 from .cp_utils import all_gather_with_cp, get_logits_and_tokens_offset_with_cp, get_sum_of_sample_mean
diff --git a/miles/ray/actor_group.py b/miles/ray/actor_group.py
index 40f7d407..82a19b03 100644
--- a/miles/ray/actor_group.py
+++ b/miles/ray/actor_group.py
@@ -4,8 +4,6 @@
 from ray.util.placement_group import PlacementGroup
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
-from miles.ray.utils import NOSET_VISIBLE_DEVICES_ENV_VARS_LIST
-
 
 class RayTrainGroup:
     """
diff --git a/miles/ray/placement_group.py b/miles/ray/placement_group.py
index c47f8b68..0200bb02 100644
--- a/miles/ray/placement_group.py
+++ b/miles/ray/placement_group.py
@@ -137,9 +137,7 @@ def create_training_models(args, pgs, rollout_manager):
         num_gpus_per_node=args.actor_num_gpus_per_node,
         pg=pgs["actor"],
     )
-    start_rollout_ids = ray.get(
-        actor_model.async_init(args, role="actor", with_ref=False)
-    )
+    start_rollout_ids = ray.get(actor_model.async_init(args, role="actor", with_ref=False))
     logger.info("Actor model initialized.")
 
     assert len(set(start_rollout_ids)) == 1
diff --git a/miles/ray/rollout.py b/miles/ray/rollout.py
index d9019bd5..1516864c 100644
--- a/miles/ray/rollout.py
+++ b/miles/ray/rollout.py
@@ -380,9 +380,7 @@ def _convert_samples_to_train_data(self, samples: list[Sample] | list[list[Sampl
             "sample_indices": [sample.index for sample in samples],
             "prompt": [sample.prompt for sample in samples],
             # Per-sample training step indices (flow_grpo sde-window). None = train every step.
-            "sde_step_indices": [
-                (sample.train_metadata or {}).get("sde_step_indices") for sample in samples
-            ],
+            "sde_step_indices": [(sample.train_metadata or {}).get("sde_step_indices") for sample in samples],
         }
 
         return train_data
@@ -405,6 +403,7 @@ def _log_images(
         own namespace at least groups them in one UI section.
         """
         import wandb
+
         log_dict: dict = {}
         for media_key, samples in media_key_to_samples.items():
             images = []
@@ -449,10 +448,11 @@ def _split_train_data_by_dp(self, data, dp_size):
             rollout_data_refs.append(Box(ray.put(rollout_data)))
         return rollout_data_refs
 
+
 def init_rollout_engines(args, pg, all_rollout_engines):
     if args.debug_train_only:
         return 0
-    
+
     num_gpu_per_engine = min(args.rollout_num_gpus_per_engine, args.num_gpus_per_node)
     num_engines = args.rollout_num_gpus // num_gpu_per_engine
     assert len(all_rollout_engines) == num_engines
@@ -602,7 +602,7 @@ def _start_router(args):
         from miles.router.router import run_router
 
         router_args = args
-    else :
+    else:
         raise RuntimeError("Miles-diffusion only supports miles router for now")
 
     process = multiprocessing.Process(
@@ -698,5 +698,3 @@ def _is_zero_std(samples: list[Sample]):
     interesting_rewards = [str(round(g[0].get_reward_value(args), 1)) for g in interesting_sample_groups]
 
     return {f"zero_std/count_{reward}": len(items) for reward, items in group_by(interesting_rewards).items()}
-
-
diff --git a/miles/rollout/rm_hub/ocr.py b/miles/rollout/rm_hub/ocr.py
index 31d8ce88..5382d5ba 100644
--- a/miles/rollout/rm_hub/ocr.py
+++ b/miles/rollout/rm_hub/ocr.py
@@ -1,18 +1,20 @@
+import argparse
 import asyncio
 import logging
+
 import numpy as np
 import ray
 import torch
-import argparse
-from paddleocr import PaddleOCR
 from Levenshtein import distance
+from paddleocr import PaddleOCR
 from PIL import Image
-from typing import List, Union
+
 from miles.utils.misc import SingletonMeta
 from miles.utils.types import Sample
 
 logger = logging.getLogger(__name__)
 
+
 def _init_paddleocr(use_gpu: bool) -> PaddleOCR:
     def make_ocr() -> PaddleOCR:
         return PaddleOCR(
@@ -21,8 +23,10 @@ def make_ocr() -> PaddleOCR:
             use_gpu=use_gpu,
             show_log=False,
         )
+
     return make_ocr()
 
+
 class OcrScorer:
     def __init__(self, use_gpu: bool = False):
         """
@@ -32,9 +36,7 @@ def __init__(self, use_gpu: bool = False):
         self.ocr = _init_paddleocr(use_gpu)
 
     @torch.no_grad()
-    def __call__(self, 
-                images: Union[List[Image.Image], List[np.ndarray]], 
-                prompts: List[str]) -> List[float]:
+    def __call__(self, images: list[Image.Image] | list[np.ndarray], prompts: list[str]) -> list[float]:
         """
         Calculate OCR reward
         :param images: List of input images (PIL or numpy format)
@@ -44,20 +46,24 @@ def __call__(self,
         prompts = [prompt.split('"')[1] for prompt in prompts]
         rewards = []
         # Ensure input lengths are consistent
-        assert len(images) == len(prompts), f"Images({len(images)}) and prompts({len(prompts)}) must have the same length"
-        for img, prompt in zip(images, prompts):
+        assert len(images) == len(
+            prompts
+        ), f"Images({len(images)}) and prompts({len(prompts)}) must have the same length"
+        for img, prompt in zip(images, prompts, strict=False):
             # Convert image format
             if isinstance(img, Image.Image):
                 img = np.array(img)
-            
+
             try:
                 # OCR recognition
                 result = self.ocr.ocr(img, cls=False)
                 # Extract recognized text (handle possible multi-line results)
-                recognized_text = ''.join([res[1][0] if res[1][1] > 0 else '' for res in result[0]]) if result[0] else ''
-                
-                recognized_text = recognized_text.replace(' ', '').lower()
-                prompt = prompt.replace(' ', '').lower()
+                recognized_text = (
+                    "".join([res[1][0] if res[1][1] > 0 else "" for res in result[0]]) if result[0] else ""
+                )
+
+                recognized_text = recognized_text.replace(" ", "").lower()
+                prompt = prompt.replace(" ", "").lower()
                 if prompt in recognized_text:
                     dist = 0
                 else:
@@ -65,16 +71,17 @@ def __call__(self,
                 # Recognized many unrelated characters, only add one character penalty
                 if dist > len(prompt):
                     dist = len(prompt)
-                
+
             except Exception as e:
                 # Error handling (e.g., OCR parsing failure)
                 print(f"OCR processing failed: {str(e)}")
                 dist = len(prompt)  # Maximum penalty
-            reward = 1-dist/(len(prompt))
+            reward = 1 - dist / (len(prompt))
             rewards.append(reward)
 
         return rewards
 
+
 @ray.remote
 class OcrRewardActor:
     def __init__(self, use_gpu: bool = False):
@@ -107,6 +114,7 @@ async def score(self, image: np.ndarray, prompt: str) -> float:
         loop = asyncio.get_running_loop()
         return float(await loop.run_in_executor(None, ray.get, ref))
 
+
 def _rgb_hwc_from_generated(sample: Sample) -> np.ndarray:
     """``generated_output``: ``[C, F, H, W]`` or ``[C, H, W]``; use frame index 0.
 
@@ -136,19 +144,21 @@ def _rgb_hwc_from_generated(sample: Sample) -> np.ndarray:
         out = hwc.clip(0, 255).astype(np.uint8)
     return out
 
+
 async def ocr_rm(args, sample: Sample):
     pool = AsyncOcrPool(args)
     image = _rgb_hwc_from_generated(sample)
     score = await pool.score(image, sample.prompt)
     return score
 
+
 if __name__ == "__main__":
     args = argparse.Namespace(ocr_num_workers=4)
     pil_image = Image.open("imgs/miles_logo.png").convert("RGB")
     image_tensor = torch.from_numpy(np.array(pil_image)).permute(2, 0, 1).unsqueeze(1).float()
     sample = Sample(
-        prompt="A logo of Miles saying \"Miles\"",
+        prompt='A logo of Miles saying "Miles"',
         generated_output=image_tensor,
     )
     img = np.array(pil_image)
-    print(OcrScorer()([img], [sample.prompt])[0])
\ No newline at end of file
+    print(OcrScorer()([img], [sample.prompt])[0])
diff --git a/miles/rollout/sglang_diffusion_rollout.py b/miles/rollout/sglang_diffusion_rollout.py
index fe776b38..6ad11abf 100644
--- a/miles/rollout/sglang_diffusion_rollout.py
+++ b/miles/rollout/sglang_diffusion_rollout.py
@@ -22,7 +22,7 @@
 from miles.utils.misc import SingletonMeta, load_function
 from miles.utils.types import Sample
 
-from .rm_hub import async_rm, batched_async_rm
+from .rm_hub import batched_async_rm
 
 __all__ = ["generate_rollout"]
 
@@ -30,13 +30,9 @@
 
 
 def build_rollout_sampling_params(
-    args: Namespace, 
-    *, 
-    extra_sampling_params: dict[str, Any] | None = None, 
-    evaluation: bool = False
+    args: Namespace, *, extra_sampling_params: dict[str, Any] | None = None, evaluation: bool = False
 ) -> dict[str, Any]:
-    """Build static fields in JSON body for ``POST /rollout/generate`` (``RolloutImageRequest``). 
-    """
+    """Build static fields in JSON body for ``POST /rollout/generate`` (``RolloutImageRequest``)."""
     neg = args.diffusion_negative_prompt
     eval_steps = args.diffusion_eval_num_steps
     num_steps = eval_steps if evaluation and eval_steps is not None else args.diffusion_num_steps
@@ -71,20 +67,21 @@ def build_rollout_sampling_params(
 
     return sampling_params
 
+
 def build_rollout_generate_payload(
     sampling_params: dict[str, Any],
     prompt: str,
     *,
     num_outputs_per_prompt: int = 1,
 ) -> dict[str, Any]:
-    """Build full JSON payload for ``POST /rollout/generate`` (``RolloutImageRequest``).
-    """
+    """Build full JSON payload for ``POST /rollout/generate`` (``RolloutImageRequest``)."""
     sampling_params["prompt"] = prompt
     if sampling_params["negative_prompt"] is None:
         sampling_params["negative_prompt"] = " "  # FlowGRPO default
     sampling_params["num_outputs_per_prompt"] = num_outputs_per_prompt
     return sampling_params
 
+
 class GenerateState(metaclass=SingletonMeta):
     """Global state for sglang-diffusion image rollout."""
 
@@ -96,9 +93,7 @@ def __init__(self, args: Namespace) -> None:
         )
         self.sampling_params = build_rollout_sampling_params(args)
         self.step_strategy_fn = (
-            load_function(args.diffusion_step_strategy_path)
-            if args.diffusion_step_strategy_path
-            else None
+            load_function(args.diffusion_step_strategy_path) if args.diffusion_step_strategy_path else None
         )
         self.dp_counts = [0] * args.sglang_dp_size
         self.dp_rank = 0
@@ -165,9 +160,7 @@ async def generate_microgroup(
         sde_indices = None
 
     payload = build_rollout_generate_payload(
-        sampling_params,
-        microgroup[0].prompt,
-        num_outputs_per_prompt=len(microgroup)
+        sampling_params, microgroup[0].prompt, num_outputs_per_prompt=len(microgroup)
     )
 
     output = await post(url, payload)
@@ -199,7 +192,6 @@ async def generate_and_rm_microgroup(
     sampling_params: dict[str, Any],
     evaluation: bool = False,
 ) -> list[Sample]:
-    return_microgroup = []
 
     state = GenerateState(args)
 
@@ -229,7 +221,7 @@ async def generate_and_rm_microgroup(
 async def generate_and_rm_group(
     args: Namespace, group: list[Sample], sampling_params: dict[str, Any], evaluation: bool = False
 ) -> list[Sample]:
-    state = GenerateState(args)
+    GenerateState(args)
 
     # N-spaced base so sgl-d's seed→[seed+0..seed+N-1] expansion stays disjoint
     # per (rollout, prompt-group); group_index is monotonic across the run.
@@ -239,11 +231,13 @@ async def generate_and_rm_group(
 
     tasks = []
     for idx in range(0, len(group), args.diffusion_microgroup_size):
-        microgroup = group[idx:min(idx + args.diffusion_microgroup_size, len(group))]
+        microgroup = group[idx : min(idx + args.diffusion_microgroup_size, len(group))]
         current_sampling_params = sampling_params.copy()
         current_sampling_params["seed"] = seed_base + idx
         tasks.append(
-            asyncio.create_task(generate_and_rm_microgroup(args, microgroup, current_sampling_params, evaluation=evaluation))
+            asyncio.create_task(
+                generate_and_rm_microgroup(args, microgroup, current_sampling_params, evaluation=evaluation)
+            )
         )
 
     microgroups = await asyncio.gather(*tasks)
@@ -293,7 +287,9 @@ async def generate_rollout_async(
     target_data_size = args.rollout_batch_size
 
     # TODO: oversampling and abort
-    assert args.over_sampling_batch_size == args.rollout_batch_size, "Now we don't support over sampling, please set --over_sampling_batch_size equal to --rollout_batch_size"
+    assert (
+        args.over_sampling_batch_size == args.rollout_batch_size
+    ), "Now we don't support over sampling, please set --over_sampling_batch_size equal to --rollout_batch_size"
 
     data = []
     all_data = []
@@ -343,9 +339,6 @@ async def generate_rollout_async(
 
     assert len(data) == args.rollout_batch_size, f"Got {len(data)} samples, expected {args.rollout_batch_size}"
     data = sorted(data, key=lambda group: group[0][0].index if isinstance(group[0], list) else group[0].index)
-    all_samples = sorted(
-        all_data, key=lambda group: group[0][0].index if isinstance(group[0], list) else group[0].index
-    )
 
     # reset the global state to prevent effects on the next rollout or eval.
     state.reset()
@@ -358,6 +352,7 @@ async def generate_rollout_async(
 
 EVAL_PROMPT_DATASET = {}
 
+
 # eval only
 async def eval_rollout(args: Namespace, rollout_id: int) -> tuple[dict[str, dict[str, list[Any]]], list[list[Sample]]]:
     assert not args.group_rm, "Group RM is not supported for eval rollout"
@@ -430,9 +425,7 @@ async def eval_rollout_single_dataset(
         if do_print:
             row = rows[0]
             logger.info(
-                "eval_rollout_single_dataset example data, prompt: "
-                f"{[str(row.prompt)]} "
-                f"reward={row.reward}"
+                f"eval_rollout_single_dataset example data, prompt: {[str(row.prompt)]} reward={row.reward}"
             )
             do_print = False
         data.extend(rows)
diff --git a/miles/rollout/step_strategy_hub.py b/miles/rollout/step_strategy_hub.py
index a2733fc0..f638ae45 100644
--- a/miles/rollout/step_strategy_hub.py
+++ b/miles/rollout/step_strategy_hub.py
@@ -5,6 +5,7 @@
 where ``sde`` and ``ret`` are ``list[int] | None`` (``None`` = all steps).
 Point ``--diffusion-step-strategy-path`` at any such function.
 """
+
 from __future__ import annotations
 
 from argparse import Namespace
diff --git a/miles/utils/arguments.py b/miles/utils/arguments.py
index 0c1a8d3e..549dc71f 100644
--- a/miles/utils/arguments.py
+++ b/miles/utils/arguments.py
@@ -5,7 +5,6 @@
 from typing import Any
 
 import yaml
-from transformers import AutoConfig
 
 from miles.backends.sglang_diffusion_utils.arguments import add_sglang_diffusion_arguments
 from miles.backends.sglang_diffusion_utils.arguments import validate_args as sglang_validate_args
@@ -368,21 +367,21 @@ def add_rollout_arguments(parser):
                 type=int,
                 default=0,
                 help="flow_grpo-style random SDE window; 0 disables. Steps outside "
-                     "the window run ODE and are not returned for training.",
+                "the window run ODE and are not returned for training.",
             )
             parser.add_argument(
                 "--diffusion-sde-window-range",
                 type=str,
                 default=None,
                 help="'lo,hi' bounds for the SDE window start (inclusive, exclusive). "
-                     "Defaults to [0, num_inference_steps].",
+                "Defaults to [0, num_inference_steps].",
             )
             parser.add_argument(
                 "--diffusion-step-strategy-path",
                 type=str,
                 default=None,
                 help="Dotted path to a factory(args) -> StepStrategy callable. "
-                     "Overrides --diffusion-sde-window-size.",
+                "Overrides --diffusion-sde-window-size.",
             )
             parser.add_argument(
                 "--diffusion-log-prob-no-const",
@@ -901,7 +900,6 @@ def add_wandb_arguments(parser):
             parser.add_argument("--wandb-run-id", type=str, default=None)
             return parser
 
-
         # debug
         def add_debug_arguments(parser):
             parser.add_argument(
@@ -959,14 +957,24 @@ def add_debug_arguments(parser):
             )
 
             # LoRA
-            parser.add_argument("--diffusion-ignore-last", type=int, default=0,
-                help="Skip last N denoising steps for training (avoids small-sigma numerical issues). FlowGRPO/DanceGRPO use 1.")
-            parser.add_argument("--use-lora", action="store_true", default=False,
-                help="Use LoRA adapters instead of full finetune.")
+            parser.add_argument(
+                "--diffusion-ignore-last",
+                type=int,
+                default=0,
+                help="Skip last N denoising steps for training (avoids small-sigma numerical issues). FlowGRPO/DanceGRPO use 1.",
+            )
+            parser.add_argument(
+                "--use-lora", action="store_true", default=False, help="Use LoRA adapters instead of full finetune."
+            )
             parser.add_argument("--lora-rank", type=int, default=64)
             parser.add_argument("--lora-alpha", type=int, default=64)
-            parser.add_argument("--lora-target-modules", type=str, nargs="+", default=None,
-                help="Override LoRA target modules. Default: per-model from TrainPipelineConfig.")
+            parser.add_argument(
+                "--lora-target-modules",
+                type=str,
+                nargs="+",
+                default=None,
+                help="Override LoRA target modules. Default: per-model from TrainPipelineConfig.",
+            )
             parser.add_argument(
                 "--diffusion-init-lora-weight",
                 type=str,
@@ -1166,8 +1174,9 @@ def parse_args(add_custom_arguments=None):
     # TODO: Diffusion FSDP
     add_miles_arguments = get_miles_extra_args_provider(add_custom_arguments)
 
-    backend = parse_args_train_backend()
+    parse_args_train_backend()
     from miles.backends.fsdp_utils.arguments import load_fsdp_args
+
     args = load_fsdp_args(extra_args_provider=add_miles_arguments)
     args.rank = 0  # Primary process rank for wandb initialization
     args.world_size = args.actor_num_nodes * args.actor_num_gpus_per_node
@@ -1251,9 +1260,7 @@ def miles_validate_args(args):
         args.eval_reward_key = args.reward_key
 
     if args.diffusion_log_image_interval < 1:
-        raise ValueError(
-            f"diffusion_log_image_interval must be >= 1, got {args.diffusion_log_image_interval}"
-        )
+        raise ValueError(f"diffusion_log_image_interval must be >= 1, got {args.diffusion_log_image_interval}")
 
     if args.dump_details is not None:
         args.save_debug_rollout_data = f"{args.dump_details}/rollout_data/{{rollout_id}}.pt"
@@ -1321,9 +1328,9 @@ def miles_validate_args(args):
 
     dp_size = args.actor_num_gpus_per_node * args.actor_num_nodes
     if args.global_batch_size is not None:
-        assert args.global_batch_size % dp_size == 0, (
-            f"global_batch_size {args.global_batch_size} is not divisible by dp_size {dp_size}"
-        )
+        assert (
+            args.global_batch_size % dp_size == 0
+        ), f"global_batch_size {args.global_batch_size} is not divisible by dp_size {dp_size}"
     else:
         args.global_batch_size = dp_size
 
diff --git a/miles/utils/diffusion_protocol.py b/miles/utils/diffusion_protocol.py
index 229cfb28..a2e86fff 100644
--- a/miles/utils/diffusion_protocol.py
+++ b/miles/utils/diffusion_protocol.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
+from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Iterable
 
 import torch
 
@@ -9,10 +9,10 @@
 @dataclass(frozen=True)
 class DiffusionRolloutSpec:
     # Required rollout keys to reconstruct per-step log_prob_new and PPO ratio in training.
-    #latents and next latents for log_prob_new in training, log_prob_old used with log_prob_new (in training) to get ratio.
+    # latents and next latents for log_prob_new in training, log_prob_old used with log_prob_new (in training) to get ratio.
     required_keys: tuple[str, ...] = ("timesteps", "sigmas", "latents", "next_latents", "log_prob_old")
     # Optional rollout keys for KL regularization or debugging.
-    #mean of distribution p(x_{t+1} | x_t), for KL
+    # mean of distribution p(x_{t+1} | x_t), for KL
     optional_keys: tuple[str, ...] = ("prev_latents_mean",)
 
 
@@ -119,9 +119,7 @@ def validate_rollout_metadata(metadata: dict) -> list[str]:
     if t_t and t_l and t_t != t_l:
         errors.append(f"timestep mismatch: timesteps steps {t_t} != latents steps {t_l}")
     if t_s and t_t and t_s not in (t_t, t_t + 1):
-        errors.append(
-            f"timestep mismatch: sigmas steps {t_s} not in (timesteps {t_t}, timesteps+1 {t_t + 1})"
-        )
+        errors.append(f"timestep mismatch: sigmas steps {t_s} not in (timesteps {t_t}, timesteps+1 {t_t + 1})")
     if t_l and t_n and t_l != t_n:
         errors.append(f"timestep mismatch: latents steps {t_l} != next_latents steps {t_n}")
     if t_l and t_p and t_l != t_p:
diff --git a/miles/utils/diffusion_rollout_response.py b/miles/utils/diffusion_rollout_response.py
index 0ddf9bc5..8e825b36 100644
--- a/miles/utils/diffusion_rollout_response.py
+++ b/miles/utils/diffusion_rollout_response.py
@@ -80,9 +80,7 @@ def _parse_cond_kwargs(
         encoder_hidden_states=_parse_tensor_or_list(
             data.get("encoder_hidden_states"), deserialize_func=deserialize_func
         ),
-        pooled_projections=_parse_tensor_or_list(
-            data.get("pooled_projections"), deserialize_func=deserialize_func
-        ),
+        pooled_projections=_parse_tensor_or_list(data.get("pooled_projections"), deserialize_func=deserialize_func),
     )
 
 
@@ -144,7 +142,9 @@ def apply_rollout_image_response(
         sample.seed = int(body["seed"])
 
     sample.generated_output = deserialize_func(body.get("generated_output"))
-    sample.rollout_log_probs = _deserialize_rollout_log_probs(body.get("rollout_log_probs"), deserialize_func=deserialize_func)
+    sample.rollout_log_probs = _deserialize_rollout_log_probs(
+        body.get("rollout_log_probs"), deserialize_func=deserialize_func
+    )
     sample.rollout_debug_tensors = _parse_rollout_debug_tensors(
         body.get("rollout_debug_tensors"),
         deserialize_func=deserialize_func,
diff --git a/miles/utils/sde_log_prob.py b/miles/utils/sde_log_prob.py
index 87856e0c..60495dc1 100644
--- a/miles/utils/sde_log_prob.py
+++ b/miles/utils/sde_log_prob.py
@@ -4,7 +4,6 @@
 """
 
 import math
-from typing import Union
 
 import torch
 
@@ -12,7 +11,7 @@
 def sde_step_with_logprob(
     scheduler,
     model_output: torch.FloatTensor,
-    timestep: Union[float, torch.FloatTensor],
+    timestep: float | torch.FloatTensor,
     sample: torch.FloatTensor,
     prev_sample: torch.FloatTensor,
     noise_level: float = 0.7,
diff --git a/miles/utils/types.py b/miles/utils/types.py
index f15c8588..d7169d41 100644
--- a/miles/utils/types.py
+++ b/miles/utils/types.py
@@ -1,17 +1,20 @@
 from __future__ import annotations
 
+import base64
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any
+
 import torch
-import base64
 from safetensors.torch import load, save
 
+
 def decode_tensor_base64(b64: str) -> torch.Tensor:
     """Deserialize base64 to CPU tensor (same wire format as inference: safetensors ``[\"t\"]``, else ``torch.load``)."""
     raw = base64.b64decode(b64.encode("ascii") if isinstance(b64, str) else b64)
     return load(raw)["t"]
 
+
 def tensor_to_base64(tensor: torch.Tensor) -> str:
     """Encode a CPU tensor as base64 safetensors (single key ``tensor_key``, default ``t``)."""
     tensor = tensor.detach().cpu()
diff --git a/train_diffusion.py b/train_diffusion.py
index 4e7caadb..3e69b372 100644
--- a/train_diffusion.py
+++ b/train_diffusion.py
@@ -60,7 +60,7 @@ def save(rollout_id):
         if args.eval_interval is not None and rollout_id == 0 and not args.skip_eval_before_train:
             ray.get(rollout_manager.eval.remote(rollout_id))
 
-        #generating rollout data
+        # generating rollout data
         rollout_data_ref = ray.get(rollout_manager.generate.remote(rollout_id))
         logger.info(f"train: rollout {rollout_id} generate done")