Closed · Changes from all commits (2181 commits)
ce8123c
[Benchmark] Update backend_request_func.py (#6566)
ZhangYulongg Feb 28, 2026
54f7d9f
[CI] Sync mm_batch_invariant with paddle.mm update (#6557)
EmmonsCurse Feb 28, 2026
97eee75
[Feature] GPU Memory Optimization and Retirement of V0 Scheduler (#6407)
ming1753 Feb 28, 2026
a2072fe
[XPU] support warmup with ep & remove apply_tp_fused_op (#6289)
zccjjj Feb 28, 2026
5d42f19
[BugFix][Scheduler] Fix can_schedule_block_num_threshold calculation …
kevincheng2 Feb 28, 2026
fa21fd9
[Docs] Update code overview documentation (#6568)
kevincheng2 Feb 28, 2026
977e2cc
[CI] 【Hackathon 10th Spring No.23】fastdeploy/model_executor/layers/mo…
0Ayachi0 Feb 28, 2026
051bbbe
[Benchmark] Update backend_request_func.py (#6575)
ZhangYulongg Feb 28, 2026
bb51829
[CI] Fix tests and docs to resolve failure (#6572)
EmmonsCurse Mar 1, 2026
7cf5e64
[BugFix] fix cache transfer manager init failed when using block_wise…
liyonghua0910 Mar 1, 2026
ea4d10d
[BugFix] fix cache int8 for pd disaggregated deployment (#6563)
liyonghua0910 Mar 1, 2026
59b578c
[Feature]Supports SWA based on appendattn (#6547)
chang-wenbin Mar 1, 2026
7cfb0ff
fix pfcc deep ep in low latency mode (#6440)
RichardWooSJTU Mar 2, 2026
5382fb2
[BugFix] lazy enable_torch_proxy for cutlass (#6523)
ckl117 Mar 2, 2026
16a2a32
[Metax][Fix] fix error based pr#6407 (#6584)
StareAtYou Mar 2, 2026
d957ccd
seq_lens related tensor shape -> [max_num_seqs] (#6535)
zhoutianzi666 Mar 2, 2026
481d0e3
[CI] Skip long-sequence case due to potential non-determinism (#6587)
EmmonsCurse Mar 2, 2026
ecfd088
[BugFix] Add safety checks in recycle_gpu_blocks to prevent block all…
kevincheng2 Mar 2, 2026
6674131
[Iluvatar] Support CudaGraph and optimize flash_attn_unpadded and fus…
wuyujiji Mar 2, 2026
fe0b3a9
[PD Disaggregation] Fix cache messager performance problem & add kv t…
RichardWooSJTU Mar 2, 2026
7bd86f9
[BugFix] Fix tbo nan (#6439)
RichardWooSJTU Mar 2, 2026
758770b
[CI] 【Hackathon 10th Spring No.28】Functional module fastdeploy/entrypoints/engine_…
kesmeey Mar 2, 2026
6d83dcc
more eplb offline load dtypes (#6435)
RichardWooSJTU Mar 2, 2026
344db8c
[BugFix] Fix mtp when token_ids_all is None (#6591)
ming1753 Mar 2, 2026
3cf7c6c
[Metax][Fix] fix ci error based pr#6535 (#6600)
StareAtYou Mar 2, 2026
aae87e6
[CI] 【Hackathon 10th Spring No.27】Functional module fastdeploy/cache_manager/prefi…
kesmeey Mar 2, 2026
33d6d24
[BugFix] fix bug when seq_lens_this_time is 2D (#6613)
ming1753 Mar 2, 2026
0f718ba
[Speculative Decoding]Reformat input preprocess for spec decode (#6501)
huicongyao Mar 3, 2026
3cc0941
support dsv3 use flashmla (#6593)
zhoutianzi666 Mar 3, 2026
1cae7a0
weight only quant method support QKVGate_proj (#6612)
ckl117 Mar 3, 2026
375b5b7
[Feature]Log Format Normalization and Trace Log Optimization (#6370)
qwes5s5 Mar 3, 2026
61789fe
[Quantization] Support to load static quant ue8m0 scale of DeepGEMM v…
RichardWooSJTU Mar 3, 2026
0256975
[BugFix] fix mtp_config in rl (#6595)
Deleter-D Mar 3, 2026
27ae02f
[BugFix] fix prefix tree updating timeout (#6615)
liyonghua0910 Mar 3, 2026
4ff3f42
[XPU] Add update_attn_mask_offsets op for xpu. (#6556)
Jiajun-Ji Mar 3, 2026
c5eb6b6
[Bug Fix] Fix MM mtp incorrect rope emb (#6581)
ming1753 Mar 3, 2026
c3d6d70
[CI] Add nightly workflow for golang_router tests and improve log han…
EmmonsCurse Mar 3, 2026
9a48a41
[CI] Fix accidental deletion of failed_tests.log during log cleanup (…
EmmonsCurse Mar 3, 2026
c637692
[XPU] support MTP Step > 1 (#6609)
lizan1999 Mar 4, 2026
29d9cb1
fix tp4 dp1 (#6624)
cmcamdy Mar 4, 2026
3d3221e
[CI] 【Hackathon 10th Spring No.31】Functional module fastdeploy/model_executor/laye…
kesmeey Mar 4, 2026
e8e18ce
[Metax][Fix] fix ci error based pr#6501 (#6636)
StareAtYou Mar 4, 2026
aee97e3
fix exist_prefill_flag when preempted task (#6629)
Sunny-bot1 Mar 4, 2026
02d32ee
Revert "[Bug Fix] Fix MM mtp incorrect rope emb (#6581)" (#6631)
ming1753 Mar 4, 2026
3345641
[Iluvatar][CI] fix the dim error of seq_lens_encoder and seq_lens_dec…
wuyujiji Mar 4, 2026
1256fd3
[XPU] weight only quant method support QKVGate_proj (#6641)
zhupengyang Mar 4, 2026
598cce8
[RL] Support SM100 FP8 quantization in RL (#6601)
bukejiyu Mar 4, 2026
81e04bf
[BugFix] fix flash attn mtp rope emb bug (#6649)
ming1753 Mar 4, 2026
5c8f518
[CI] Add pytest timeout and enable workflow rerun (#6645)
EmmonsCurse Mar 4, 2026
ddb06ff
init (#6642)
gongweibao Mar 4, 2026
56ceeda
[CI] Adjust model-specific diff threshold and include iluvatar XPU pa…
EmmonsCurse Mar 5, 2026
fa4815b
[BugFix] fix dp scheduler bug in ep4tp1 when start by using multi_api_…
ddchenhao66 Mar 5, 2026
0dc7034
[Model Runner] Deprecate not_need_stop (#6356)
Sunny-bot1 Mar 5, 2026
63414cc
[XPU][CI] Fix XPU CI Bug (#6658)
plusNew001 Mar 5, 2026
cebe6f7
clean nvfp4 related code (#6644)
zhoutianzi666 Mar 5, 2026
326b975
[BugFix][MTP] Skip empty_input_forward during dummy run (#6653)
yuanlehome Mar 5, 2026
fa1906b
[BugFix] Fix inaccurate cache hit rate and TTFT after request preempt…
liyonghua0910 Mar 5, 2026
a79b82c
[BugFix] fix seq_lens_this_time init (#6670)
Sunny-bot1 Mar 5, 2026
16a393e
[CI] Fix non-deterministic test and skip failed_tests.log in log prin…
EmmonsCurse Mar 5, 2026
839bc83
[BugFix] Fix EB5 model runner compatibility check in worker process (…
Sunny-bot1 Mar 5, 2026
b0fd242
[BugFix] Fix error in dynamic c8 cache (#6544)
juncaipeng Mar 6, 2026
81acdb6
[Iluvatar][CI] Do not specify FD_LOG_DIR (#6665)
wuyujiji Mar 6, 2026
caf73e8
[Feature]add reasoning effort (#6656)
luukunn Mar 6, 2026
5d9524f
[Models][Feature] Support new ERNIE reward model and add return_token…
sunlei1024 Mar 6, 2026
aac1484
[Feature]add arguments string in tool (#6704)
luukunn Mar 6, 2026
1e49855
[BugFix][DataProcessor] Add validate_model_path to fail fast on bad m…
gongweibao Mar 8, 2026
cbfdf42
[CI] Add test_dynamic_c8_cache.py and latest FastDeploy.tar.gz upload…
EmmonsCurse Mar 8, 2026
3c0ff20
[BugFix] fix incorrect function parameters of start_data_parallel_ser…
ddchenhao66 Mar 9, 2026
3a85ecf
[Others] Fix typos in log messages and comments (#6707)
cloudforge1 Mar 9, 2026
30f9f33
[Feature][BugFix][OP] Enhance Deterministic Inference Mode with Kern…
gongweibao Mar 9, 2026
ae71ada
reduce warmup input_length for cudagraph (#6701)
zccjjj Mar 9, 2026
0c69cdf
[CI] 【Hackathon 10th Spring No.24】Functional module fastdeploy/model_executor/laye…
0Ayachi0 Mar 9, 2026
3897a0b
nvfp4 clean code (#6671)
zhoutianzi666 Mar 9, 2026
28f7727
[Feature] Set overlap schedule as default (#6668)
Sunny-bot1 Mar 9, 2026
ecc5032
[XPU] Add return value checks for all XPU kernel launches (#6666)
mayang002 Mar 10, 2026
8e322f9
add reconstruct (#6675)
bukejiyu Mar 10, 2026
73de8b9
[CI] Update test_determinism_long.py to reduce execution time
EmmonsCurse Mar 10, 2026
8b8f0c5
fix update param (#6723)
bukejiyu Mar 10, 2026
22d308a
[Docs] Specify the default strategy (#6728)
mouxinqq Mar 10, 2026
25c4793
[CI][MetaX]Add timeout to Jenkins job trigger step (#6755)
plusNew001 Mar 10, 2026
c3aceb6
[Models][OP][Optimization] Support DeepSeek-v3.2 model, integrate DSA…
chang-wenbin Mar 10, 2026
54581b8
[BugFix]fix iluvatar_model_runner about dsa_cache (#6733)
chang-wenbin Mar 10, 2026
b57c960
cuda13.0, implement changes to CCCL (#6751)
mitu626 Mar 10, 2026
79ad949
[BugFix] Fix updating weight when enable cache storage (#6719)
juncaipeng Mar 10, 2026
5965198
[CI] Temporarily disable test_determinism_offline.py
EmmonsCurse Mar 10, 2026
67388ce
[Iluvatar][CI] Replace ci in ernie-300B-4layer with ernie-21b. (#6747)
wuyujiji Mar 10, 2026
18b0716
[XPU] fix wint4 (#6757)
zhupengyang Mar 10, 2026
812657b
fix pd overlap (#6753)
Sunny-bot1 Mar 10, 2026
6520ae8
[BugFix] fix grpc failure when tracing init before workers forked (#6…
liyonghua0910 Mar 10, 2026
b05a6c4
[BugFix][KVCache] Add inter-process lock to fix NaN error under DP+EP…
Jiang-Jia-Jun Mar 10, 2026
a502dda
[BugFix] fix multi-step mtp bug (#6754)
ddchenhao66 Mar 11, 2026
f6adcc0
Remove BUILD_WHEEL=2 (Python-only quick install) mode from build.sh (…
gongweibao Mar 11, 2026
be36133
Remove Python-only mode documentation from installation guides (#6784)
gongweibao Mar 11, 2026
b6190de
[Feature] Add concurrency protection to selectworker (#6775)
mouxinqq Mar 11, 2026
cf7934a
[Speculative Decoding] Unify Spec and non-spec branch (#6685)
freeliuzc Mar 11, 2026
7811eec
[fix] resolve get_save_output_v1 socket name conflicts between multip…
liyonghua0910 Mar 11, 2026
cffa8c2
[Others]update paddleformer 1.0.0 (#6496)
bukejiyu Mar 11, 2026
97a4b36
[Processor]add qwen3vl prompt_token_ids support (#6764)
CSWYF3634076 Mar 11, 2026
1118351
[Optimization] Update Deepseekv3.2 model and dsa-indexer networking a…
chang-wenbin Mar 11, 2026
0466c7e
Set MC_TCP_BIND_ADDRESS for mooncake store (#6782)
juncaipeng Mar 11, 2026
9f0778f
[Feature] Support EP prefill with num_worst_tokens (#6574)
RichardWooSJTU Mar 11, 2026
88c4fbf
[XPU] Add speculate_limit_thinking_content_length Op. (#6627)
Jiajun-Ji Mar 11, 2026
f0ab8ee
[Iluvatar][CI] add triton in requirements_iluvatar.txt (#6788)
wuyujiji Mar 11, 2026
deff121
[CI] Update _build_linux_rl.yml to use cu129 nightly
EmmonsCurse Mar 11, 2026
1fef825
Fix environment variable name for KV cache lock
Jiang-Jia-Jun Mar 12, 2026
7d31a72
Add PD+EP cudagraph Support
iosmers Mar 12, 2026
3543088
[XPU] rm stop nums (#6651)
cmcamdy Mar 12, 2026
e0febf3
fix debug log (#6766)
qwes5s5 Mar 12, 2026
1ed6073
[Feature] Update logging for Golang Router (#6801)
mouxinqq Mar 12, 2026
cdaf6dd
[RL][Cherry-Pick] Support Fully Async and PrefixCache (#6599)
gongshaotian Mar 12, 2026
a3d7979
[XPU][CI]Rename test_ep4tp1_online.py to run_ep4tp1_online.py (#6805)
plusNew001 Mar 12, 2026
250ce40
[Feature] use phi permute/unpermute & rm swiglu (#6361)
fxyfxy777 Mar 12, 2026
901b38c
[Iluvatar] Optimize decode group_gemm and Support cuda graph for erni…
wuyujiji Mar 12, 2026
a9ace99
[Metax][Fix] fix ci error based pr#6805 caused by pr#6685 (#6807)
StareAtYou Mar 12, 2026
2e63d88
[Optimization][Speculative Decoding]Fuse padding sampling params (#6765)
huicongyao Mar 12, 2026
ab0eacb
[CI] Update _build_linux_rl.yml to use Paddle installation method wit…
EmmonsCurse Mar 12, 2026
d73fd87
[CI] Add daily build_linux jobs for CUDA 13.0 (#6809)
EmmonsCurse Mar 12, 2026
1f9f889
[XPU] refactor: XPU plugin namespace migration (#6799)
mayang002 Mar 13, 2026
cb5a742
[Metax][Test] enable paddleocr using cudagraph (#6820)
StareAtYou Mar 13, 2026
586e6f3
[Others]Limit transformers version (#6806)
bukejiyu Mar 13, 2026
6211004
[RL] add stream guard (#6814)
liufengwei0103 Mar 13, 2026
d935752
[CI] 【Hackathon 10th Spring No.20】Functional module fastdeploy/engine/common_engin…
kesmeey Mar 13, 2026
2b8a5b0
update indexer model (#6791)
chang-wenbin Mar 13, 2026
8eb1771
[BugFix]rm draft code for glm (#6810)
fxyfxy777 Mar 13, 2026
8906e09
[Feature][OP] Add batch-invariant RMSNorm kernel and TP embedding Cus…
gongweibao Mar 13, 2026
12f4124
[Speculative Decoding] Fix speculate stop_seqs and fix accept_num in …
freeliuzc Mar 13, 2026
49fe68a
[Docs] Update Golang Router FAQ (#6829)
mouxinqq Mar 13, 2026
8c1a282
DSA clean code (#6827)
zhoutianzi666 Mar 13, 2026
7591e0d
fix eb5 mtp(mix) (#6800)
cmcamdy Mar 13, 2026
3f4441b
[XPU]add mtp cudagraph support (#6831)
iosmers Mar 13, 2026
820eb60
[Others] clean code (#6839)
zhoutianzi666 Mar 14, 2026
091e3c8
Dsa clean code,add dsk_attn_write_cache baseline (#6855)
zhoutianzi666 Mar 16, 2026
4d39232
[BugFix] add ut for fused_moe_degemm (#6840)
fxyfxy777 Mar 16, 2026
7c8c0a3
[BugFix] replace ftok with custom_ftok in get_output/save_output ops …
liyonghua0910 Mar 16, 2026
3fabba0
[Feature] Add Triton unified attention kernel for deterministic infer…
gongweibao Mar 16, 2026
72ff7bf
[XPU] Fix wrapper files (#6830)
mayang002 Mar 16, 2026
04fde3b
[PD Disaggregation] Prefill and decode support cache storage (#6768)
juncaipeng Mar 16, 2026
bb925c6
[Other] Adjust GPUModelRunner to enhance compatibility (#6851)
ming1753 Mar 16, 2026
c9f7f52
[Optimization][BugFix]Optimize Deepseek networking code (#6861)
chang-wenbin Mar 16, 2026
c5f402e
Update title and release note in README_CN.md
Jiang-Jia-Jun Mar 16, 2026
bd4b609
Update title and activity section in README_CN.md
Jiang-Jia-Jun Mar 16, 2026
5c92f4d
[Feature] Add deepgemm bias epilogue for SM100 (#6857)
Wanglongzhi2001 Mar 16, 2026
d113397
Simplify available_blocks assignment logic (#6819)
Jiang-Jia-Jun Mar 16, 2026
a6351de
[BugFix][Optimization] Replace silent failures with catchable excepti…
gongweibao Mar 16, 2026
4ed483d
[BugFix] Fix ep compatibility issues & Optimize permute operator (#6821)
RichardWooSJTU Mar 17, 2026
fe8d58a
[Optimization]update request in tool parser&reasoning parser (#6858)
luukunn Mar 17, 2026
eab429d
fix performance drop while no spec (#6866)
huicongyao Mar 17, 2026
3b7507a
test_abort (#6743)
qwes5s5 Mar 17, 2026
ea998dd
clean clean code in _load_per_tensor_weight_scale (#6868)
zhoutianzi666 Mar 17, 2026
b152bae
[CI] disable test_batch_invariance_op_logsoftmax.py in unit_test
EmmonsCurse Mar 17, 2026
950366e
[PD Disaggregation][RL] Register to router with version and support r…
juncaipeng Mar 17, 2026
12eb001
Remove comments on multi-mode request handling
Jiang-Jia-Jun Mar 17, 2026
daaf498
[Feature] support compute shared experts before combine for better ov…
Wanglongzhi2001 Mar 17, 2026
b61731b
[Feature][Docs] Adjust prefill release & expose load metrics (#6884)
mouxinqq Mar 17, 2026
cb6819d
[Optimization][OP]support per_token_group_fp8_quant cuda kernel (#6865)
chang-wenbin Mar 17, 2026
e4c9cac
[BugFix] Cap nvcc -t threads to avoid compilation failures on high-co…
gongweibao Mar 17, 2026
aa9deb6
[XPU] Dockerfiles update (#6898)
plusNew001 Mar 17, 2026
148eee8
[XPU] use quant2d_per_token for weight quant int8 && fix some XPU Ker…
lizan1999 Mar 17, 2026
2a371a3
[Feature] Update tpSize (#6896)
mouxinqq Mar 17, 2026
0359794
[CI] Sync _log_softmax_batch_invariant with paddle update (#6893)
EmmonsCurse Mar 17, 2026
8b890c0
[Iluvatar] refactor attn and moe code (#6887)
wuyujiji Mar 18, 2026
9660f98
[BugFix] Set FD_USE_PHI_MOE_PERMUTE = 0 Default (#6886)
fxyfxy777 Mar 18, 2026
0754368
[CI] Isolate cache and ccache for CUDA 13.0 build
EmmonsCurse Mar 18, 2026
9b117aa
support glm-moe-dsa model (#6863)
chang-wenbin Mar 18, 2026
fb6c56d
[BugFix][DataProcessor] Force top_k=1 for greedy decoding when temper…
gongweibao Mar 18, 2026
dd55cda
[CI] Add test for pd and cache storage (#6876)
juncaipeng Mar 19, 2026
4794a28
opt glm5 model (#6916)
chang-wenbin Mar 19, 2026
dd93f8f
[Optimization] Skip compat guard when torch is not installed (#6913)
SigureMo Mar 19, 2026
c184a7c
remove source in weight_loader in moe.py (#6892)
zhoutianzi666 Mar 19, 2026
1a05744
nvfp4.py support ep (#6920)
zhoutianzi666 Mar 19, 2026
f95d8ca
[RL] support qkrmsnorm use proxy-norm (#6862)
zoooo0820 Mar 19, 2026
2b84a42
[CI] Optimize CI: add timeout and cancel on PR close (#6933)
EmmonsCurse Mar 19, 2026
33e01f2
[Feature][Sampling] Extend top-k_top-p sampling to all backends and u…
Sunny-bot1 Mar 19, 2026
b1c800b
remove load_up_proj_weight_first (#6932)
zhoutianzi666 Mar 19, 2026
7141db0
[CI] Optimize CI: update nightly test_image build workflow (#6937)
EmmonsCurse Mar 19, 2026
9148562
[CI]【Hackathon 10th Spring No.35】Supplementary unit tests for resource_manager (#6734)
cloudforge1 Mar 19, 2026
c3d8db8
[Optimization] Update ZMQ server (#6735)
luukunn Mar 19, 2026
f4a79d4
[Optimization]Unified data processing for online and offline (#6891)
luukunn Mar 19, 2026
96b0ece
[Feature] Update Counter Release (#6943)
mouxinqq Mar 20, 2026
d77edf8
opt wfp8afp8 triton moe (#6938)
Sunny-bot1 Mar 20, 2026
a81116a
[Benchmark] Update Qwen3 vl dense yaml (#6945)
xjkmfa Mar 20, 2026
3b20399
[Benchmark] Update Qwen3 vl 32k yaml (#6946)
xjkmfa Mar 20, 2026
aca733b
[CI]【Hackathon 10th Spring No.32】load_weight_utils unit test (#6740)
cloudforge1 Mar 20, 2026
3a4e139
[Benchmark] fix multi turn (#6948)
ZhangYulongg Mar 20, 2026
2b10ebc
[benchmark] Refactor debug logging and payload handling (#6949)
ZhangYulongg Mar 20, 2026
1c38da2
Make seq_lens_this_time/decoder/encoder equal shape (#6942)
zhoutianzi666 Mar 20, 2026
bf7e242
[Optimization][Feature]Supports multiple batches of DSK-DSA. (#6930)
chang-wenbin Mar 20, 2026
32b6900
fix code type (#6951)
sunlei1024 Mar 20, 2026
00eb12f
[BugFix][Models] Unify PaddleFormers fused QKV TP loading and stabili…
jackyYang6 Mar 20, 2026
030820d
[CI] Optimize CI: refine check-bypass/cancel logic and fix nightly ta…
EmmonsCurse Mar 20, 2026
0b4c1cb
[CI] Change 21b ep4 to tp1_dp4 in 4_cards_tests (#6745)
EmmonsCurse Mar 20, 2026
fdd12ff
[CI] Fix: incorrect downstream job execution when only build_gpu/xpu …
EmmonsCurse Mar 22, 2026
33e79f9
[Optimization]Optimize CPU utilization (#6950)
luukunn Mar 22, 2026
7a78001
fix execute_model_normal in empty run (#6968)
Sunny-bot1 Mar 23, 2026
634d23a
[Bugfix] Align thinking_budget behavior with ERNIE reasoning flow (#6…
jackyYang6 Mar 23, 2026
5416da8
remove assert (#6970)
zhoutianzi666 Mar 23, 2026
bb881c2
[PD Disaggregation] pd + cache_storage support vl model (#6906)
juncaipeng Mar 23, 2026
5e469fc
[RL][BugFix][Optimization] Support chunked part files loading and fix…
wikilsh Mar 23, 2026
c1f7991
[BugFix] add worker_process no grad (#6971)
xiaoxiaohehe001 Mar 23, 2026
defaffd
【Hackathon 10th Spring No.45】Support compiling FastDeploy on T4/V100 hardware -part (#6488)
playaswd Mar 23, 2026
1b276e6
[CI] Upgrade GitHub Actions for Node 24 compatibility (#6975)
EmmonsCurse Mar 23, 2026
c62f6b4
[Others] Fix PD reorder for MTP (#6792)
bukejiyu Mar 23, 2026
e87ce4b
[Speculative Decoding] refactor MTP and optimize spec-decoding postpr…
freeliuzc Mar 24, 2026
8b6bbb3
[Optimization] Use a separate driver when using Triton with Paddle (#…
SigureMo Mar 24, 2026
6cff780
[RL] Support moe_topk_select using Paddle native operators and Add fu…
DanielSun11 Mar 24, 2026
5780345
[XPU] fix speculate_verify (#6985)
zhupengyang Mar 24, 2026
522d12c
add deepep precision test (#6984)
zhoutianzi666 Mar 24, 2026
6f5aa88
[benchmark] update benchmark tools (#6991)
ZhangYulongg Mar 24, 2026
c92e277
[RL] RoPE without fmad opt (#6901)
ckl117 Mar 24, 2026
4e8d503
Revert "add deepep precision test (#6984)" (#7004)
EmmonsCurse Mar 25, 2026
aee293b
[CI] Optimize: add vl swap_test and remove useless code (#7000)
EmmonsCurse Mar 25, 2026
7a6c287
[Speculative Decoding] Optimize attn_mask_offset and fix mtp bug (#7005)
freeliuzc Mar 25, 2026
48cfb60
[FDConfig] Reduce FD_CUSTOM_AR_MAX_SIZE_MB default from 64 to 8 (#6997)
gongweibao Mar 25, 2026
a7f52c3
[Feature] support v1 update/clear api for RL (#6761)
liyonghua0910 Mar 25, 2026
b8bb34c
[CI] disable tests/distributed/test_communication.py in unit_test (#7…
EmmonsCurse Mar 25, 2026
482f951
Update copilot-instructions.md
Jiang-Jia-Jun Mar 25, 2026
1502b6f
add instantiations for decoder rope enfore_fmul_rn=true (#7009)
ckl117 Mar 25, 2026
d5cb276
[Optimization] Deduplicate shared image/video utilities across VL pro…
luukunn Mar 26, 2026
e6804ba
[Optimization]Streaming requests return complete special tokens. (#6998)
luukunn Mar 26, 2026
61ebac4
[CI] Fix test_communication.py and add port cleanup (#7021)
EmmonsCurse Mar 26, 2026
4fd877e
[Speculative Decoding] Support mtp expert-parallel and support differ…
freeliuzc Mar 26, 2026
25d64ef
[Speculative Decoding] Refactor Eagle MTP hidden states copy (#6812)
huicongyao Mar 26, 2026
3c9fd81
[BugFix] Fix RDMA initializes failed (#7025)
TBD1 Mar 26, 2026
4425142
[fix] remove all gather ep group control requests in normal cases (#7…
liyonghua0910 Mar 26, 2026
209e5cf
[CE]add 21b mooncake yaml (#7033)
xiegegege Mar 26, 2026
14b17c0
add completion_tokens default (#7032)
luukunn Mar 26, 2026
a31d4bf
[CI] update mtp case (#7031)
ZhangYulongg Mar 27, 2026
c3ed7db
[XPU] [CI] Fix xpu ci bug (#7014)
plusNew001 Mar 27, 2026
10c59f7
[CI] disable tests/e2e/test_Qwen3VLMoe_serving.py in unit_test (#7044)
EmmonsCurse Mar 27, 2026
6c24f19
[Feature] Update error logging (#7045)
mouxinqq Mar 27, 2026
6693bcd
[BugFix] fix clear_parameters in draft cudagraph (#7035)
Deleter-D Mar 27, 2026
8ff8236
[Optimization] optimize fused_swiglu_fp8_quant_kernel (#7007)
fxyfxy777 Mar 27, 2026
11ad95b
[CI]【Hackathon 10th Spring No.43】Supplementary unit tests for ernie4_5_mtp (#6738)
cloudforge1 Mar 27, 2026
bf8e9bf
[XPU] Fix speculate schedule (#7049)
cmcamdy Mar 27, 2026
f25760f
[CI] Update docker run command in unit test coverage workflow (#7050)
ZhangYulongg Mar 27, 2026
842c608
[CI] Align with Paddle layer_norm kernel update (#7056)
EmmonsCurse Mar 27, 2026
a7cbe3f
[CI] Adapt to codecov action changes for Node.js 24 (#7064)
EmmonsCurse Mar 29, 2026
9765fa7
[Refactor] Replace --skip-mm-profiling with --deploy-modality text (#…
kevincheng2 Mar 30, 2026
7a20eae
[Feature] Support cute cpp Encoder FA4 (#7016)
mpgemm Mar 30, 2026
2eea6fa
[BugFix] Fix kv cache int8 dynamic quant on flash and flash_mask back…
Wanglongzhi2001 Mar 30, 2026
61a9079
[Feature] Update logging (#7072)
mouxinqq Mar 30, 2026
1a1d048
[Feature] Support NVFP4 Flashinfer-cutedsl MoE on SM100 (#6963)
mpgemm Mar 30, 2026
5c60e2f
fix bug in cudagraph (#7069)
zhangbo9674 Mar 30, 2026
6d2ab8f
[BugFix] Add lock to avoid generating nan when using storage cache (#…
juncaipeng Mar 30, 2026
1670b01
Revert "[BugFix] Add lock to avoid generating nan when using storage …
Jiang-Jia-Jun Mar 30, 2026
b9f8873
[Optimization]Merge Text processor (#7030)
luukunn Mar 30, 2026
76cf5e9
[append attention] clean code (#7062)
zhoutianzi666 Mar 30, 2026
18062c5
[BugFix][KVCache] Fix mm hash boundary comparison in get_block_hash_e…
kevincheng2 Mar 30, 2026
8789329
[Iluvatar] Support wi4a16 group_gemm (#7078)
wuyujiji Mar 30, 2026
e33bacd
Add debug log for troubleshooting
rainyfly Mar 30, 2026
3 changes: 2 additions & 1 deletion .clang-format
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@
---
Language: Cpp
BasedOnStyle: Google
-IndentWidth: 4
+IndentWidth: 2
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -1 # The private/protected/public has no indent in class
@@ -26,4 +26,5 @@ BinPackParameters: false
BinPackArguments: false
IncludeBlocks: Preserve
IncludeIsMainSourceRegex: (\.cu)$
+SortIncludes: false
...
174 changes: 174 additions & 0 deletions .claude/skills/cuda-kernel-unittest.md
@@ -0,0 +1,174 @@
# Skill: CUDA Kernel Unit Test

Write unit tests for PaddlePaddle CUDA custom ops following a modular 4-layer architecture.

## Trigger

When the user asks to write/create/add unit tests for a CUDA kernel (`.cu` file in `custom_ops/`).

## Steps

1. **Read the CUDA kernel source** to understand: input/output tensors, dtypes, shapes, which tensors are CPU vs GPU, scalar attrs, in-place semantics.
2. **Write the test file** in `tests/operators/test_<kernel_name>.py` following the structure below.

## Test File Structure

```python
import unittest
from typing import Any, Dict
import numpy as np
import paddle

# --- Import ops (bypass fastdeploy.__init__) ---
try:
    import sys, os
    _fd_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    if _fd_root not in sys.path:
        sys.path.insert(0, _fd_root)
    from fastdeploy.import_ops import import_custom_ops
    _package = "fastdeploy.model_executor.ops.gpu"
    import_custom_ops(_package, ".fastdeploy_ops", globals())
except ImportError as e:
    print(f"Import error: {e}")
    raise

CUDA_PLACE = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
CPU_PLACE = paddle.CPUPlace()


# ============================================================
# Layer 1: Helpers — tensor creation / kernel invocation / output extraction
# ============================================================

def to_paddle_inputs(inputs: Dict[str, Any]) -> Dict[str, Any]:
    """Convert numpy dict → paddle tensors. CPU tensors must be explicitly handled."""
    paddle_inputs = {}
    for k, v in inputs.items():
        if isinstance(v, (int, bool, float, str)):
            paddle_inputs[k] = v
        elif k in ("<CPU_TENSOR_NAMES>",):  # <-- tensors the kernel expects on CPU
            paddle_inputs[k] = paddle.to_tensor(v, place=CPU_PLACE)
        elif v is not None:
            paddle_inputs[k] = paddle.to_tensor(v, place=CUDA_PLACE)
        else:
            paddle_inputs[k] = None
    return paddle_inputs


def run_kernel(paddle_inputs, inputs):
    """Call the CUDA kernel with paddle tensors + scalar attrs."""
    kernel_name(
        paddle_inputs["tensor_a"],
        # ... all tensor args ...
        inputs["scalar_attr"],  # scalar attrs from raw dict
    )


def get_outputs(paddle_inputs) -> Dict[str, np.ndarray]:
    """Extract ALL in-place-modified tensors back to numpy."""
    keys = ["tensor_a", "tensor_b", ...]
    return {k: paddle_inputs[k].numpy() for k in keys}


# ============================================================
# Layer 2: Input generation
# ============================================================

def gen_<kernel>_inputs(real_bsz=8, ..., seed=42) -> Dict[str, Any]:
    """Generate randomized test inputs. Returns dict with both numpy arrays and scalar configs."""
    rng = np.random.default_rng(seed)
    # ... generate all numpy arrays with correct dtypes/shapes ...
    return {"tensor_a": ..., "scalar_attr": ..., "real_bsz": real_bsz, ...}


# ============================================================
# Layer 3: Reference implementation (pure Python/NumPy)
# ============================================================

def reference_<kernel>(inputs: Dict[str, Any]) -> Dict[str, Any]:
    """Python reference — must match CUDA kernel logic exactly."""
    # Deep-copy all mutable arrays
    tensor_a = inputs["tensor_a"].copy()
    # ... replicate kernel logic ...
    return {"tensor_a": tensor_a, ...}


# ============================================================
# Layer 4a: TEST_CONFIGS — all pure-parameter test scenarios
# ============================================================

TEST_CONFIGS = [
    # Each config is a dict of gen_<kernel>_inputs kwargs + a "name" key.
    # Pure parameter variations go here — do NOT create separate test methods for them.
    #
    # --- basic coverage ---
    {"name": "small_batch", "real_bsz": 1, "seed": 42, ...},
    {"name": "large_batch", "real_bsz": 64, "seed": 42, ...},
    # --- mode / strategy variants ---
    {"name": "mode_a", "real_bsz": 8, "mode": "a", "seed": 42, ...},
    {"name": "mode_b", "real_bsz": 8, "mode": "b", "seed": 42, ...},
    # --- flags ---
    {"name": "reject_all", "real_bsz": 8, "reject_all": True, "seed": 42, ...},
    {"name": "accept_all", "real_bsz": 8, "accept_all": True, "seed": 42, ...},
    # --- edge cases ---
    {"name": "min_batch", "real_bsz": 1, "max_tokens": 1, "seed": 42, ...},
]


# ============================================================
# Layer 4b: Test suite
# ============================================================

class Test<KernelName>(unittest.TestCase):

    # ------ shared helpers ------

    def _run_and_get(self, inputs):
        paddle_inputs = to_paddle_inputs(inputs)
        run_kernel(paddle_inputs, inputs)
        return get_outputs(paddle_inputs)

    def _check_all_outputs(self, inputs, outputs):
        """Compare ALL output tensors against reference + sanity checks."""
        ref = reference_<kernel>(inputs)
        all_keys = ["tensor_a", "tensor_b", ...]
        for key in all_keys:
            np.testing.assert_array_equal(
                outputs[key], ref[key], err_msg=f"{key} mismatch"
            )
        # Add domain-specific sanity checks here

    def _run_full_test(self, config):
        inputs = gen_<kernel>_inputs(**config)
        outputs = self._run_and_get(inputs)
        self._check_all_outputs(inputs, outputs)
        return outputs

    # ------ test cases ------

    def test_configs(self):
        """Run all TEST_CONFIGS via subTest (one subTest per config)."""
        for cfg in TEST_CONFIGS:
            with self.subTest(name=cfg["name"]):
                test_cfg = {k: v for k, v in cfg.items() if k != "name"}
                self._run_full_test(test_cfg)

    # Only keep separate test methods for scenarios that need tensor overrides:
    def test_special_scenario(self):
        """Scenarios that need manual tensor setup beyond gen_inputs params."""
        inputs = gen_<kernel>_inputs(real_bsz=2, seed=42)
        inputs["some_tensor"][0, 2] = special_value  # override specific tensor
        outputs = self._run_and_get(inputs)
        self._check_all_outputs(inputs, outputs)


if __name__ == "__main__":
    unittest.main()
```

## Key Rules

1. **CPU vs GPU tensors**: Read the CUDA kernel `.cu` file carefully. If a tensor is `copy_to(place, false)` inside the host function, it's a CPU tensor input — must use `CPU_PLACE` in `to_paddle_inputs`.
2. **`_check_all_outputs` checks ALL tensors**: Every in-place-modified output tensor must be compared against reference. Never scatter `assertEqual`/`assertTrue` across individual test methods — all checks go through `_check_all_outputs`.
3. **Stochastic kernels**: If the kernel uses `curand` (e.g., top-p sampling), compare only deterministic positions. Skip the last sampled token in `compare_results`. Note: `curand_states` in reference should be sized to `max_step_tokens` (position count), not `bsz` (batch count).
4. **TEST_CONFIGS for pure-parameter scenarios**: Any test that only differs by `gen_inputs` parameters belongs in `TEST_CONFIGS`, not a separate `test_*` method. Only create separate methods when you need to **override specific tensor values** after generation.
5. **Test cases are thin**: Each `test_*` method should be 3-15 lines. It either calls `_run_full_test(config)` or does `gen → override → _run_and_get → _check_all_outputs`.
6. **No `fastdeploy.__init__`**: Import ops via `import_custom_ops` directly to avoid heavy dependency chain.
7. **Padding slots**: Kernel may have `max_bsz > real_bsz`. Reference impl must handle padding slots the same way as the kernel (typically no-op or stop_count++).
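To make rule 7 concrete, here is a minimal runnable sketch of the Layer 2/3 pattern using a toy in-place op as a stand-in for a real CUDA kernel (all names below — `gen_add_bias_inputs`, `fake_kernel` — are illustrative, not actual FastDeploy ops). The key point is that padding slots beyond `real_bsz` must be a no-op in both the kernel stand-in and the reference:

```python
from typing import Any, Dict

import numpy as np


def gen_add_bias_inputs(real_bsz=4, max_bsz=8, seed=42) -> Dict[str, Any]:
    # Layer 2: randomized inputs; slots beyond real_bsz are padding.
    rng = np.random.default_rng(seed)
    x = np.zeros((max_bsz,), dtype=np.float32)
    x[:real_bsz] = rng.standard_normal(real_bsz).astype(np.float32)
    return {"x": x, "bias": 1.5, "real_bsz": real_bsz}


def fake_kernel(inputs: Dict[str, Any]) -> np.ndarray:
    # Stand-in for the CUDA op: touches only the first real_bsz slots.
    out = inputs["x"].copy()
    out[: inputs["real_bsz"]] += inputs["bias"]
    return out


def reference_add_bias(inputs: Dict[str, Any]) -> np.ndarray:
    # Layer 3: pure-NumPy reference; padding slots are deliberately
    # left untouched, mirroring the kernel's behavior (rule 7).
    x = inputs["x"].copy()
    x[: inputs["real_bsz"]] += inputs["bias"]
    return x


inputs = gen_add_bias_inputs(real_bsz=2, seed=0)
np.testing.assert_array_equal(fake_kernel(inputs), reference_add_bias(inputs))
print("padding slots untouched:", bool(np.all(fake_kernel(inputs)[2:] == 0.0)))  # → True
```

A real test would replace `fake_kernel` with the imported custom op and route the comparison through `_check_all_outputs` as in the template above.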
2 changes: 1 addition & 1 deletion .flake8
@@ -1,5 +1,5 @@
[flake8]
-ignore = E203, E402, E501, E731, E741, W503, W605, E722
+ignore = E203, E402, E501, E731, E741, W503, W605, E722, E231, W604, E702, E226, E221, E713, E271
max-line-length = 119

# E402: module level import not at top of file
30 changes: 30 additions & 0 deletions .github/actions/rerun-workflow/action.yml
@@ -0,0 +1,30 @@
name: 'Rerun Workflow'
description: 'Re-run GitHub Actions workflow for a given Pull Request'
inputs:
  GITHUB_TOKEN:
    description: 'GitHub token with repo scope'
    required: true
  OWNER:
    description: 'Repository owner'
    required: true
  REPO:
    description: 'Repository name'
    required: true
  PR_ID:
    description: 'Pull Request ID'
    required: true
  JOB_NAME:
    description: 'Job name to rerun'
    required: true

runs:
  using: 'composite'
  steps:
    - run: bash ./.github/actions/rerun-workflow/rerun.sh
      shell: bash
      env:
        GITHUB_TOKEN: ${{ inputs.GITHUB_TOKEN }}
        OWNER: ${{ inputs.OWNER }}
        REPO: ${{ inputs.REPO }}
        PR_ID: ${{ inputs.PR_ID }}
        JOB_NAME: ${{ inputs.JOB_NAME }}
77 changes: 77 additions & 0 deletions .github/actions/rerun-workflow/rerun.sh
@@ -0,0 +1,77 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

COMMIT_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
"https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_ID" | jq -r '.head.sha')

echo "Commit SHA: $COMMIT_SHA"

response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
"https://api.github.com/repos/$OWNER/$REPO/actions/runs?head_sha=$COMMIT_SHA&per_page=100")

echo "Response: $response"

run_ids=$(echo "$response" | jq -r '.workflow_runs[].id')

if [ -n "$run_ids" ]; then
echo "Found run_ids for commit $COMMIT_SHA: $run_ids"

for run_id in $run_ids; do
if [ "$JOB_NAME" = "all-failed" ]; then
echo "Rerunning all failed jobs for run_id: $run_id"

rerun_response=$(curl -X POST -s -w "%{http_code}" -o /dev/null \
-H "Accept: application/vnd.github.v3+json" \
-H "Authorization: Bearer $GITHUB_TOKEN" \
"https://api.github.com/repos/$OWNER/$REPO/actions/runs/$run_id/rerun-failed-jobs")
if [ "$rerun_response" -eq 201 ]; then
echo "Successfully requested rerun for all failed jobs in run_id: $run_id"
else
echo "Failed to request rerun for run_id: $run_id with status code $rerun_response"
fi

else
jobs_response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
"https://api.github.com/repos/$OWNER/$REPO/actions/runs/$run_id/jobs")

echo "Jobs Response for run_id $run_id: $jobs_response"

# if [[ "$JOB_NAME" == *"bypass"* ]]; then
block_jobs=$(echo "$jobs_response" | jq -r --arg job_name "$JOB_NAME" \
'.jobs[] | select(.name == $job_name) | .id')
# else
# block_jobs=$(echo "$jobs_response" | jq -r --arg job_name "$JOB_NAME" \
# '.jobs[] | select(.name == $job_name and .conclusion != "success") | .id')
# fi

if [ -n "$block_jobs" ]; then
echo "Found block jobs for run_id $run_id: $block_jobs"

for job_id in $block_jobs; do
echo "Rerunning job_id: $job_id"
curl -X POST -H "Accept: application/vnd.github.v3+json" \
-H "Authorization: token $GITHUB_TOKEN" \
"https://api.github.com/repos/$OWNER/$REPO/actions/jobs/$job_id/rerun"
done
else
echo "No block jobs found for run_id $run_id with name $JOB_NAME."
fi
fi
done
else
echo "No matching workflow runs found for commit $COMMIT_SHA."
exit 1
fi
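
The script's core moves can be exercised offline. The sketch below is illustrative only: the JSON payloads, IDs, and job names are made up, and `curl` is stubbed out; only the jq filter shapes and the `-w "%{http_code}"` status-check pattern mirror the script above.

```shell
# Offline sketch of the script's two jq filters and its status-code check.
# All JSON payloads, IDs, and names here are hypothetical.

# 1) Extract run ids, as in `.workflow_runs[].id`.
runs_response='{"workflow_runs":[{"id":101},{"id":202}]}'
run_ids=$(echo "$runs_response" | jq -r '.workflow_runs[].id')

# 2) Select job ids by name, as in the `select(.name == $job_name)` filter.
jobs_response='{"jobs":[{"id":7,"name":"build"},{"id":8,"name":"test"}]}'
block_jobs=$(echo "$jobs_response" | jq -r --arg job_name "test" \
    '.jobs[] | select(.name == $job_name) | .id')

# 3) Stub curl so the `-w "%{http_code}"` pattern runs without network:
#    with -s and -o /dev/null, real curl prints only the status code.
curl() { echo "201"; }
rerun_response=$(curl -X POST -s -w "%{http_code}" -o /dev/null "https://example.invalid")

echo "run_ids: $run_ids"
echo "block_jobs: $block_jobs"
echo "rerun_response: $rerun_response"
```

Stubbing `curl` as a shell function (functions shadow PATH binaries) is a common way to unit-test scripts like this one without a token or network access.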
54 changes: 54 additions & 0 deletions .github/copilot-instructions.md
@@ -0,0 +1,54 @@
# GitHub Copilot Custom Review Instructions

When reviewing code, focus on:

## Security Critical Issues
- Check for hardcoded secrets, API keys, or credentials
- Look for SQL injection and XSS vulnerabilities
- Verify proper input validation and sanitization
- Review authentication and authorization logic

## Performance Red Flags
- Identify N+1 database query problems
- Spot inefficient loops and algorithmic issues
- Check for memory leaks and resource cleanup
- Review caching opportunities for expensive operations

## Code Quality Essentials
- Functions should be focused and appropriately sized
- Use clear, descriptive naming conventions
- Ensure proper error handling throughout

## Review Style
- Be specific and actionable in feedback
- Explain the "why" behind recommendations
- Acknowledge good patterns when you see them
- Ask clarifying questions when code intent is unclear

Always prioritize security vulnerabilities and performance issues that could impact users.

Always suggest changes that improve readability. For example, the following suggestion makes the code more readable and also makes the validation logic reusable and testable.

```javascript
// Instead of:
if (user.email && user.email.includes('@') && user.email.length > 5) {
  submitButton.enabled = true;
} else {
  submitButton.enabled = false;
}

// Consider:
function isValidEmail(email) {
  return email && email.includes('@') && email.length > 5;
}

submitButton.enabled = isValidEmail(user.email);
```

## Description for pull request

- Please check the title of the Pull Request. It needs to follow the `[CLASS] Title` format, for example, `[BugFix] Fix memory leak of data processor`. If the title is incorrect, suggest how the committer should modify it.
- Please check the description of the Pull Request. At a minimum, it should explain why the modifications are being made and what problem they solve. If the committer has not provided this information, or it is incomplete, prompt the committer to revise it.
- For every Pull Request, confirm whether documentation needs to be added, updated, or deleted, and remind the committer to handle it accordingly.

## Others
- For all submitted PRs, write your review comments in Chinese; note, however, that comments inside the code must still be in English.
- When you submit a Pull Request, make sure the code style satisfies this repo's settings; before committing, run `pip install pre-commit==4.2.0` and then `pre-commit install`.
34 changes: 34 additions & 0 deletions .github/pull_request_template.md
@@ -0,0 +1,34 @@
<!-- TemplateReference: https://github.com/PaddlePaddle/FastDeploy/blob/develop/.github/pull_request_template.md -->

<!-- Thank you for your contribution! Please follow these guidelines to enhance your pull request. If anything is unclear, submit your PR and reach out to maintainers for assistance. -->

## Motivation

<!-- Describe the purpose and goals of this pull request. -->

> :bulb: If this PR is a Cherry-Pick, the PR title needs to follow the format by adding the [Cherry-Pick] label at the very beginning and appending the original PR ID at the end, for example, [Cherry-Pick][CI] Add check trigger and logic(#5191)

## Modifications

<!-- Detail the changes made in this pull request. -->

## Usage or Command

<!-- Provide usage instructions if this PR introduces a new feature. -->
<!-- Provide the command to run if this PR is a performance optimization or a bug fix. -->

## Accuracy Tests

<!-- If this pull request affects model outputs (e.g., changes to the kernel or model forward code), provide accuracy test results. -->

## Checklist

- [ ] Add at least one tag in the PR title.
- Tag list: [`[FDConfig]`,`[APIServer]`,`[Engine]`, `[Scheduler]`, `[PD Disaggregation]`, `[Executor]`, `[Graph Optimization]`, `[Speculative Decoding]`, `[RL]`, `[Models]`, `[Quantization]`, `[Loader]`, `[OP]`, `[KVCache]`, `[DataProcessor]`, `[BugFix]`, `[Docs]`, `[CI]`, `[Optimization]`, `[Feature]`, `[Benchmark]`, `[Others]`, `[XPU]`, `[HPU]`, `[GCU]`, `[DCU]`, `[Iluvatar]`, `[Metax]`]
- You can add new tags based on the PR content, but the semantics must be clear.
- [ ] Format your code by running `pre-commit` before committing.
- [ ] Add unit tests. Please write the reason in this PR if no unit tests.
- [ ] Provide accuracy results.
- [ ] If the current PR is submitting to the `release` branch, make sure the PR has been submitted to the `develop` branch, then cherry-pick it to the `release` branch with the `[Cherry-Pick]` PR tag.