RuyiAI-Stack · yuzibo · Apr 19, 2026 · Apr 22, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
@@ -0,0 +1,167 @@
+# Note: this runner is provided externally, so we minimize its access to
+# secrets.
+
+name: CI (riscv64)
+
+on:
+  push:
+    branches: [riscv]
+
+  pull_request_target:  # runs in the context of the base repository, including for PRs from forks
+    types: [opened, synchronize, reopened]
+
+permissions:
+  contents: read
+  # GITHUB_TOKEN scope only (read-only contents); this key does not control access to secrets.
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}  # head_ref is empty on push, so run_id is used
+  cancel-in-progress: true
+
+jobs:
+  core-ci:
+    name: pytorch-riscv64-core-ci
+    runs-on: [self-hosted, linux, amd64]  # NOTE(review): amd64 runner for riscv64 CI; presumably the Jenkins job does the riscv64 work - confirm
+
+    outputs:  # full-ci reads ci_result_base_url; the other outputs are not read elsewhere in this file
+      base_commit: ${{ steps.meta.outputs.base_commit }}
+      head_commit: ${{ steps.meta.outputs.head_commit }}
+      patch_file: ${{ steps.patch.outputs.patch_file }}
+      ci_result_base_url: ${{ steps.jenkins.outputs.ci_result_base_url }}
+      ci_stat_url: ${{ steps.jenkins.outputs.ci_stat_url }}
+
+    # The job is bound to the dedicated `riscv64` deployment environment.
+    environment: riscv64
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 3000   # shallow clone limited to the most recent 3000 commits
+          ref: ${{ github.sha }} # commit that triggered the run (for pull_request_target: the base branch head)
+
+      - name: Extract PR info  # exports the PR base/head SHAs; both are empty on push events
+        run: |
+          echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_ENV"
+          echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_ENV"
+
+      - name: Diff base and head
+        id: meta
+        # Untrusted event fields (branch names come from the PR author) are
+        # passed via env, never interpolated into the script text, so a crafted
+        # branch name cannot inject shell commands on the self-hosted runner.
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          BASE_REF: ${{ github.base_ref }}
+          HEAD_REF: ${{ github.head_ref }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          PUSH_SHA: ${{ github.sha }}
+        run: |
+          if [[ "$EVENT_NAME" == "pull_request" || "$EVENT_NAME" == "pull_request_target" ]]; then
+            echo "Push PR build"
+            echo "Base ref: $BASE_REF"
+            echo "Head ref: $HEAD_REF"
+            # PRs must target the riscv branch.
+            if [ "$BASE_REF" != "riscv" ]; then
+              echo "ERROR: PR must target 'riscv' branch, got '$BASE_REF'"
+              exit 1
+            fi
+            # Fetch the PR head into a local ref so its contents can be diffed.
+            git fetch --quiet origin "pull/${PR_NUMBER}/head:pr-head"
+            git fetch --quiet origin main
+            BASE=$(git merge-base pr-head origin/main)
+            HEAD=$(git rev-parse pr-head)
+          else
+            echo "Push to riscv"
+            # The baseline is the merge-base of the pushed commit with origin/main.
+            git fetch --quiet origin main
+            BASE=$(git merge-base "$PUSH_SHA" origin/main)
+            HEAD="$PUSH_SHA"
+          fi
+
+          echo "BASE_COMMIT=$BASE" >> "$GITHUB_ENV"
+          echo "HEAD_COMMIT=$HEAD" >> "$GITHUB_ENV"
+          echo "base_commit=$BASE" >> "$GITHUB_OUTPUT"
+          echo "head_commit=$HEAD" >> "$GITHUB_OUTPUT"
+
+          echo "Base: $BASE"
+          echo "Head: $HEAD"
+
+      - name: Generate patch
+        id: patch
+        # Writes the BASE_COMMIT..HEAD_COMMIT diff to patch_<short sha>.patch and
+        # copies it to /home/jenkins/patch/ on the runner host.
+        run: |
+          set -euo pipefail  # with the quoting below, a missing commit fails instead of diffing the worktree
+          echo "Generating patch..."
+          PATCH_NAME="patch_${HEAD_COMMIT:0:7}.patch"
+
+          git diff "$BASE_COMMIT" "$HEAD_COMMIT" > "$PATCH_NAME"
+          echo "Patch size:"
+          wc -l "$PATCH_NAME"
+
+          cp "$PATCH_NAME" /home/jenkins/patch/
+          cat "/home/jenkins/patch/$PATCH_NAME"
+
+          echo "PATCH_FILE=$PATCH_NAME" >> "$GITHUB_ENV"
+          echo "patch_file=$PATCH_NAME" >> "$GITHUB_OUTPUT"
+
+      - name: Trigger Jenkins Job
+        id: jenkins
+        # Runs the host-side Jenkins wrapper script and publishes the result
+        # URLs found in its output as step outputs.
+        env:
+          BASE: ${{ steps.meta.outputs.base_commit }}
+          PATCH: ${{ steps.patch.outputs.patch_file }}
+        run: |
+          set -euo pipefail
+          bash /home/jenkins/scripts/jenkins-run.sh "$BASE" "$PATCH" | tee jenkins.log
+
+          # grep exits 1 on no match; with pipefail + errexit that would abort
+          # here, so "|| true" lets the explicit error below report it instead.
+          CI_STAT_URL=$(grep -oE 'https://[^ ]+/pytorch-ci-stat\.json' jenkins.log | tail -n1 || true)
+          if [[ -z "$CI_STAT_URL" ]]; then
+            echo "ERROR: cannot find pytorch-ci-stat.json URL from Jenkins log"
+            exit 1
+          fi
+          CI_RESULT_BASE_URL="${CI_STAT_URL%/pytorch-ci-stat.json}"
+
+          echo "ci_stat_url=$CI_STAT_URL" >> "$GITHUB_OUTPUT"
+          echo "ci_result_base_url=$CI_RESULT_BASE_URL" >> "$GITHUB_OUTPUT"
+          echo "CI_STAT_URL=$CI_STAT_URL"
+          echo "CI_RESULT_BASE_URL=$CI_RESULT_BASE_URL"
+
+  full-ci:
+    name: pytorch-riscv64-full-ci
+    runs-on: [self-hosted, linux, amd64]
+    needs: core-ci
+    # Runs even when core-ci failed; a failure of this job does not fail the run.
+    if: always()
+    continue-on-error: true
+
+    steps:
+      - name: Query existing full test result
+        shell: bash
+        env:
+          BASE_URL: ${{ needs.core-ci.outputs.ci_result_base_url }}
+        run: |
+          set -euo pipefail
+          # The output is empty when core-ci failed before publishing it.
+          : "${BASE_URL:?core-ci did not publish ci_result_base_url}"
+          STAT_URL="${BASE_URL}/pytorch-ci-stat.json"
+          echo "STAT_URL=$STAT_URL"
+          curl -fsSL "$STAT_URL" -o pytorch-ci-stat.json
+
+          echo "==== FULL TEST STAT ===="
+          cat pytorch-ci-stat.json
+          echo
+
+          FAILED=$(jq '.failed | length' pytorch-ci-stat.json)
+          if [[ "$FAILED" != "0" ]]; then
+            echo "==== FULL TEST FAILED ===="
+            echo "failed cases: $FAILED"
+            exit 1
+          fi
+
+          echo "==== FULL TEST PASSED ===="
+          echo "full test no failures"
+
diff --git a/.gitmodules b/.gitmodules
@@ -41,7 +41,7 @@
 [submodule "third_party/cpuinfo"]
     ignore = dirty
     path = third_party/cpuinfo
-    url = https://github.com/pytorch/cpuinfo.git
+    url = https://github.com/RuyiAI-Stack/cpuinfo.git
 [submodule "third_party/python-peachpy"]
     ignore = dirty
     path = third_party/python-peachpy

diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py
@@ -1583,6 +1583,7 @@ def forward(self, x):
         vec_amx = VecAMX()
         self._check_amx_counter(vec_amx)
 
+    @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled")
     @inductor_config.patch({"freezing": True})
     @patches
     @torch.no_grad
@@ -1700,6 +1701,7 @@ def forward(self, x, scale):
             vec_amx = VecAMX()
             self._check_amx_counter(vec_amx)
 
+    @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled")
     @inductor_config.patch({"freezing": True, "cpp.enable_concat_linear": True})
     @patches
     @torch.no_grad

diff --git a/test/run_test.py b/test/run_test.py
@@ -110,6 +110,7 @@ def upload_adhoc_failure_json(*args, **kwargs):
 INDUCTOR_TEST_PREFIX = "inductor"
 IS_SLOW = "slow" in TEST_CONFIG or "slow" in BUILD_ENVIRONMENT
 IS_S390X = platform.machine() == "s390x"
+IS_RISCV64 = platform.machine() == "riscv64"
 
 
 # Note [ROCm parallel CI testing]
@@ -285,6 +286,52 @@ def __contains__(self, item):
     "test_xpu",
 ]
 
+RISCV64_BLOCKLIST = [  # test modules excluded from the selected tests when IS_RISCV64 is true
+    # distributed-related tests are disabled
+    "inductor/test_distributed_patterns",
+    "fx/test_dce_pass",
+    "export/test_cpp_serdes",
+    "export/test_export",
+    "export/test_export_strict",
+    "export/test_export_training_ir_to_run_decomp",
+    "export/test_retraceability",
+    "export/test_serdes",
+    "export/test_strict_export_v2",
+    "test_public_bindings",
+    "ao/sparsity/test_composability",
+    # QNNPACK is not supported
+    "export/test_converter",
+    # record_context_cpp is not supported on non-Linux or non-x86_64 platforms
+    "torch_np/numpy_tests/core/test_numeric",
+    # Failed to import torch.distributed.run: cannot import name 'Store' from 'torch.distributed'
+    "test_testing",
+    "inductor/test_aot_inductor_arrayref",
+    "inductor/test_cpu_repro",
+    # TODO: mkldnn not available, shape guard failures on RISC-V
+    "inductor/test_cpu_select_algorithm",
+    # TODO: scalar values are not equal; needs a fix
+    "profiler/test_profiler",
+    # TODO: precision
+    "test_binary_ufuncs",
+    "test_decomp",
+    # TODO: no CUDA-related module
+    "quantization/core/test_workflow_module",  # TestFakeQuantize.test_fq_module_per_channel
+    "quantization/core/test_workflow_ops",
+    "quantization/core/test_quantized_op",
+    # z3-solver fails to build
+    "test_proxy_tensor",
+    # too slow on riscv64
+    # 53013.55 s
+    "functorch/test_aotdispatch",
+    # 25069 s
+    "functorch/test_ops",
+    # 17528 s
+    "test_transformers",
+    # 10897 s
+    "functorch/test_vmap",
+]
+
+
 # The tests inside these files should never be run in parallel with each other
 RUN_PARALLEL_BLOCKLIST = [
     "test_extension_utils",
@@ -1875,6 +1922,13 @@ def get_selected_tests(options) -> list[str]:
             selected_tests,
             "Skip distributed tests on s390x",
         )
+    elif IS_RISCV64:
+        selected_tests = exclude_tests(RISCV64_BLOCKLIST, selected_tests, "on riscv64")
+        selected_tests = exclude_tests(
+            DISTRIBUTED_TESTS,
+            selected_tests,
+            "Skip distributed tests on riscv64",
+        )
 
     # skip all distributed tests if distributed package is not available.
     if not dist.is_available():

diff --git a/test/test_linalg.py b/test/test_linalg.py
@@ -9002,6 +9002,7 @@ def test_matrix_exp_backward_input_validation(self, device, dtype):
         with self.assertRaisesRegex(RuntimeError, "must be batches of square matrices"):
             torch.ops.aten.matrix_exp_backward(non_square, grad_non_square)
 
+    @slowTest
     @skipCUDAIfNoMagmaAndNoLinalgsolver
     @skipCPUIfNoLapack
     @dtypes(torch.float, torch.double, torch.complex64, torch.complex128)

diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
@@ -32,6 +32,7 @@
     IS_SANDCASTLE,
     IS_S390X,
     IS_ARM64,
+    IS_RISCV64,
     parametrize,
     TEST_WITH_TORCHDYNAMO,
     xfailIfTorchDynamo,
@@ -1109,6 +1110,13 @@ def test_float_to_int_conversion_nonfinite(self, device, dtype):
 
         if dtype == torch.bool:
             refs = (True, True, True)
+        elif IS_RISCV64:
+            if dtype in (torch.int32, torch.int64):
+                refs = (torch.iinfo(dtype).min, torch.iinfo(dtype).max, torch.iinfo(dtype).max)
+            elif dtype == torch.uint8:
+                refs = (0, torch.iinfo(dtype).max, torch.iinfo(dtype).max)
+            elif dtype in (torch.int8, torch.int16):
+                refs = (0, -1, -1)
         elif IS_ARM64:
             refs = (torch.iinfo(dtype).min, torch.iinfo(dtype).max, 0)
             if dtype in (torch.int8, torch.int16):

diff --git a/test/test_torch.py b/test/test_torch.py
@@ -43,7 +43,7 @@
     wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard,
     bytes_to_scalar, parametrize, noncontiguous_like,
     AlwaysWarnTypedStorageRemoval, TEST_WITH_TORCHDYNAMO, xfailIfTorchDynamo,
-    xfailIfS390X, set_warn_always_context, decorateIf, isRocmArchAnyOf)
+    xfailIfS390X, xfailIfRISCV, set_warn_always_context, decorateIf, isRocmArchAnyOf)
 from multiprocessing.reduction import ForkingPickler
 from torch.testing._internal.common_device_type import (
     expectedFailureMeta,
@@ -9594,14 +9594,21 @@ def test_type(self):
 
     # FIXME: port to a quantization test suite
     @xfailIfS390X
+    @xfailIfRISCV
     def test_qengine(self):
         qengines = torch.backends.quantized.supported_engines
+        if not qengines:
+            self.skipTest("No quantized engines supported on this platform")
         original_qe = torch.backends.quantized.engine
         for qe in qengines:
             torch.backends.quantized.engine = qe
             if torch.backends.quantized.engine != qe:
                 raise AssertionError(f"qengine not set successfully: expected {qe}, got {torch.backends.quantized.engine}")
-        torch.backends.quantized.engine = original_qe
+        # On platforms where no qengine is compiled in as the default (e.g. RISC-V),
+        # the initial engine reads as "none" (NoQEngine), which is not a valid value
+        # to pass back to _set_qengine. Only restore if it was a real engine.
+        if original_qe != "none":
+            torch.backends.quantized.engine = original_qe
 
     def test_terminate_handler_on_crash(self):
         cmd = [sys.executable, '-c', "import os; os.environ[\"TORCH_CUSTOM_TERMINATE\"] ='1'; \

diff --git a/third_party/kineto b/third_party/kineto
+9 −0		.github/scripts/config_cpu.sh
+15 −0		.github/scripts/config_cuda.sh
+12 −0		.github/scripts/config_rocm.sh
+9 −0		.github/scripts/config_xpu.sh
+3 −7		.github/scripts/setup.sh
+0 −3		.github/workflows/linux_cpu_kineto.yml
+0 −3		.github/workflows/linux_cpu_pytorch.yml
+0 −3		.github/workflows/linux_cuda_kineto.yml
+0 −3		.github/workflows/linux_cuda_pytorch.yml
+0 −3		.github/workflows/mac_cpu.yml
+0 −3		.gitmodules
+1 −1		benchmarks/CMakeLists.txt
+41 −13		libkineto/CMakeLists.txt
+8 −3		libkineto/include/AbstractConfig.h
+22 −2		libkineto/include/Config.h
+4 −0		libkineto/include/IActivityProfiler.h
+4 −1		libkineto/src/AbstractConfig.cpp
+1 −1		libkineto/src/ApproximateClock.cpp
+36 −1		libkineto/src/Config.cpp
+86 −2		libkineto/src/ConfigLoader.cpp
+6 −0		libkineto/src/ConfigLoader.h
+1 −1		libkineto/src/CuptiActivity.h
+16 −18		libkineto/src/CuptiActivityApi.cpp
+5 −4		libkineto/src/CuptiActivityProfiler.cpp
+2 −2		libkineto/src/CuptiCallbackApi.cpp
+39 −10		libkineto/src/CuptiCbidRegistry.cpp
+12 −0		libkineto/src/CuptiCbidRegistry.h
+3 −3		libkineto/src/CuptiRangeProfiler.cpp
+4 −0		libkineto/src/CuptiRangeProfilerConfig.h
+12 −12		libkineto/src/GenericActivityProfiler.cpp
+9 −13		libkineto/src/GenericActivityProfiler.h
+3 −15		libkineto/src/Logger.h
+0 −2		libkineto/src/RocmActivityProfiler.h
+0 −131		libkineto/src/RocmStreamQueue.h
+0 −14		libkineto/src/RocprofActivityApi.cpp
+4 −4		libkineto/src/RocprofLogger.cpp
+0 −24		libkineto/src/RoctracerActivityApi.cpp
+5 −4		libkineto/src/RoctracerLogger.cpp
+453 −33		libkineto/src/cupti_strings.cpp
+1 −4		libkineto/src/cupti_strings.h
+1 −1		libkineto/src/output_csv.cpp
+5 −5		libkineto/src/output_json.cpp
+12 −9		libkineto/src/plugin/xpupti/XpuptiActivityProfilerSession.cpp
+4 −0		libkineto/src/plugin/xpupti/XpuptiScopeProfilerConfig.h
+6 −0		libkineto/test/ApproximateClockTest.cpp
+1 −1		libkineto/test/CMakeLists.txt
+52 −0		libkineto/test/ConfigTest.cpp
+1 −1		libkineto/test/CuptiRangeProfilerConfigTest.cpp
+15 −49		libkineto/test/CuptiStringsTest.cpp
+0 −123		libkineto/test/LoggerObserverTest.cpp
+11 −189		libkineto/test/RocmActivityProfilerTest.cpp
+1 −1		libkineto/test/xpupti/CMakeLists.txt
+1 −1		libkineto/test/xpupti/XpuptiScopeProfilerConfigTest.cpp
+1 −1		libkineto/test/xpupti/compute/CMakeLists.txt
+0 −1		libkineto/third_party/json