From 260dd5a2fe40db9c795ce331059a1af9643a16ba Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Fri, 17 Apr 2026 15:18:01 +0800
Subject: [PATCH 01/25] CI: refactor workflows, add multi-platform support, fix
 QA scripts (post-Apr-02)

---
 .github/configs/cuda.yml                      |  23 +-
 .github/configs/metax.yml                     |  47 ++--
 .github/scripts/setup_cuda.sh                 |  25 +++
 .github/scripts/setup_metax.sh                |  46 ++++
 .github/workflows/all_tests_common.yml        | 131 +++++++----
 .github/workflows/all_tests_cuda.yml          |   2 +
 .github/workflows/all_tests_metax.yml         |   9 +-
 .github/workflows/blossom-ci.yml              |  86 --------
 .github/workflows/deploy_nightly_docs.yml     |  39 ----
 .github/workflows/functional_tests_common.yml | 190 ----------------
 .../workflows/integration_tests_common.yml    | 179 +++++++++++++++
 .github/workflows/lint.yml                    |  63 ------
 .github/workflows/lint_common.yml             |  47 ++++
 .github/workflows/qa-format.yml               |  32 ---
 .../qa-l0-te-cpp-unittest-pytorch-lint.yml    |  17 +-
 .../workflows/qa-l1-te-cpp-pytorch-tests.yml  |  22 +-
 .github/workflows/trigger-ci.yml              | 102 ---------
 .github/workflows/unit_tests_common.yml       | 204 ++----------------
 .github/workflows/upload-ci-logs.yml          |  52 -----
 3rdparty/cudnn-frontend                       |   2 +-
 3rdparty/googletest                           |   2 +-
 qa/L0_pytorch_debug_unittest/test.sh          |   9 +-
 qa/L0_pytorch_unittest/test.sh                |   2 -
 qa/L1_pytorch_distributed_unittest/test.sh    | 125 ++++++++++-
 qa/L1_pytorch_mcore_integration/test.sh       | 149 ++++++++++---
 qa/L1_pytorch_mcore_integration/test_bak.sh   |  79 +++++++
 transformer_engine/common/__init__.py         |  34 ++-
 .../plugin/core/backends/vendor/cuda/cuda.py  |  41 +++-
 28 files changed, 860 insertions(+), 899 deletions(-)
 create mode 100755 .github/scripts/setup_cuda.sh
 create mode 100755 .github/scripts/setup_metax.sh
 delete mode 100644 .github/workflows/blossom-ci.yml
 delete mode 100644 .github/workflows/deploy_nightly_docs.yml
 delete mode 100644 .github/workflows/functional_tests_common.yml
 create mode 100644 .github/workflows/integration_tests_common.yml
 delete mode 100644 .github/workflows/lint.yml
 create mode 100644 .github/workflows/lint_common.yml
 delete mode 100644 .github/workflows/qa-format.yml
 delete mode 100644 .github/workflows/trigger-ci.yml
 delete mode 100644 .github/workflows/upload-ci-logs.yml
 create mode 100644 qa/L1_pytorch_mcore_integration/test_bak.sh

diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index 6975fab589..fcfc2f1be8 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -5,22 +5,24 @@
 hardware_name: cuda
 display_name: 'NVIDIA CUDA (A100)'
 
+# CI image for BAAI env
 ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
 
 # Runner labels for self-hosted A100 node
 # runner_labels:
-# - self-hosted
-# - Linux
-# - X64
-# - nvidia
-# - gpu-8
+#   - self-hosted
+#   - Linux
+#   - X64
+#   - nvidia
+#   - gpu-8
+
+# Runner labels for BAAI env
 runner_labels:
   - nv-8g-cicd-te
 
 # Container volumes
 container_volumes:
   - /home/flagscale_cicd/flask/static:/workspace/report
-  # - /home/flagscale_cicd/data:/opt/data
 
 # Container options
 container_options: >-
@@ -32,9 +34,8 @@ container_options: >-
   --ulimit stack=67108864 
   --user root
 
-# Device types
-device_types:
-  - a100
+# Platform-specific environment setup script
+setup_script: .github/scripts/setup_cuda.sh
 
 # Build environment variables (platform-specific)
 build_env:
@@ -47,6 +48,10 @@ build_env:
   CUDA_HOME: /usr/local/cuda-12.8
   NVCC: /usr/local/cuda-12.8/bin/nvcc
 
+# Device types to run tests on
+device_types:
+  - a100
+
 # Test matrix configuration
 test_matrix:
   l0_pytorch:
diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
index e3b10c892d..07ae49925f 100644
--- a/.github/configs/metax.yml
+++ b/.github/configs/metax.yml
@@ -1,28 +1,33 @@
 # Metax Hardware Configuration for TE-FL
 # This file defines CI/CD settings for Metax-based testing
-# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml
+# This file defines environment variables, volumes, and test filters for TE tests.
 
 hardware_name: metax
 display_name: 'Metax Tests'
 
-ci_image: localhost:5000/megatron-lm-with-te:v1
-# ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
+# CI image for Metax dev env
+# ci_image: localhost:5000/megatron-lm-with-te:v1
 
-runner_labels:
-  - self-hosted
-  - Linux
-  - X64
-  - metax
-  - dev
+# CI image for BAAI env
+ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
+
+# Runner labels for self-hosted Metax node
 # runner_labels:
-#   - mx-4g-cicd-te
+#   - self-hosted
+#   - Linux
+#   - X64
+#   - metax
+#   - dev
+
+# Runner labels for BAAI env
+runner_labels:
+  - mx-4g-cicd-te
 
+# Container volumes
 container_volumes:
   - /nfs/metax_fs:/nfs/metax_fs
-  - /dev/dri:/dev/dri
-  - /dev/mxcd:/dev/mxcd
-  - /dev/infiniband:/dev/infiniband
 
+# Container options
 container_options: >-
   --uts=host
   --ipc=host
@@ -30,17 +35,16 @@ container_options: >-
   --group-add video
   --shm-size=100gb
   --ulimit memlock=-1
-  --security-opt seccomp=unconfined
-  --security-opt apparmor=unconfined
-  --device=/dev/dri
-  --device=/dev/mxcd
-  --device=/dev/infiniband
   --user root
   --ulimit nofile=65535:65535
   -e PLATFORM=metax
   -e TORCH_DISTRIBUTED_BACKEND=mccl
   -e LD_LIBRARY_PATH=/opt/maca/lib:/usr/local/lib:$LD_LIBRARY_PATH
 
+# Platform-specific environment setup script
+setup_script: .github/scripts/setup_metax.sh
+
+# Build environment variables (platform-specific)
 build_env:
   TE_FL_SKIP_CUDA: '1'
   NVTE_WITH_MACA: '1'
@@ -62,10 +66,3 @@ test_matrix:
       # example: tests/unit_tests/test_example.py
       # - tests/unit_tests/test_inference.py
       # - tests/unit_tests/test_rl_utils.py
-
-  # functional:
-  #   train:
-  #     - device: c500
-  #       task: train
-  #       model: deepseek
-  #       case: tp2_pp2_ep2
diff --git a/.github/scripts/setup_cuda.sh b/.github/scripts/setup_cuda.sh
new file mode 100755
index 0000000000..f9e289c6d0
--- /dev/null
+++ b/.github/scripts/setup_cuda.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# CUDA Platform Environment Setup Script
+# Called by unit_tests_common.yml for CUDA platforms (A100, H100, etc.)
+set -euo pipefail
+
+echo "===== Step 0: Activate Python environment ====="
+source /opt/miniconda3/etc/profile.d/conda.sh
+conda activate flagscale-train
+echo "PATH=$PATH" >> $GITHUB_ENV
+echo "Python: $(which python3) ($(python3 --version 2>&1))"
+
+echo "===== Step 1: Remove Existing TransformerEngine ====="
+pip uninstall transformer_engine transformer_engine_torch -y || true
+
+echo "===== Step 2: Build & Install TransformerEngine ====="
+cd $GITHUB_WORKSPACE
+
+pip install nvdlfw-inspect --quiet
+pip install expecttest --quiet
+pip install . -v --no-deps --no-build-isolation
+
+echo "===== Step 3: Verify Installation ====="
+python3 tests/pytorch/test_sanity_import.py
+
+echo "===== Environment Setup Complete ====="
diff --git a/.github/scripts/setup_metax.sh b/.github/scripts/setup_metax.sh
new file mode 100755
index 0000000000..b05e0190b1
--- /dev/null
+++ b/.github/scripts/setup_metax.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Metax Platform Environment Setup Script
+# Called by unit_tests_common.yml for Metax platforms (C500, etc.)
+set -euo pipefail
+
+echo "===== Step 0: Activate Python environment ====="
+source /opt/conda/etc/profile.d/conda.sh
+conda activate base
+echo "PATH=$PATH" >> $GITHUB_ENV
+echo "Python: $(which python3) ($(python3 --version 2>&1))"
+
+echo "===== Step 1: Base Environment Setup ====="
+# Configure MACA toolchain paths
+export PATH=/opt/maca/bin:$PATH
+export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH
+service ssh restart
+
+echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
+# TransformerEngine expects nvcc, but MACA provides cucc
+ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
+which nvcc || true
+
+echo "===== Step 3: Install Required System Tools ====="
+# Install essential build tools (avoid modifying Python dependencies)
+apt-get update -qq && apt-get install -y -qq git cmake ninja-build curl
+
+echo "===== Step 4: Remove Existing TransformerEngine ====="
+# Prevent conflicts with preinstalled or incompatible versions
+python3 -m pip uninstall transformer_engine -y || true
+python3 -m pip install nvdlfw-inspect --no-deps || true
+
+echo "===== Step 5: Install TE-FL Plugin Layer ====="
+# Install TransformerEngine-FL Python layer (plugin logic)
+cd $GITHUB_WORKSPACE
+TE_FL_SKIP_CUDA=1 python3 setup.py install
+
+echo "===== Step 6: Final Verification ====="
+# Verify both TE Python API and backend are functional
+python3 - <<'EOF'
+import transformer_engine
+import transformer_engine_torch as te
+print("transformer_engine:", transformer_engine)
+print("transformer_engine_torch:", te)
+EOF
+
+echo "===== Environment Setup Complete ====="
diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index 2165de9b49..e1b60c4cd9 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -7,13 +7,20 @@ on:
         required: true
         type: string
         description: Platform name (e.g., cuda, default)
-      setup_commands:
+      run_unit_tests:
         required: false
-        type: string
-        default: ''
+        type: boolean
+        default: true
+        description: Whether to run unit tests in this workflow
+      run_integration_tests:
+        required: false
+        type: boolean
+        default: true
+        description: Whether to run integration tests in this workflow
 
 jobs:
   checkout_and_config:
+    name: checkout_and_config
     defaults:
       run:
         shell: bash
@@ -24,19 +31,12 @@ jobs:
       container_volumes: ${{ steps.config.outputs.container_volumes }}
       container_options: ${{ steps.config.outputs.container_options }}
       device_types: ${{ steps.config.outputs.device_types }}
-      train_test_matrix: ${{ steps.config.outputs.train_test_matrix }}
-      ignored_tests: ${{ steps.config.outputs.ignored_tests }}
+      setup_script: ${{ steps.config.outputs.setup_script }}
       build_env: ${{ steps.config.outputs.build_env }}
     steps:
       - name: Checkout source code
         uses: actions/checkout@v4
 
-      - name: Check if tests should run
-        id: should_run
-        run: |
-         
-          echo "should_run=true" >> $GITHUB_OUTPUT
-
       - name: Load platform configuration
         id: config
         run: |
@@ -71,26 +71,29 @@ jobs:
           DEVICE_TYPES=$(yq '.device_types | tojson(0)' "$CONFIG_FILE")
           echo "device_types=$DEVICE_TYPES" >> $GITHUB_OUTPUT
 
-          # Read test matrix for training
-          TRAIN_MATRIX=$(yq '.test_matrix.functional.train | tojson(0)' "$CONFIG_FILE")
-          echo "train_test_matrix=$TRAIN_MATRIX" >> $GITHUB_OUTPUT
-
-          # Read ignored tests list from test_matrix.unit (default to empty array if not defined)
-          IGNORED_TESTS=$(yq '.test_matrix.unit.ignored_tests // [] | tojson(0)' "$CONFIG_FILE")
-          echo "ignored_tests=$IGNORED_TESTS" >> $GITHUB_OUTPUT
+          # Read setup script path
+          SETUP_SCRIPT=$(yq '.setup_script // ""' "$CONFIG_FILE")
+          echo "setup_script=$SETUP_SCRIPT" >> $GITHUB_OUTPUT
 
           # Read build environment variables (default to empty object if not defined)
           BUILD_ENV=$(yq '.build_env // {} | tojson(0)' "$CONFIG_FILE")
           echo "build_env=$BUILD_ENV" >> $GITHUB_OUTPUT
 
+  lint:
+    name: lint
+    uses: ./.github/workflows/lint_common.yml
+
   unit_tests:
-    needs: checkout_and_config
+    name: unit_tests
+    if: inputs.run_unit_tests
+    needs: 
+      - checkout_and_config
+      - lint
     strategy:
       fail-fast: false
       matrix:
         device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
     uses: ./.github/workflows/unit_tests_common.yml
-    name: unit_tests
     with:
       platform: ${{ inputs.platform }}
       device: ${{ matrix.device }}
@@ -98,24 +101,57 @@ jobs:
       runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
       container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
       container_options: ${{ needs.checkout_and_config.outputs.container_options }}
-      setup_commands: ${{ inputs.setup_commands }}
-      ignored_tests: ${{ needs.checkout_and_config.outputs.ignored_tests }}
+      setup_script: ${{ needs.checkout_and_config.outputs.setup_script }}
       build_env: ${{ needs.checkout_and_config.outputs.build_env }}
 
-  # arguments.py not compatible with megatron-core-fl
-  # functional_tests:
-  #   needs:
-  #     - checkout_and_config
-  #   if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null
-  #   uses: ./.github/workflows/functional_tests_common.yml
-  #   with:
-  #     platform: ${{ inputs.platform }}
-  #     test_matrix: ${{ needs.checkout_and_config.outputs.train_test_matrix }}
-  #     image: ${{ needs.checkout_and_config.outputs.ci_image }}
-  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
-  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
-  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+  unit_tests_complete:
+    name: unit_tests_complete
+    needs: 
+      - unit_tests
+    runs-on: ubuntu-latest
+    if: always() && inputs.run_unit_tests
+    steps:
+      - name: Check unit tests result
+        run: |
+          if [ "${{ needs.unit_tests.result }}" != "success" ] && \
+             [ "${{ needs.unit_tests.result }}" != "skipped" ]; then
+            echo "❌ Unit tests failed: ${{ needs.unit_tests.result }}"
+            exit 1
+          fi
+          echo "✅ Unit tests passed"
 
+  integration_tests:
+    name: integration_tests
+    if: inputs.run_integration_tests
+    needs:
+      - checkout_and_config
+      - lint
+    uses: ./.github/workflows/integration_tests_common.yml
+    with:
+      platform: ${{ inputs.platform }}
+      device: ${{ matrix.device }}
+      image: ${{ needs.checkout_and_config.outputs.ci_image }}
+      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      setup_script: ${{ needs.checkout_and_config.outputs.setup_script }}
+      build_env: ${{ needs.checkout_and_config.outputs.build_env }}
+
+  integration_tests_complete:
+    name: integration_tests_complete
+    if: always() && inputs.run_integration_tests
+    needs: 
+      - integration_tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check integration tests result
+        run: |
+          if [ "${{ needs.integration_tests.result }}" != "success" ] && \
+             [ "${{ needs.integration_tests.result }}" != "skipped" ]; then
+            echo "❌ Integration tests failed: ${{ needs.integration_tests.result }}"
+            exit 1
+          fi
+          echo "✅ Integration tests passed"
 
   all_tests_complete:
     defaults:
@@ -123,8 +159,9 @@ jobs:
         shell: bash
     needs:
       - checkout_and_config
-      - unit_tests
-      # - functional_tests
+      - lint
+      - unit_tests_complete
+      - integration_tests_complete
     runs-on: ubuntu-latest
     if: always()
     steps:
@@ -133,15 +170,23 @@ jobs:
           # Check all test jobs (skip if not run)
           failed=false
 
-          if [ "${{ needs.unit_tests.result }}" != "success" ]; then
-            echo "❌ Unit tests failed"
+          if [ "${{ needs.lint.result }}" != "success" ] && \
+            [ "${{ needs.lint.result }}" != "skipped" ]; then
+            echo "❌ Lint failed"
             failed=true
           fi
 
-          # if [ "${{ needs.functional_tests.result }}" != "success" ]; then
-          #   echo "❌ Training functional tests failed"
-          #   failed=true
-          # fi
+          if [ "${{ needs.unit_tests_complete.result }}" != "success" ] && \
+            [ "${{ needs.unit_tests_complete.result }}" != "skipped" ]; then
+            echo "❌ Unit tests failed or cancelled: ${{ needs.unit_tests_complete.result }}"
+            failed=true
+          fi
+
+          if [ "${{ needs.integration_tests_complete.result }}" != "success" ] && \
+            [ "${{ needs.integration_tests_complete.result }}" != "skipped" ]; then
+            echo "❌ Integration tests failed or cancelled: ${{ needs.integration_tests_complete.result }}"
+            failed=true
+          fi
 
           if [ "$failed" = "true" ]; then
             exit 1
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index 0aa652f64b..cc7ade9f50 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -17,6 +17,8 @@ jobs:
     uses: ./.github/workflows/all_tests_common.yml
     with:
       platform: cuda
+      run_unit_tests: true
+      run_integration_tests: true
 
   all_tests:
     needs: run_tests
diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml
index d3e496c4b2..0af545e291 100644
--- a/.github/workflows/all_tests_metax.yml
+++ b/.github/workflows/all_tests_metax.yml
@@ -13,15 +13,12 @@ concurrency:
 
 jobs:
   run_tests:
+    # Package manager and environment settings are read from .github/configs/metax.yml
     uses: ./.github/workflows/all_tests_common.yml
     with:
       platform: metax
-      # Metax Environment Setup
-      setup_commands: |
-        export PATH=/opt/conda/bin:$PATH        
-        export LD_LIBRARY_PATH=/usr/local/maca/lib:/opt/maca/lib:$LD_LIBRARY_PATH        
-        which python3
-        python3 -m pip --version
+      run_unit_tests: true
+      run_integration_tests: true
 
   all_tests:
     needs: run_tests
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
deleted file mode 100644
index cc2f9eb9a8..0000000000
--- a/.github/workflows/blossom-ci.yml
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
-
-# A workflow to trigger ci on hybrid infra (github + self hosted runner)
-
-# DISABLED in FlagOS
-name: Blossom-CI
-on:
-  issue_comment:
-    types: [__disabled_do_not_remove__]
-  workflow_dispatch:
-      inputs:
-          platform:
-            description: 'runs-on argument'
-            required: false
-          args:
-            description: 'argument'
-            required: false
-jobs:
-  Authorization:
-    name: Authorization
-    runs-on: blossom
-    outputs:
-      args: ${{ env.args }}
-
-    # This job only runs for pull request comments
-    if: >
-         github.event.comment.body == '/blossom-ci'
-         && (
-           github.actor == 'ptrendx'
-           || github.actor == 'ksivaman'
-         )
-    steps:
-      - name: Check if comment is issued by authorized person
-        run: blossom-ci
-        env:
-          OPERATION: 'AUTH'
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
-
-  Vulnerability-scan:
-    name: Vulnerability scan
-    needs: [Authorization]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-        with:
-          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
-          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
-          lfs: 'true'
-
-      - name: Run blossom action
-        uses: NVIDIA/blossom-action@main
-        env:
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
-        with:
-          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
-          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
-          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
-
-  Job-trigger:
-    name: Start ci job
-    needs: [Vulnerability-scan]
-    runs-on: blossom
-    steps:
-      - name: Start ci job
-        run: blossom-ci
-        env:
-          OPERATION: 'START-CI-JOB'
-          CI_SERVER: ${{ secrets.CI_SERVER }}
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  Upload-Log:
-    name: Upload log
-    runs-on: blossom
-    if : github.event_name == 'workflow_dispatch'
-    steps:
-      - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)
-        run: blossom-ci
-        env:
-          OPERATION: 'POST-PROCESSING'
-          CI_SERVER: ${{ secrets.CI_SERVER }}
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/deploy_nightly_docs.yml b/.github/workflows/deploy_nightly_docs.yml
deleted file mode 100644
index 38a3e1dbc2..0000000000
--- a/.github/workflows/deploy_nightly_docs.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
-
-# A workflow to deploy the nightly version of TE documentation to GitHub Pages
-name: Deploy nightly docs
-on:
-  push:
-    branches: [ "__disabled_do_not_remove__" ]
-jobs:
-  build:
-    uses: ./.github/workflows/docs.yml
-
-  prepare:
-    needs: build
-    runs-on: ubuntu-latest
-    steps:
-      - name: Download artifact
-        uses: actions/download-artifact@v4
-        with:
-            name: "te_docs"
-            path: "html"
-      - name: Prepare for pages
-        uses: actions/upload-pages-artifact@v1.0.7
-        with:
-          name: github-pages
-          path: "html"
-  deploy:
-    needs: prepare
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
-    permissions:
-      pages: write
-      id-token: write
-    runs-on: ubuntu-latest
-    steps:
-    - name: Deploy
-      uses: actions/deploy-pages@v2.0.0
diff --git a/.github/workflows/functional_tests_common.yml b/.github/workflows/functional_tests_common.yml
deleted file mode 100644
index aa6b734778..0000000000
--- a/.github/workflows/functional_tests_common.yml
+++ /dev/null
@@ -1,190 +0,0 @@
-# Disabled for compatibility issues
-name: Common Functional Tests - Training
-
-on:
-  workflow_call:
-    inputs:
-      platform:
-        required: true
-        type: string
-        description: Platform name (e.g., cuda, default)
-      test_matrix:
-        required: true
-        type: string
-        description: JSON array of test configurations
-      image:
-        required: true
-        type: string
-      runs_on:
-        required: true
-        type: string
-      container_volumes:
-        required: true
-        type: string
-      container_options:
-        required: true
-        type: string
-
-jobs:
-  functional_test_train:
-    defaults:
-      run:
-        shell: bash
-    env:
-      PROJECT_ROOT: ${{ github.workspace }}
-    runs-on: ${{ fromJson(inputs.runs_on) }}
-    strategy:
-      fail-fast: false
-      matrix:
-        test_config: ${{ fromJson(inputs.test_matrix) }}
-    container:
-      image: ${{ inputs.image }}
-      ports:
-        - 80
-      volumes: ${{ fromJson(inputs.container_volumes) }}
-      options: ${{ inputs.container_options }}
-
-    steps:
-      - name: Checkout source code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-      
-      # - name: Set safe directory
-      #   run: |
-      #     git config --global --add safe.directory $PROJECT_ROOT
-      ## The above step is commented out because there is no git cli in the container, and it causes the step to fail. The safe directory is set in the next step with a conditional check.
-      - name: Set safe directory
-        run: |
-          command -v git && git config --global --add safe.directory $PROJECT_ROOT || true
-
-      - name: Activate Python environment
-        run: |
-          source /opt/conda/etc/profile.d/conda.sh
-          conda activate base   
-          echo "PATH=$PATH" >> $GITHUB_ENV
-    
-      - name: Setup Python environment
-        env:
-          NVTE_WITH_MACA: '1'
-          NVTE_WITH_CUDA: '0'
-          NVCC: /opt/maca/bin/mcc
-          CUDA_HOME: /opt/maca
-          
-          PATH: /opt/maca/bin:${{ env.PATH }}
-          LD_LIBRARY_PATH: /opt/maca/lib:${{ env.LD_LIBRARY_PATH }}
-        run: |
-          set -euo pipefail
-          cd $PROJECT_ROOT
-          pip install -e . --no-deps --no-build-isolation
-        timeout-minutes: 60
-
-      - name: L0 Pytorch Wheel
-        id: L0_pytoech_wheel
-        # timeout-minutes: 50
-        env:
-          TE_PATH: .
-          RUN_LOG: /logs/pytorch/wheel
-        run: |
-          echo "TE_PATH: ${TE_PATH}"
-          sed -i "s/^cd transformer_engine\/pytorch\s*$/pushd transformer_engine\/pytorch/" qa/L0_pytorch_wheel/test.sh
-          sed -i '44 s/^cd \s*\$TE_PATH\s*$/popd/' qa/L0_pytorch_wheel/test.sh
-
-          cat qa/L0_pytorch_wheel/test.sh
-          # source /opt/miniconda3/etc/profile.d/conda.sh
-          # conda activate flagscale-train
-          pip uninstall -y transformer_engine
-
-          set -euo pipefail
-          cd $PROJECT_ROOT
-
-          PLATFORM='${{ inputs.platform }}'
-          DEVICE='${{ matrix.test_config.device }}'
-          TASK='${{ matrix.test_config.task }}'
-          MODEL='${{ matrix.test_config.model }}'
-          CASE='${{ matrix.test_config.case }}'
-
-          echo "Running functional tests for training"
-          echo "Platform: $PLATFORM"
-          echo "Device: $DEVICE"
-          echo "Task: $TASK"
-          echo "Model: $MODEL"
-          echo "Case: ${CASE:-all}"
-
-          # Set environment variables
-          export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-}
-          
-          set +e
-          bash qa/L0_pytorch_wheel/test.sh | tee ${RUN_LOG}/pytorch_wheel-${{ github.run_id }}.log
-          exit_code=$?
-          set -e
-
-          if [ $exit_code -eq 0 ]; then
-            echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
-          else
-            echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
-          fi
-
-          echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
-          exit $exit_code
-
-      - name: Upload Installation Logs
-        if: always() && steps.L0_pytoech_wheel.outcome == 'failure'
-        uses: actions/upload-artifact@v4
-        with:
-          name: L0-pytorch-logs-${{ github.run_id }}
-          path: /logs/pytorch/wheel
-          retention-days: 7
-          if-no-files-found: warn
-
-      # - name: Run functional tests
-      #   id: functional_test
-      #   run: |
-      #     set -euo pipefail
-      #     cd $PROJECT_ROOT
-
-      #     PLATFORM='${{ inputs.platform }}'
-      #     DEVICE='${{ matrix.test_config.device }}'
-      #     TASK='${{ matrix.test_config.task }}'
-      #     MODEL='${{ matrix.test_config.model }}'
-      #     CASE='${{ matrix.test_config.case }}'
-
-      #     echo "Running functional tests for training"
-      #     echo "Platform: $PLATFORM"
-      #     echo "Device: $DEVICE"
-      #     echo "Task: $TASK"
-      #     echo "Model: $MODEL"
-      #     echo "Case: ${CASE:-all}"
-
-      #     # Set environment variables
-      #     export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-}
-
-      #     # Run functional tests via run_tests.sh with explicit platform/device/task/model/case
-      #     set +e
-      #     bash "$PROJECT_ROOT/tests/test_utils/runners/run_tests.sh" \
-      #       --platform "$PLATFORM" \
-      #       --device "$DEVICE" \
-      #       --type functional \
-      #       --task "$TASK" \
-      #       --model "$MODEL" \
-      #       --list "$CASE"
-      #     exit_code=$?
-      #     set -e
-
-      #     if [ $exit_code -eq 0 ]; then
-      #       echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
-      #     else
-      #       echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
-      #     fi
-
-      #     echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
-      #     exit $exit_code
-      #   timeout-minutes: 60
-
-      # - name: Debug - keep container alive on failure
-      #   if: failure()
-      #   run: |
-      #     echo "Container sleeping for 60 minutes for debugging..."
-      #     echo "On host, run: docker ps  then  docker exec -it <container_id> bash"
-      #     sleep 3600
-      #   timeout-minutes: 60
\ No newline at end of file
diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
new file mode 100644
index 0000000000..70d0ca829d
--- /dev/null
+++ b/.github/workflows/integration_tests_common.yml
@@ -0,0 +1,179 @@
+name: Common Integration Tests
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+      device:
+        required: true
+        type: string
+      image:
+        required: true
+        type: string
+      runs_on:
+        required: true
+        type: string
+      container_volumes:
+        required: true
+        type: string
+      container_options:
+        required: true
+        type: string
+      # Platform-specific environment setup script path (from platform config)
+      setup_script:
+        required: false
+        type: string
+        default: ''
+      # Platform-specific build environment variables (JSON object from config)
+      build_env:
+        required: false
+        type: string
+        default: '{}'
+
+jobs:
+  integration_test:
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ fromJson(inputs.runs_on) }}
+    strategy:
+      fail-fast: false
+      matrix:
+        test_group:
+          - name: pytorch_mcore_integration
+            path: "qa/L1_pytorch_mcore_integration/test.sh"
+            test_type: "integration"
+    name: integration-${{ inputs.device }}-${{ matrix.test_group.name }}
+    container:
+      image: ${{ inputs.image }}
+      volumes: ${{ fromJson(inputs.container_volumes) }}
+      options: --pull never ${{ inputs.container_options }}
+
+    steps:
+      # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos
+      - name: Configure Git Safe Directory on Cuda
+        if: inputs.platform == 'cuda'
+        run: /usr/bin/git config --global safe.directory '*'
+
+      - name: Checkout Source Code on Cuda (attempt 1)
+        id: checkout1
+        if: inputs.platform == 'cuda'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 2)
+        id: checkout2
+        if: inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 3)
+        id: checkout3
+        if: inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      # Metax no need submodules
+      - name: Checkout Source Code on Metax
+        if: inputs.platform == 'metax'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      
+      - name: Environment Setup
+        if: inputs.setup_script != ''
+        run: |
+          bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
+
+      - name: Execute Tests
+        if: inputs.setup_script != ''
+        env:
+          TE_PATH: ${{ github.workspace }}
+          TE_FL_PREFER: vendor
+          MCORE_REPO_URL: https://github.com/BrianPei/Megatron-LM-FL.git
+          MCORE_REF: dev
+        run: |
+          set -euo pipefail
+
+          # # Activate conda environment
+          # source /opt/miniconda3/etc/profile.d/conda.sh
+          # conda activate flagscale-train
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
+          bash ${{ matrix.test_group.path }}
+        timeout-minutes: 30
+      
+      # - name: Execute Tests
+      #   if: inputs.setup_script != ''
+      #   working-directory: ${{ github.workspace }}
+      #   run: |
+      #     set -euo pipefail
+
+      #     # Load platform-specific environment variables
+      #     while IFS='=' read -r key value; do
+      #       [ -n "$key" ] && export "$key=$value"
+      #     done < <(echo '${{ inputs.build_env }}' | python3 -c "
+      #     import json, sys
+      #     env = json.load(sys.stdin)
+      #     for k, v in env.items():
+      #         print(f'{k}={v}')
+      #     ")
+
+      #     export TE_PATH=$GITHUB_WORKSPACE
+      #     export TE_LIB_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])")
+      #     export PYTHONPATH=$GITHUB_WORKSPACE:${PYTHONPATH:-}
+      #     export PATH=${CUDA_HOME:-/usr/local/cuda}/bin:$PATH
+      #     export LD_LIBRARY_PATH=${CUDA_HOME:-/usr/local/cuda}/lib:${LD_LIBRARY_PATH:-}
+          
+      #     # check envs before running tests
+      #     echo "TE_PATH=$TE_PATH"
+      #     echo "TE_LIB_PATH=$TE_LIB_PATH"
+      #     echo "PYTHONPATH=$PYTHONPATH"
+      #     echo "PATH=$PATH"
+      #     echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+          
+      #     # Ensure log directory exists regardless of volume mount state
+      #     mkdir -p /logs
+
+      #     # Coverage setup: install once + configure collection via PYTEST_ADDOPTS
+      #     COVERAGE_ENABLED=false
+      #     if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
+      #       export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
+      #       COVERAGE_ENABLED=true
+      #     else
+      #       echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
+      #     fi
+
+      #     if [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
+      #       # Fail fast on backend/API mismatch before running the full test group.
+      #       # Skip for debug group (does not use FP8/optimizer symbols).
+      #       python3 -c "import sys, importlib; import transformer_engine.common as _te_common; tex = importlib.import_module('transformer_engine_torch'); required=['multi_tensor_scale','multi_tensor_compute_scale_and_scale_inv']; missing=[n for n in required if not hasattr(tex, n)]; print('[TE check] module:', tex); print('[TE check] file:', getattr(tex, '__file__', 'N/A')); print('[TE check] missing:', ', '.join(missing) if missing else 'none'); sys.exit(1 if missing else 0)"
+      #     fi
+
+      #     bash ${{ matrix.test_group.path }}
+      #     exit_code=$?
+
+      #     # Combine coverage fragments and generate JSON report
+      #     if [ "$COVERAGE_ENABLED" = "true" ]; then
+      #       python3 -m coverage combine --keep 2>/dev/null || true
+      #       python3 -m coverage json \
+      #         -o "coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json" \
+      #         --include="transformer_engine/*" 2>/dev/null \
+      #         || echo "WARNING: No coverage data found"
+      #     fi
+      #     exit $exit_code
+      #   timeout-minutes: 60
\ No newline at end of file
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
deleted file mode 100644
index ee6433d484..0000000000
--- a/.github/workflows/lint.yml
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
-
-# A workflow to trigger lint tests on GitHub
-name: 'Lint'
-on:
-  pull_request:
-  workflow_dispatch:
-jobs:
-  pytorch_cpplint:
-    name: 'PyTorch C++'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-      - name: 'Lint'
-        run: |
-          sudo apt-get update
-          sudo apt-get install pip -y
-          export CPP_ONLY=1
-          export TE_PATH=.
-          bash ./qa/L0_pytorch_lint/test.sh
-  pytorch_pylint:
-    name: 'PyTorch Python'
-    runs-on: ubuntu-latest
-    steps:
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-      - name: 'Lint'
-        run: |
-          sudo apt-get update
-          sudo apt-get install pip -y
-          pip install torch numpy
-          export PYTHON_ONLY=1
-          export TE_PATH=.
-          bash ./qa/L0_pytorch_lint/test.sh
-  jax_cpplint:
-    name: 'JAX C++'
-    runs-on: ubuntu-latest
-    steps:
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-      - name: 'Lint'
-        run: |
-          sudo apt-get update
-          sudo apt-get install pip -y
-          export CPP_ONLY=1
-          export TE_PATH=.
-          bash ./qa/L0_jax_lint/test.sh
-  jax_pylint:
-    name: 'JAX Python'
-    runs-on: ubuntu-latest
-    steps:
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-      - name: 'Lint'
-        run: |
-          sudo apt-get update
-          sudo apt-get install pip -y
-          export PYTHON_ONLY=1
-          export TE_PATH=.
-          bash ./qa/L0_jax_lint/test.sh
diff --git a/.github/workflows/lint_common.yml b/.github/workflows/lint_common.yml
new file mode 100644
index 0000000000..850b93640b
--- /dev/null
+++ b/.github/workflows/lint_common.yml
@@ -0,0 +1,47 @@
+name: 'Lint'
+
+on:
+  workflow_call: 
+  workflow_dispatch:
+
+jobs:
+  pytorch_pylint:
+    name: 'pytorch_lint'
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Checkout (attempt 1)
+        id: checkout1
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+
+      - name: Checkout (attempt 2)
+        id: checkout2
+        if: steps.checkout1.outcome == 'failure'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+
+      - name: Checkout (attempt 3)
+        if: steps.checkout2.outcome == 'failure'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      
+      - name: Setup Python
+        run: |
+          sudo apt-get update -q
+          sudo apt-get install -y -q python3-pip
+          pip3 install torch numpy --quiet
+
+      - name: Run Lint
+        env:
+          TE_PATH: ${{ github.workspace }}
+          PYTHON_ONLY: '1'
+        run: bash ./qa/L0_pytorch_lint/test.sh
+        timeout-minutes: 15
diff --git a/.github/workflows/qa-format.yml b/.github/workflows/qa-format.yml
deleted file mode 100644
index ff1cddf312..0000000000
--- a/.github/workflows/qa-format.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: format_check
-
-on:
-  pull_request:
-    branches: [ "main" ]
-    types: [opened, synchronize, reopened]
-
-jobs:
-  format:
-    runs-on: ubuntu-22.04
-    env:
-      PRID: ${{ github.event.pull_request.number }}
-      BRANCH: main
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.base.ref }}
-
-      - name: Merge PR to sub-branch
-        run: |
-          git fetch origin pull/${PRID}/merge
-          git checkout -b test FETCH_HEAD
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.10"
-
-      - name: Run pre-commit
-        run: bash ./qa/format.sh
\ No newline at end of file
diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
index b026f9aa10..7b072d1d5e 100644
--- a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
+++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
@@ -2,20 +2,11 @@ name: QA L0 - Core Unit & Lint Tests
 
 on:
   push:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml'
-      - 'qa/L0_pytorch_lint/**'
-      - 'transformer_engine/**'
-      - 'tests/pytorch/**'
+    branches:
+      - __disabled_do_not_remove__
   pull_request:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml'
-      - 'qa/L0_pytorch_lint/**'
-      - 'transformer_engine/**'
-      - 'tests/pytorch/**'
-
+    branches:
+      - __disabled_do_not_remove__
   workflow_dispatch:
 
 concurrency:
diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
index 51f071aa3b..e7b2cf97c0 100644
--- a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
+++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
@@ -57,8 +57,8 @@ jobs:
       - name: Checkout Code
         uses: actions/checkout@v6.0.1
         with:
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
-          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name || github.repository }}
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.ref || github.ref_name }}
           ssh-strict: true
           ssh-user: git
           persist-credentials: true
@@ -166,3 +166,21 @@ jobs:
           echo "=== Running L1 PyTorch ONNX Unit Tests ==="
           bash ./qa/L1_pytorch_onnx_unittest/test.sh
         # timeout-minutes: 30
+
+            
+      - name: Run L1 PyTorch Megatron-FL MCore Integration Test
+        env:
+          TE_PATH: .
+          TE_FL_PREFER: vendor
+          MCORE_REPO_URL: https://github.com/BrianPei/Megatron-LM-FL.git
+          MCORE_REF: dev
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
+          bash ./qa/L1_pytorch_mcore_integration/test.sh
+        timeout-minutes: 30
diff --git a/.github/workflows/trigger-ci.yml b/.github/workflows/trigger-ci.yml
deleted file mode 100644
index 37754fbfb7..0000000000
--- a/.github/workflows/trigger-ci.yml
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
-
-# A workflow to trigger ci on hybrid infra (github + self hosted runner)
-name: TE-CI Trigger
-on:
-  issue_comment:
-    types: [__disabled_do_not_remove__]
-jobs:
-  Authorization:
-    name: Authorization
-    runs-on: blossom
-    outputs:
-      args: ${{ env.args }}
-
-    # This job only runs for pull request comments
-    if: >
-         startsWith(github.event.comment.body, '/te-ci')
-         && (
-           github.actor == 'ptrendx'
-           || github.actor == 'ksivaman'
-           || github.actor == 'schetlur-nv'
-           || github.actor == 'timmoon10'
-           || github.actor == 'zlsh80826'
-           || github.actor == 'mingxu1067'
-           || github.actor == 'cyanguwa'
-           || github.actor == 'nzmora-nvidia'
-           || github.actor == 'galagam'
-           || github.actor == 'nouiz'
-           || github.actor == 'denera'
-           || github.actor == 'sudhakarsingh27'
-           || github.actor == 'Oleg-Goncharov'
-           || github.actor == 'phu0ngng'
-           || github.actor == 'xrennvidia'
-           || github.actor == 'yaox12'
-           || github.actor == 'huanghua1994'
-           || github.actor == 'mgoldfarb-nvidia'
-           || github.actor == 'pggPL'
-           || github.actor == 'vasunvidia'
-           || github.actor == 'erhoo82'
-           || github.actor == 'kocchop'
-           || github.actor == 'youngeunkwon0405'
-           || github.actor == 'KshitijLakhani'
-           || github.actor == 'jberchtold-nvidia'
-           || github.actor == 'sanandaraj5597'
-           || github.actor == 'negvet'
-           || github.actor == 'zhongbozhu'
-           || github.actor == 'kwyss-nvidia'
-           || github.actor == 'BestJuly'
-           || github.actor == 'xiaopoc'
-           || github.actor == 'jreiffers'
-           || github.actor == 'lhb8125'
-           || github.actor == 'kunlunl'
-           || github.actor == 'pstjohn'
-           || github.actor == 'vcherepanov-nv'
-           || github.actor == 'tdophung'
-           || github.actor == 'vthumbe1503'
-           || github.actor == 'janekb04'
-           || github.actor == 'shengfangd'
-         )
-    steps:
-      - name: Check if comment is issued by authorized person
-        run: blossom-ci
-        env:
-          OPERATION: 'AUTH'
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
-
-  Vulnerability-scan:
-    name: Vulnerability scan
-    needs: [Authorization]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-        with:
-          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
-          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
-          lfs: 'true'
-
-      - name: Run blossom action
-        uses: NVIDIA/blossom-action@main
-        env:
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
-        with:
-          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
-          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
-          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
-
-  Job-trigger:
-    name: Start ci job
-    needs: [Vulnerability-scan]
-    runs-on: blossom
-    steps:
-      - name: Start ci job
-        run: blossom-ci
-        env:
-          OPERATION: 'START-CI-JOB'
-          CI_SERVER: ${{ secrets.CI_SERVER }}
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml
index 615f7c9001..69b45aa5a5 100644
--- a/.github/workflows/unit_tests_common.yml
+++ b/.github/workflows/unit_tests_common.yml
@@ -1,6 +1,5 @@
 name: Common Unit Tests
 
-
 on:
   workflow_call:
     inputs:
@@ -22,12 +21,8 @@ on:
       container_options:
         required: true
         type: string
-      ignored_tests:
-        required: false
-        type: string
-        default: ''
-      # New input for hardware-specific initialization (e.g., conda activate)
-      setup_commands:
+      # Platform-specific environment setup script path (from platform config)
+      setup_script:
         required: false
         type: string
         default: ''
@@ -36,41 +31,9 @@ on:
         required: false
         type: string
         default: '{}'
-      # Whether to upload coverage report
-      upload_coverage:
-        description: "Whether to upload coverage report"
-        required: false
-        type: boolean
-        default: true
 
 jobs:
-  # 1. Change Detection
-  detect_changes:
-    runs-on: ubuntu-latest
-    outputs:
-      core: ${{ steps.filter.outputs.core }}
-      qa_l0: ${{ steps.filter.outputs.qa_l0 }}
-    steps:
-      - name: Checkout source code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Detect changed paths
-        id: filter
-        run: |
-          set -euo pipefail
-          BASE_REF="${{ github.event_name == 'pull_request' && format('origin/{0}', github.base_ref) || 'HEAD~1' }}"
-          [ "${{ github.event_name }}" == "pull_request" ] && git fetch origin ${{ github.base_ref }} --depth=1
-          
-          CHANGED_FILES=$(git diff --name-only $BASE_REF...HEAD 2>/dev/null || git diff --name-only $BASE_REF HEAD)
-
-          echo "core=$(echo "$CHANGED_FILES" | grep -qE "^tests/unit_tests/|^megatron/core/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT
-          echo "qa_l0=$(echo "$CHANGED_FILES" | grep -qE "^qa/L0_|^transformer_engine/|^tests/pytorch/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT
-  
-  # 2. Unified Test Execution
   unit_test:
-    needs: detect_changes
     defaults:
       run:
         shell: bash
@@ -79,16 +42,15 @@ jobs:
       fail-fast: false
       matrix:
         test_group:
-          - name: pytorch_lint
-            path: "qa/L0_pytorch_lint/test.sh"
-            test_type: "lint"
           - name: pytorch_debug
             path: "qa/L0_pytorch_debug_unittest/test.sh"
             test_type: "debug"
           - name: pytorch_unittest
             path: "qa/L0_pytorch_unittest/test.sh"
             test_type: "unittest"
-
+          - name: pytorch_distributed_unittest
+            path: "qa/L1_pytorch_distributed_unittest/test.sh"
+            test_type: "unittest"
     name: unit-${{ inputs.device }}-${{ matrix.test_group.name }}
     container:
       image: ${{ inputs.image }}
@@ -96,33 +58,14 @@ jobs:
       options: --pull never ${{ inputs.container_options }}
 
     steps:
-      - name: Check if tests should run
-        id: should_run
-        run: |
-          echo "should_run=true" >> $GITHUB_OUTPUT
-          GROUP='${{ matrix.test_group.name }}'
-          # Force run if 'full ci' label exists
-          if [ "${{ contains(github.event.pull_request.labels.*.name, 'full ci') }}" == "true" ]; then 
-            echo "should_run=true" >> $GITHUB_OUTPUT; exit 0
-          fi
-
-          if [[ "$GROUP" == "pytorch_"* ]]; then
-            CHANGED='${{ needs.detect_changes.outputs.qa_l0 }}'
-          else
-            CHANGED='${{ needs.detect_changes.outputs.core }}'
-          fi
-          
-          # For debugging, you can force this to true
-          echo "should_run=true" >> $GITHUB_OUTPUT
-
       # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos
       - name: Configure Git Safe Directory on Cuda
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+        if: inputs.platform == 'cuda'
         run: /usr/bin/git config --global safe.directory '*'
 
       - name: Checkout Source Code on Cuda (attempt 1)
         id: checkout1
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+        if: inputs.platform == 'cuda'
         uses: actions/checkout@v4
         continue-on-error: true
         with:
@@ -132,7 +75,7 @@ jobs:
 
       - name: Checkout Source Code on Cuda (attempt 2)
         id: checkout2
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
+        if: inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
         uses: actions/checkout@v4
         continue-on-error: true
         with:
@@ -142,7 +85,7 @@ jobs:
 
       - name: Checkout Source Code on Cuda (attempt 3)
         id: checkout3
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
+        if: inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
@@ -151,107 +94,18 @@ jobs:
 
       # Metax no need submodules
       - name: Checkout Source Code on Metax
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax'
+        if: inputs.platform == 'metax'
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
       
-      - name: Environment Setup on Cuda
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+      - name: Environment Setup
+        if: inputs.setup_script != ''
         run: |
-          set -euo pipefail
-
-          echo "===== Step 0: Activate Python environment ====="
-          source /opt/miniconda3/etc/profile.d/conda.sh
-          conda activate flagscale-train
-          echo "PATH=$PATH" >> $GITHUB_ENV
-          echo "Python: $(which python3) ($(python3 --version 2>&1))"
-
-          echo "===== Step 1: Remove Existing TransformerEngine ====="
-          pip uninstall transformer_engine transformer_engine_torch -y || true
-
-          echo "===== Step 2: Build & Install TransformerEngine ====="
-          cd $GITHUB_WORKSPACE
-
-          pip install nvdlfw-inspect --quiet
-          pip install expecttest --quiet
-          pip install . -v --no-deps --no-build-isolation
-
-          echo "===== Step 3: Verify Installation ====="
-          python3 tests/pytorch/test_sanity_import.py
-
-          echo "===== Environment Setup Complete ===== "
-
-      - name: Environment Setup on Metax
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax'
-        run: |
-          set -euo pipefail
-
-          echo "===== Step 0: Activate Python environment ====="
-          source /opt/conda/etc/profile.d/conda.sh
-          conda activate base
-          echo "PATH=$PATH" >> $GITHUB_ENV
-          echo "Python: $(which python3) ($(python3 --version 2>&1))"
-
-          echo "===== Step 1: Base Environment Setup ====="
-          # Configure MACA toolchain paths
-          export PATH=/opt/maca/bin:$PATH
-          export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH
-          service ssh restart
-
-          echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
-          # TransformerEngine expects nvcc, but MACA provides cucc
-          ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
-          which nvcc || true
-
-          echo "===== Step 3: Install Required System Tools ====="
-          # Install essential build tools (avoid modifying Python dependencies)
-          apt-get update -qq && apt-get install -y -qq git cmake ninja-build curl
-
-          echo "===== Step 4: Remove Existing TransformerEngine ====="
-          # Prevent conflicts with preinstalled or incompatible versions
-          python3 -m pip uninstall transformer_engine -y || true
-          python3 -m pip install nvdlfw-inspect --quiet
-          python3 -m pip install expecttest --quiet
-
-          # echo "===== Step 5: Install Metax Binary Backend ====="
-          # # Install prebuilt Metax backend (required for MACA operators)
-          # WHL_PATH="/home/muxiuser/transformer_engine_metax-2.9.0-cp312-cp312-linux_x86_64.whl"
-          # if [ ! -f "$WHL_PATH" ]; then
-          #   echo "ERROR: Wheel file not found at $WHL_PATH"
-          #   echo "Please verify volume mount: -v /home/muxiuser:/home/muxiuser"
-          #   exit 1
-          # fi
-
-          # # Use --no-deps to avoid overwriting Metax-optimized PyTorch
-          # python3 -m pip install "$WHL_PATH" --no-deps --force-reinstall
-
-          # echo "===== Step 6: Verify Metax Backend ====="
-          # # Ensure transformer_engine_torch is correctly loaded
-          # python3 - <<'EOF'
-          # import transformer_engine_torch as te
-          # print("Backend loaded successfully:", te)
-          # EOF
-
-          echo "===== Step 7: Install TE-FL Plugin Layer ====="
-          # Install TransformerEngine-FL Python layer (plugin logic)
-          # cd /workspace/TransformerEngine-FL
-          cd $GITHUB_WORKSPACE
-          TE_FL_SKIP_CUDA=1 python3 setup.py install
-
-          echo "===== Step 8: Final Verification ====="
-          # Verify both TE Python API and backend are functional
-          python3 - <<'EOF'
-          import transformer_engine
-          import transformer_engine_torch as te
-          print("transformer_engine:", transformer_engine)
-          print("transformer_engine_torch:", te)
-          EOF
-
-          echo "===== Environment Setup Complete ===== "
+          bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
       
       - name: Execute Tests
-        if: steps.should_run.outputs.should_run == 'true'
+        if: inputs.setup_script != ''
         working-directory: ${{ github.workspace }}
         run: |
           set -euo pipefail
@@ -284,19 +138,14 @@ jobs:
 
           # Coverage setup: install once + configure collection via PYTEST_ADDOPTS
           COVERAGE_ENABLED=false
-          if [ "${{ inputs.upload_coverage }}" = "true" ] && [ "${{ matrix.test_group.test_type }}" = "unittest" ]; then
-            if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
-              export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
-              COVERAGE_ENABLED=true
-            else
-              echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
-            fi
+          if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
+            export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
+            COVERAGE_ENABLED=true
+          else
+            echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
           fi
 
-          if [[ "${{ matrix.test_group.name }}" == *"lint"* ]]; then
-            export CPP_ONLY=0
-            export PYTHON_ONLY=0
-          elif [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
+          if [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
             # Fail fast on backend/API mismatch before running the full test group.
             # Skip for debug group (does not use FP8/optimizer symbols).
             python3 -c "import sys, importlib; import transformer_engine.common as _te_common; tex = importlib.import_module('transformer_engine_torch'); required=['multi_tensor_scale','multi_tensor_compute_scale_and_scale_inv']; missing=[n for n in required if not hasattr(tex, n)]; print('[TE check] module:', tex); print('[TE check] file:', getattr(tex, '__file__', 'N/A')); print('[TE check] missing:', ', '.join(missing) if missing else 'none'); sys.exit(1 if missing else 0)"
@@ -313,12 +162,10 @@ jobs:
               --include="transformer_engine/*" 2>/dev/null \
               || echo "WARNING: No coverage data found"
           fi
-
           exit $exit_code
         timeout-minutes: 60
 
       - name: Upload Coverage Report
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
         uses: actions/upload-artifact@v4
         continue-on-error: true
         with:
@@ -327,7 +174,6 @@ jobs:
             coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json
 
       - name: Upload Coverage Report to FlagCICD
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
         uses: flagos-ai/FlagOps/actions/post-pytest-report@v2
         continue-on-error: true
         env:
@@ -336,12 +182,4 @@ jobs:
           backend_url: 'http://flagcicd-inner.flagos.net:8000/metrics/'
           user_id: '000000000000000000'
           report_path: 'coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json'
-          fail_on_error: 'false'
-
-      # - name: Debug - keep container alive on failure
-      #   if: failure()
-      #   run: |
-      #     echo "Container sleeping for 200 minutes for debugging..."
-      #     echo "On host, run: docker ps  then  docker exec -it <container_id> bash"
-      #     sleep 60000
-      #   timeout-minutes: 200
\ No newline at end of file
+          fail_on_error: 'false'
\ No newline at end of file
diff --git a/.github/workflows/upload-ci-logs.yml b/.github/workflows/upload-ci-logs.yml
deleted file mode 100644
index c9c7e4ef4d..0000000000
--- a/.github/workflows/upload-ci-logs.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
-
-# A workflow to trigger ci on hybrid infra (github + self hosted runner)
-name: TE-CI Logs
-on:
-  workflow_dispatch:
-      inputs:
-          platform:
-            description: 'runs-on argument'
-            required: false
-          args:
-            description: 'argument'
-            required: false
-          job_name:
-            description: 'name of the job'
-            required: true
-          commit_sha:
-            description: 'SHA of the commit that was tested.'
-            required: true
-          result:
-            description: 'Job result'
-            required: true
-run-name: PR ${{ fromJson(github.event.inputs.args).pr }} - ${{ inputs.job_name }}
-jobs:
-  Upload-Log:
-    name: Upload log
-    runs-on: blossom
-    steps:
-      - name: Log
-        run: blossom-ci
-        env:
-          OPERATION: 'POST-PROCESSING'
-          CI_SERVER: ${{ secrets.CI_SERVER }}
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  status_update:
-    name: Update commit status
-    runs-on: ubuntu-latest
-    permissions:
-      statuses: write
-    needs: [Upload-Log]
-    if: ${{ always() }}
-    steps:
-      - name: Set status
-        run: |
-          curl \
-          -X POST \
-          -H "Accept: application/vnd.github+json" \
-          -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-          https://api.github.com/repos/${{ github.repository }}/statuses/${{ inputs.commit_sha }} \
-          -d "{\"state\":\"${{ inputs.result }}\",\"target_url\":\"${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\",\"description\":\"\",\"context\":\"te-ci/${{ inputs.job_name }}\"}"
diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend
index f0c638223e..7500fd8427 160000
--- a/3rdparty/cudnn-frontend
+++ b/3rdparty/cudnn-frontend
@@ -1 +1 @@
-Subproject commit f0c638223eac20a9676941a110c9ad9e9842941d
+Subproject commit 7500fd8427a24a76fadac9f2108106fd22c62737
diff --git a/3rdparty/googletest b/3rdparty/googletest
index a35bc7693c..94be250af7 160000
--- a/3rdparty/googletest
+++ b/3rdparty/googletest
@@ -1 +1 @@
-Subproject commit a35bc7693c117a048152beeb34f6aac354b9423f
+Subproject commit 94be250af7e14c58dcbf476972d2d7141551ff67
diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
index 5be88dfe4a..916eecdcca 100644
--- a/qa/L0_pytorch_debug_unittest/test.sh
+++ b/qa/L0_pytorch_debug_unittest/test.sh
@@ -7,6 +7,11 @@
 : ${TE_PATH:=/opt/transformerengine}
 : ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
 : ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/}
+# export PLATFORM="metax"
+# export TE_PATH=/workspace/TransformerEngine-FL
+# export NVTE_TEST_NVINSPECT_FEATURE_DIRS=$TE_PATH/transformer_engine/debug/features
+# export NVTE_TEST_NVINSPECT_CONFIGS_DIR=$TE_PATH/tests/pytorch/debug/test_configs/
+# export PYTHONPATH=$TE_PATH/tests/pytorch:$TE_PATH:$PYTHONPATH
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
@@ -28,7 +33,7 @@ run_test_step() {
 
     if [ "$PLATFORM" = "metax" ]; then
         case "$test_path" in
-            *"test_numerics.py" | *"test_api_features.py" | *"test_sanity.py")
+            *tests/pytorch/test_numerics.py | *tests/pytorch/test_sanity.py)
                 echo "-------------------------------------------------------"
                 echo "[SKIP] Platform MetaX: Ignoring $test_path"
                 echo "-------------------------------------------------------"
@@ -70,8 +75,6 @@ run_test_step "test_perf.xml" "$TE_PATH/tests/pytorch/debug/test_perf.py" \
 "pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR"
 
 
-
-
 # Step 7: Sanity 2
 run_test_step "test_sanity_2.xml" "$TE_PATH/tests/pytorch/test_sanity.py" \
 "NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 \
diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index 99a1370ac4..bc4362e23d 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -22,13 +22,11 @@ run_test_step() {
     local cmd=$3
     local label=$4
 
-
     if [ "$PLATFORM" = "metax" ]; then
         case "$test_path" in
             *"test_numerics.py" | \
             *"test_sanity.py" | \
             *"test_parallel_cross_entropy.py" | \
-            *"test_cuda_graphs.py" | \
             *"test_fused_rope.py" | \
             *"test_gqa.py" | \
             *"test_fused_optimizer.py" | \
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
index 04860a9729..46b54ed30d 100644
--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -15,29 +15,134 @@ function test_fail() {
 
 RET=0
 FAILED_CASES=""
+DEBUG_TESTS_READY=0
 
 : ${TE_PATH:=/opt/transformerengine}
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
+# The current CUDA 12.8 test container hits a fused-attention runtime loader
+# issue, so keep the distributed numerics suite on the unfused attention path.
+export NVTE_FLASH_ATTN="${NVTE_FLASH_ATTN:-0}"
+export NVTE_FUSED_ATTN="${NVTE_FUSED_ATTN:-0}"
+export NVTE_UNFUSED_ATTN="${NVTE_UNFUSED_ATTN:-1}"
+
+# Make CUDA runtime libraries discoverable for fused attention kernels.
+if [ -z "${CUDA_HOME:-}" ]; then
+    if [ -d /usr/local/cuda ]; then
+        export CUDA_HOME=/usr/local/cuda
+    elif [ -d /usr/local/cuda-12.8 ]; then
+        export CUDA_HOME=/usr/local/cuda-12.8
+    fi
+fi
+export CUDA_PATH="${CUDA_PATH:-${CUDA_HOME:-}}"
+
+CUDA_LIB_DIRS=()
+for path in \
+    "${CUDA_HOME:-}/lib64" \
+    "${CUDA_HOME:-}/targets/x86_64-linux/lib" \
+    "$(python3 - <<'PY'
+import site
+from pathlib import Path
+
+for root in site.getsitepackages():
+    candidate = Path(root) / "torch" / "lib"
+    if candidate.exists():
+        print(candidate)
+        break
+PY
+)" \
+    "$(python3 - <<'PY'
+import site
+from pathlib import Path
+
+for root in site.getsitepackages():
+    candidate = Path(root) / "nvidia" / "cuda_runtime" / "lib"
+    if candidate.exists():
+        print(candidate)
+        break
+PY
+)"; do
+    if [ -n "$path" ] && [ -d "$path" ]; then
+        CUDA_LIB_DIRS+=("$path")
+    fi
+done
+
+if [ "${#CUDA_LIB_DIRS[@]}" -gt 0 ]; then
+    CUDA_LIB_PATH="$(IFS=:; echo "${CUDA_LIB_DIRS[*]}")"
+    export LD_LIBRARY_PATH="${CUDA_LIB_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+fi
+
+python3 - <<'PY'
+import ctypes
+
+for name in ("libcudart.so", "libcudart.so.12"):
+    try:
+        ctypes.CDLL(name, mode=ctypes.RTLD_GLOBAL)
+        print(f"[CUDA] Preloaded {name}")
+        break
+    except OSError as exc:
+        print(f"[CUDA] Failed to preload {name}: {exc}")
+PY
+
 
 # It is not installed as a requirement,
 # because it is not available on PyPI.
 pip uninstall -y nvdlfw-inspect
-pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
+if pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git && \
+   python3 -c "import nvdlfw_inspect.api" >/dev/null 2>&1; then
+    DEBUG_TESTS_READY=1
+else
+    echo "Warning: nvdlfw_inspect is unavailable; debug numerics test will be skipped"
+fi
 
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
+run_test_step() {
+    local xml_file=$1
+    local test_path=$2
+    local cmd=$3
+    local label=$4
+
+    if [ "$PLATFORM" = "metax" ]; then
+        case "$test_path" in
+            *"test_numerics.py" | \
+            *"test_numerics_exact.py" | \
+            *"test_torch_fsdp2.py" | \
+            *"test_cast_master_weights_to_fp8.py")
+                echo "-------------------------------------------------------"
+                echo "[SKIP] Platform MetaX: Ignoring $label"
+                echo "-------------------------------------------------------"
+                return 0
+                ;;
+        esac
+    fi
+
+    echo "-------------------------------------------------------"
+    echo "[RUN] Executing: $label"
+    eval "$cmd" || test_fail "$label"
+}
+
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py || test_fail "test_numerics_exact.py"
+run_test_step "pytest_test_numerics.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+"test_numerics.py"
+run_test_step "pytest_test_numerics_exact.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics_exact.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py" \
+"test_numerics_exact.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || test_fail "test_fusible_ops.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k "not (test_distributed)" || test_fail "test_torch_fsdp2.py"
+run_test_step "pytest_test_torch_fsdp2.xml" "$TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k 'not (test_distributed)'" \
+"test_torch_fsdp2.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py || test_fail "test_cp_utils.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
+run_test_step "pytest_test_cp_utils.xml" "$TE_PATH/tests/pytorch/attention/test_cp_utils.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py" \
+"test_cp_utils.py"
+run_test_step "pytest_test_cast_master_weights_to_fp8.xml" "$TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py" \
+"test_cast_master_weights_to_fp8.py"
 
 
 # debug tests
@@ -50,7 +155,13 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_
 
 # pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
 # standard numerics tests with initialized debug
-NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
+if [ "$DEBUG_TESTS_READY" -eq 1 ]; then
+    run_test_step "pytest_test_numerics_2.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+    "NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+    "test_numerics.py (debug)"
+else
+    echo "Skipping debug test_numerics.py because nvdlfw_inspect is unavailable"
+fi
 
 if [ "$RET" -ne 0 ]; then
     echo "Error in the following test cases:$FAILED_CASES"
diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh
index a5130a52d3..913e7e6790 100644
--- a/qa/L1_pytorch_mcore_integration/test.sh
+++ b/qa/L1_pytorch_mcore_integration/test.sh
@@ -4,69 +4,148 @@
 
 set -e
 
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+
+retry_command() {
+    local attempts=$1
+    local delay_seconds=$2
+    shift 2
+
+    local attempt
+    for attempt in $(seq 1 "${attempts}"); do
+        if "$@"; then
+            return 0
+        fi
+        if [ "${attempt}" -lt "${attempts}" ]; then
+            echo "Command failed (attempt ${attempt}/${attempts}): $*"
+            echo "Retrying in ${delay_seconds}s..."
+            sleep "${delay_seconds}"
+        fi
+    done
+
+    echo "Command failed after ${attempts} attempts: $*"
+    return 1
+}
+
 # Paths
-: ${TE_PATH:=/opt/transformerengine}
-: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
+: "${TE_PATH:=$(cd -- "${SCRIPT_DIR}/../.." && pwd)}"
+: "${MCORE_PATH:=/workspace/Megatron-LM-FL}"
+: "${MCORE_REPO_URL:=https://github.com/BrianPei/Megatron-LM-FL.git}"
+: "${MCORE_REF:=dev}"
+: "${OUTPUT_DIR:=${TE_PATH}/qa/L1_pytorch_mcore_integration/output}"
+: "${DATA_CACHE_PATH:=/tmp/data_cache}"
 
 # Check whether FP8 is supported
-DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
-if [[ ${DEVICE_ARCH} -ge 89 ]]; then
-    WITH_FP8=1
+WITH_FP8=
+if command -v nvidia-smi &>/dev/null; then
+    DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+    if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+        WITH_FP8=1
+    fi
+elif command -v mx-smi &>/dev/null; then
+    # Metax hardware does not support FP8; leave WITH_FP8 unset
+    :
 fi
 
-# Download Megatron-LM if needed
+# Download or sync Megatron-LM-FL to the requested repo/ref.
 if [ ! -d "${MCORE_PATH}" ]; then
     pushd $(dirname ${MCORE_PATH})
-    git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    retry_command 3 5 git clone --depth 1 -b "${MCORE_REF}" "${MCORE_REPO_URL}" $(basename ${MCORE_PATH})
     popd
 fi
 
-# Create mock vocab
-VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
-printf "" > ${VOCAB_FILE}
-printf "{" >> ${VOCAB_FILE}
-printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
-seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
-printf "}" >> ${VOCAB_FILE}
+if [ -d "${MCORE_PATH}/.git" ]; then
+    git -C "${MCORE_PATH}" remote set-url origin "${MCORE_REPO_URL}"
+    retry_command 3 5 git -C "${MCORE_PATH}" fetch --depth 1 origin "${MCORE_REF}"
+    git -C "${MCORE_PATH}" checkout -B "${MCORE_REF}" "FETCH_HEAD"
+fi
+
+# Megatron-LM-FL tokenizer imports happen at module import time, so direct
+# source execution needs these Python deps available before pretrain_gpt.py
+# starts.
+python3 - <<'PY' || python3 -m pip install --disable-pip-version-check six regex
+import regex
+import six
+print(f"six available: {six.__version__}")
+print(f"regex available: {regex.__version__}")
+PY
+
+CHECKPOINT_DIR=${OUTPUT_DIR}/checkpoints
+TENSORBOARD_DIR=${OUTPUT_DIR}/tensorboard
+mkdir -p "${CHECKPOINT_DIR}" "${TENSORBOARD_DIR}" "${DATA_CACHE_PATH}" /tmp/checkpoints
+
+echo "Using Megatron-LM-FL repo: ${MCORE_REPO_URL}"
+echo "Using Megatron-LM-FL ref: ${MCORE_REF}"
+git -C "${MCORE_PATH}" rev-parse --short HEAD
 
-# Megatron-LM invocation
+# Megatron-LM-FL invocation. Keep the argument shape aligned with the
+# previously validated tp1/pp1 mock-data GPT functional case while letting CI
+# exit after a few steps.
 COMMAND="
 NVTE_TORCH_COMPILE=0
 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
-NVTE_FLASH_ATTN=1
-NVTE_FWD_LAYERNORM_SM_MARGIN=0
-NVTE_BWD_LAYERNORM_SM_MARGIN=0
 CUDA_DEVICE_MAX_CONNECTIONS=1
-NVTE_BIAS_GELU_NVFUSION=0
-NVTE_BIAS_DROPOUT_FUSION=0
+NCCL_ALGO=Ring
+CUBLAS_WORKSPACE_CONFIG=:4096:8
 
-python3
--m torch.distributed.launch
---use_env
+torchrun
+--standalone
 --nnodes=1
 --nproc_per_node=1
 
 ${MCORE_PATH}/pretrain_gpt.py
 --tensor-model-parallel-size 1
 --pipeline-model-parallel-size 1
---use-cpu-initialization
---num-layers 2
---hidden-size 128
+--num-layers 12
+--hidden-size 512
 --num-attention-heads 8
---seq-length 128
---max-position-embeddings 128
---micro-batch-size 1
---global-batch-size 8
---train-iters 10
+--log-params-norm
+--log-num-zeros-in-grad
+--log-validation-ppl-to-tensorboard
+--log-timers-to-tensorboard
+--seq-length 1024
+--max-position-embeddings 1024
+--micro-batch-size 4
+--global-batch-size 32
+--train-iters 50
 --eval-iters 10
---lr 1e-4
+--timing-log-level 0
+--lr-decay-iters 320000
+--save ${CHECKPOINT_DIR}
+--split 949,50,1
+--tokenizer-type NullTokenizer
+--vocab-size 8192
 --mock-data
---vocab-file ${VOCAB_FILE}
---merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
+--distributed-backend nccl
+--lr 0.00015
+--lr-decay-style cosine
+--min-lr 1.0e-5
+--weight-decay 1e-2
+--clip-grad 1.0
+--lr-warmup-fraction .01
+--log-interval 1
+--save-interval 10000
+--eval-interval 1000
 --transformer-impl transformer_engine
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--deterministic-mode
+--no-gradient-accumulation-fusion
+--attention-softmax-in-fp32
+--use-mcore-models
+--ckpt-format torch_dist
+--dist-ckpt-optim-fully-reshardable
+--dist-ckpt-strictness log_all
+--data-cache-path ${DATA_CACHE_PATH}
+--bf16
+--attention-backend unfused
+--log-memory-to-tensorboard
+--tensorboard-dir ${TENSORBOARD_DIR}
+--exit-interval 4
 ${WITH_FP8:+--fp8-format hybrid}
 "
 COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
 
-# Launch Megatron-LM
+# Launch Megatron-LM-FL
 bash -c "${COMMAND}"
diff --git a/qa/L1_pytorch_mcore_integration/test_bak.sh b/qa/L1_pytorch_mcore_integration/test_bak.sh
new file mode 100644
index 0000000000..ec0b47b695
--- /dev/null
+++ b/qa/L1_pytorch_mcore_integration/test_bak.sh
@@ -0,0 +1,79 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+# Paths
+: ${TE_PATH:=/opt/transformerengine}
+: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
+
+# Check whether FP8 is supported
+DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+    WITH_FP8=1
+fi
+
+# Download Megatron-LM if needed
+if [ ! -d "${MCORE_PATH}" ]; then
+    pushd $(dirname ${MCORE_PATH})
+    git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    popd
+fi
+
+# Megatron tokenizer import chain pulls in bert_tokenization at module import
+# time, which unconditionally depends on `six`.
+python3 - <<'PY' || python3 -m pip install --disable-pip-version-check six
+import six
+print(f"six available: {six.__version__}")
+PY
+
+# Create mock vocab
+VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
+printf "" > ${VOCAB_FILE}
+printf "{" >> ${VOCAB_FILE}
+printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
+seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
+printf "}" >> ${VOCAB_FILE}
+
+# Megatron-LM invocation
+COMMAND="
+NVTE_TORCH_COMPILE=0
+NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+NVTE_FLASH_ATTN=1
+NVTE_FWD_LAYERNORM_SM_MARGIN=0
+NVTE_BWD_LAYERNORM_SM_MARGIN=0
+CUDA_DEVICE_MAX_CONNECTIONS=1
+NVTE_BIAS_GELU_NVFUSION=0
+NVTE_BIAS_DROPOUT_FUSION=0
+
+python3
+-m torch.distributed.launch
+--use_env
+--nnodes=1
+--nproc_per_node=1
+
+${MCORE_PATH}/pretrain_gpt.py
+--tensor-model-parallel-size 1
+--pipeline-model-parallel-size 1
+--use-cpu-initialization
+--num-layers 2
+--hidden-size 128
+--num-attention-heads 8
+--seq-length 128
+--max-position-embeddings 128
+--micro-batch-size 1
+--global-batch-size 8
+--train-iters 10
+--eval-iters 10
+--lr 1e-4
+--mock-data
+--vocab-file ${VOCAB_FILE}
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
+--transformer-impl transformer_engine
+${WITH_FP8:+--fp8-format hybrid}
+"
+COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
+
+# Launch Megatron-LM
+bash -c "${COMMAND}"
diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index f67b5d2470..e0fb0b7209 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -367,6 +367,38 @@ def _load_nvrtc():
     return ctypes.CDLL(f"libnvrtc{_get_sys_extension()}", mode=ctypes.RTLD_GLOBAL)
 
 
+@functools.lru_cache(maxsize=None)
+def _load_cudart():
+    """Load CUDA runtime shared library."""
+    # Attempt to locate CUDA runtime in CUDA_HOME, CUDA_PATH or /usr/local/cuda
+    cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or "/usr/local/cuda"
+    libs = glob.glob(f"{cuda_home}/**/libcudart{_get_sys_extension()}*", recursive=True)
+    libs = list(filter(lambda x: not ("stub" in x), libs))
+    libs.sort(reverse=True, key=os.path.basename)
+    if libs:
+        return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
+
+    # Attempt to locate CUDA runtime in Python dist-packages
+    found, handle = _load_nvidia_cuda_library("cuda_runtime")
+    if found:
+        return handle
+
+    # Attempt to locate CUDA runtime via ldconfig
+    libs = subprocess.check_output(
+        f"ldconfig -p | grep 'libcudart{_get_sys_extension()}'", shell=True
+    )
+    libs = libs.decode("utf-8").split("\n")
+    sos = []
+    for lib in libs:
+        if "libcudart" in lib and "=>" in lib:
+            sos.append(lib.split(">")[1].strip())
+    if sos:
+        return ctypes.CDLL(sos[0], mode=ctypes.RTLD_GLOBAL)
+
+    # If all else fails, assume that it is in LD_LIBRARY_PATH and error out otherwise
+    return ctypes.CDLL(f"libcudart{_get_sys_extension()}", mode=ctypes.RTLD_GLOBAL)
+
+
 @functools.lru_cache(maxsize=None)
 def _load_curand():
     """Load cuRAND shared library."""
@@ -412,9 +444,9 @@ def _load_core_library():
     if not skip_cuda_build():
         _CUDNN_LIB_CTYPES = _load_cudnn()
         _NVRTC_LIB_CTYPES = _load_nvrtc()
+        _CUDART_LIB_CTYPES = _load_cudart()
         _CURAND_LIB_CTYPES = _load_curand()
         _CUBLAS_LIB_CTYPES = _load_nvidia_cuda_library("cublas")
-        _CUDART_LIB_CTYPES = _load_nvidia_cuda_library("cuda_runtime")
         _TE_LIB_CTYPES = _load_core_library()
 
         # Needed to find the correct headers for NVRTC kernels.
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 4309cc4a2e..2d0ae3c936 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -7,6 +7,8 @@
 import torch
 from ....ops import *
 
+_cuda_lib_handles: List[Any] = []
+
 
 def _load_cuda_libs():
     import ctypes
@@ -24,6 +26,13 @@ def get_ext():
 
     ext = get_ext()
 
+    python_nvidia_pkg = {
+        "cudnn": "cudnn",
+        "cudart": "cuda_runtime",
+        "nvrtc": "cuda_nvrtc",
+        "curand": "curand",
+    }
+
     def try_load_lib(name, search_patterns):
         for env_var in [f"{name.upper()}_HOME", f"{name.upper()}_PATH"]:
             path = os.environ.get(env_var)
@@ -36,6 +45,20 @@ def try_load_lib(name, search_patterns):
                     except:
                         pass
 
+        pkg_name = python_nvidia_pkg.get(name)
+        if pkg_name:
+            purelib = Path(sysconfig.get_path("purelib"))
+            libs = glob_module.glob(
+                str(purelib / "nvidia" / pkg_name / "**" / f"lib{name}{ext}*"),
+                recursive=True,
+            )
+            if libs:
+                libs.sort(reverse=True, key=os.path.basename)
+                try:
+                    return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
+                except:
+                    pass
+
         cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or "/usr/local/cuda"
         for pattern in search_patterns:
             libs = glob_module.glob(f"{cuda_home}/**/{pattern}", recursive=True)
@@ -61,10 +84,19 @@ def try_load_lib(name, search_patterns):
         except:
             return None
 
+    global _cuda_lib_handles
+
     try:
-        try_load_lib("cudnn", [f"libcudnn{ext}*"])
-        try_load_lib("nvrtc", [f"libnvrtc{ext}*"])
-        try_load_lib("curand", [f"libcurand{ext}*"])
+        handles = []
+        for name, patterns in [
+            ("cudnn", [f"libcudnn{ext}*"]),
+            ("cudart", [f"libcudart{ext}*"]),
+            ("nvrtc", [f"libnvrtc{ext}*"]),
+            ("curand", [f"libcurand{ext}*"]),
+        ]:
+            handle = try_load_lib(name, patterns)
+            if handle is not None:
+                handles.append(handle)
 
         te_path_override = os.environ.get("TE_LIB_PATH")
         if te_path_override:
@@ -75,7 +107,8 @@ def try_load_lib(name, search_patterns):
             if search_dir.exists():
                 matches = list(search_dir.glob(f"libtransformer_engine{ext}*"))
                 if matches:
-                    ctypes.CDLL(str(matches[0]), mode=ctypes.RTLD_GLOBAL)
+                    handles.append(ctypes.CDLL(str(matches[0]), mode=ctypes.RTLD_GLOBAL))
+                    _cuda_lib_handles = handles
                     return True
         return False
     except Exception as e:

From 95c00ff99589f9ef186b06a18fc6a230c406d6f5 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Fri, 17 Apr 2026 15:30:35 +0800
Subject: [PATCH 02/25] remove duplicate scripts

---
 .../workflows/integration_tests_common.yml    | 62 +------------------
 1 file changed, 1 insertion(+), 61 deletions(-)

diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index 70d0ca829d..fdc8e68f84 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -116,64 +116,4 @@ jobs:
           echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
           bash ${{ matrix.test_group.path }}
         timeout-minutes: 30
-      
-      # - name: Execute Tests
-      #   if: inputs.setup_script != ''
-      #   working-directory: ${{ github.workspace }}
-      #   run: |
-      #     set -euo pipefail
-
-      #     # Load platform-specific environment variables
-      #     while IFS='=' read -r key value; do
-      #       [ -n "$key" ] && export "$key=$value"
-      #     done < <(echo '${{ inputs.build_env }}' | python3 -c "
-      #     import json, sys
-      #     env = json.load(sys.stdin)
-      #     for k, v in env.items():
-      #         print(f'{k}={v}')
-      #     ")
-
-      #     export TE_PATH=$GITHUB_WORKSPACE
-      #     export TE_LIB_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])")
-      #     export PYTHONPATH=$GITHUB_WORKSPACE:${PYTHONPATH:-}
-      #     export PATH=${CUDA_HOME:-/usr/local/cuda}/bin:$PATH
-      #     export LD_LIBRARY_PATH=${CUDA_HOME:-/usr/local/cuda}/lib:${LD_LIBRARY_PATH:-}
-          
-      #     # check envs before running tests
-      #     echo "TE_PATH=$TE_PATH"
-      #     echo "TE_LIB_PATH=$TE_LIB_PATH"
-      #     echo "PYTHONPATH=$PYTHONPATH"
-      #     echo "PATH=$PATH"
-      #     echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
-          
-      #     # Ensure log directory exists regardless of volume mount state
-      #     mkdir -p /logs
-
-      #     # Coverage setup: install once + configure collection via PYTEST_ADDOPTS
-      #     COVERAGE_ENABLED=false
-      #     if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
-      #       export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
-      #       COVERAGE_ENABLED=true
-      #     else
-      #       echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
-      #     fi
-
-      #     if [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
-      #       # Fail fast on backend/API mismatch before running the full test group.
-      #       # Skip for debug group (does not use FP8/optimizer symbols).
-      #       python3 -c "import sys, importlib; import transformer_engine.common as _te_common; tex = importlib.import_module('transformer_engine_torch'); required=['multi_tensor_scale','multi_tensor_compute_scale_and_scale_inv']; missing=[n for n in required if not hasattr(tex, n)]; print('[TE check] module:', tex); print('[TE check] file:', getattr(tex, '__file__', 'N/A')); print('[TE check] missing:', ', '.join(missing) if missing else 'none'); sys.exit(1 if missing else 0)"
-      #     fi
-
-      #     bash ${{ matrix.test_group.path }}
-      #     exit_code=$?
-
-      #     # Combine coverage fragments and generate JSON report
-      #     if [ "$COVERAGE_ENABLED" = "true" ]; then
-      #       python3 -m coverage combine --keep 2>/dev/null || true
-      #       python3 -m coverage json \
-      #         -o "coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json" \
-      #         --include="transformer_engine/*" 2>/dev/null \
-      #         || echo "WARNING: No coverage data found"
-      #     fi
-      #     exit $exit_code
-      #   timeout-minutes: 60
\ No newline at end of file
+        
\ No newline at end of file

From 263a761956745a33b052d9bf12878c46281e9de3 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Sat, 18 Apr 2026 11:31:36 +0800
Subject: [PATCH 03/25] restore nv original ymls

---
 .github/workflows/blossom-ci.yml          |  86 ++++++++++++++++++
 .github/workflows/deploy_nightly_docs.yml |  39 +++++++++
 .github/workflows/qa-format.yml           |  32 +++++++
 .github/workflows/trigger-ci.yml          | 102 ++++++++++++++++++++++
 .github/workflows/upload-ci-logs.yml      |  52 +++++++++++
 5 files changed, 311 insertions(+)
 create mode 100644 .github/workflows/blossom-ci.yml
 create mode 100644 .github/workflows/deploy_nightly_docs.yml
 create mode 100644 .github/workflows/qa-format.yml
 create mode 100644 .github/workflows/trigger-ci.yml
 create mode 100644 .github/workflows/upload-ci-logs.yml

diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
new file mode 100644
index 0000000000..cc2f9eb9a8
--- /dev/null
+++ b/.github/workflows/blossom-ci.yml
@@ -0,0 +1,86 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger ci on hybrid infra (github + self hosted runner)
+
+# DISABLED in FlagOS
+name: Blossom-CI
+on:
+  issue_comment:
+    types: [__disabled_do_not_remove__]
+  workflow_dispatch:
+      inputs:
+          platform:
+            description: 'runs-on argument'
+            required: false
+          args:
+            description: 'argument'
+            required: false
+jobs:
+  Authorization:
+    name: Authorization
+    runs-on: blossom
+    outputs:
+      args: ${{ env.args }}
+
+    # This job only runs for pull request comments
+    if: >
+         github.event.comment.body == '/blossom-ci'
+         && (
+           github.actor == 'ptrendx'
+           || github.actor == 'ksivaman'
+         )
+    steps:
+      - name: Check if comment is issued by authorized person
+        run: blossom-ci
+        env:
+          OPERATION: 'AUTH'
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+
+  Vulnerability-scan:
+    name: Vulnerability scan
+    needs: [Authorization]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
+          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
+          lfs: 'true'
+
+      - name: Run blossom action
+        uses: NVIDIA/blossom-action@main
+        env:
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+        with:
+          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
+          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
+          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
+
+  Job-trigger:
+    name: Start ci job
+    needs: [Vulnerability-scan]
+    runs-on: blossom
+    steps:
+      - name: Start ci job
+        run: blossom-ci
+        env:
+          OPERATION: 'START-CI-JOB'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  Upload-Log:
+    name: Upload log
+    runs-on: blossom
+    if : github.event_name == 'workflow_dispatch'
+    steps:
+      - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)
+        run: blossom-ci
+        env:
+          OPERATION: 'POST-PROCESSING'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/deploy_nightly_docs.yml b/.github/workflows/deploy_nightly_docs.yml
new file mode 100644
index 0000000000..38a3e1dbc2
--- /dev/null
+++ b/.github/workflows/deploy_nightly_docs.yml
@@ -0,0 +1,39 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to deploy the nightly version of TE documentation to GitHub Pages
+name: Deploy nightly docs
+on:
+  push:
+    branches: [ "__disabled_do_not_remove__" ]
+jobs:
+  build:
+    uses: ./.github/workflows/docs.yml
+
+  prepare:
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+        with:
+            name: "te_docs"
+            path: "html"
+      - name: Prepare for pages
+        uses: actions/upload-pages-artifact@v1.0.7
+        with:
+          name: github-pages
+          path: "html"
+  deploy:
+    needs: prepare
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    permissions:
+      pages: write
+      id-token: write
+    runs-on: ubuntu-latest
+    steps:
+    - name: Deploy
+      uses: actions/deploy-pages@v2.0.0
diff --git a/.github/workflows/qa-format.yml b/.github/workflows/qa-format.yml
new file mode 100644
index 0000000000..ff1cddf312
--- /dev/null
+++ b/.github/workflows/qa-format.yml
@@ -0,0 +1,32 @@
+name: format_check
+
+on:
+  pull_request:
+    branches: [ "main" ]
+    types: [opened, synchronize, reopened]
+
+jobs:
+  format:
+    runs-on: ubuntu-22.04
+    env:
+      PRID: ${{ github.event.pull_request.number }}
+      BRANCH: main
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.base.ref }}
+
+      - name: Merge PR to sub-branch
+        run: |
+          git fetch origin pull/${PRID}/merge
+          git checkout -b test FETCH_HEAD
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Run pre-commit
+        run: bash ./qa/format.sh
\ No newline at end of file
diff --git a/.github/workflows/trigger-ci.yml b/.github/workflows/trigger-ci.yml
new file mode 100644
index 0000000000..37754fbfb7
--- /dev/null
+++ b/.github/workflows/trigger-ci.yml
@@ -0,0 +1,102 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger ci on hybrid infra (github + self hosted runner)
+name: TE-CI Trigger
+on:
+  issue_comment:
+    types: [__disabled_do_not_remove__]
+jobs:
+  Authorization:
+    name: Authorization
+    runs-on: blossom
+    outputs:
+      args: ${{ env.args }}
+
+    # This job only runs for pull request comments
+    if: >
+         startsWith(github.event.comment.body, '/te-ci')
+         && (
+           github.actor == 'ptrendx'
+           || github.actor == 'ksivaman'
+           || github.actor == 'schetlur-nv'
+           || github.actor == 'timmoon10'
+           || github.actor == 'zlsh80826'
+           || github.actor == 'mingxu1067'
+           || github.actor == 'cyanguwa'
+           || github.actor == 'nzmora-nvidia'
+           || github.actor == 'galagam'
+           || github.actor == 'nouiz'
+           || github.actor == 'denera'
+           || github.actor == 'sudhakarsingh27'
+           || github.actor == 'Oleg-Goncharov'
+           || github.actor == 'phu0ngng'
+           || github.actor == 'xrennvidia'
+           || github.actor == 'yaox12'
+           || github.actor == 'huanghua1994'
+           || github.actor == 'mgoldfarb-nvidia'
+           || github.actor == 'pggPL'
+           || github.actor == 'vasunvidia'
+           || github.actor == 'erhoo82'
+           || github.actor == 'kocchop'
+           || github.actor == 'youngeunkwon0405'
+           || github.actor == 'KshitijLakhani'
+           || github.actor == 'jberchtold-nvidia'
+           || github.actor == 'sanandaraj5597'
+           || github.actor == 'negvet'
+           || github.actor == 'zhongbozhu'
+           || github.actor == 'kwyss-nvidia'
+           || github.actor == 'BestJuly'
+           || github.actor == 'xiaopoc'
+           || github.actor == 'jreiffers'
+           || github.actor == 'lhb8125'
+           || github.actor == 'kunlunl'
+           || github.actor == 'pstjohn'
+           || github.actor == 'vcherepanov-nv'
+           || github.actor == 'tdophung'
+           || github.actor == 'vthumbe1503'
+           || github.actor == 'janekb04'
+           || github.actor == 'shengfangd'
+         )
+    steps:
+      - name: Check if comment is issued by authorized person
+        run: blossom-ci
+        env:
+          OPERATION: 'AUTH'
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+
+  Vulnerability-scan:
+    name: Vulnerability scan
+    needs: [Authorization]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
+          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
+          lfs: 'true'
+
+      - name: Run blossom action
+        uses: NVIDIA/blossom-action@main
+        env:
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+        with:
+          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
+          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
+          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
+
+  Job-trigger:
+    name: Start ci job
+    needs: [Vulnerability-scan]
+    runs-on: blossom
+    steps:
+      - name: Start ci job
+        run: blossom-ci
+        env:
+          OPERATION: 'START-CI-JOB'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/upload-ci-logs.yml b/.github/workflows/upload-ci-logs.yml
new file mode 100644
index 0000000000..c9c7e4ef4d
--- /dev/null
+++ b/.github/workflows/upload-ci-logs.yml
@@ -0,0 +1,52 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger ci on hybrid infra (github + self hosted runner)
+name: TE-CI Logs
+on:
+  workflow_dispatch:
+      inputs:
+          platform:
+            description: 'runs-on argument'
+            required: false
+          args:
+            description: 'argument'
+            required: false
+          job_name:
+            description: 'name of the job'
+            required: true
+          commit_sha:
+            description: 'SHA of the commit that was tested.'
+            required: true
+          result:
+            description: 'Job result'
+            required: true
+run-name: PR ${{ fromJson(github.event.inputs.args).pr }} - ${{ inputs.job_name }}
+jobs:
+  Upload-Log:
+    name: Upload log
+    runs-on: blossom
+    steps:
+      - name: Log
+        run: blossom-ci
+        env:
+          OPERATION: 'POST-PROCESSING'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  status_update:
+    name: Update commit status
+    runs-on: ubuntu-latest
+    permissions:
+      statuses: write
+    needs: [Upload-Log]
+    if: ${{ always() }}
+    steps:
+      - name: Set status
+        run: |
+          curl \
+          -X POST \
+          -H "Accept: application/vnd.github+json" \
+          -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+          https://api.github.com/repos/${{ github.repository }}/statuses/${{ inputs.commit_sha }} \
+          -d "{\"state\":\"${{ inputs.result }}\",\"target_url\":\"${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\",\"description\":\"\",\"context\":\"te-ci/${{ inputs.job_name }}\"}"

From f4ae7cde63f4cfaac3bf755a9de8f477839a4b08 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Sat, 18 Apr 2026 15:44:22 +0800
Subject: [PATCH 04/25] modify all_tests to pipeline workflow

---
 .github/workflows/all_tests_common.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index e1b60c4cd9..51e37fc7fc 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -81,6 +81,8 @@ jobs:
 
   lint:
     name: lint
+    needs: 
+      - checkout_and_config
     uses: ./.github/workflows/lint_common.yml
 
   unit_tests:
@@ -126,6 +128,7 @@ jobs:
     needs:
       - checkout_and_config
       - lint
+      - unit_tests_complete
     uses: ./.github/workflows/integration_tests_common.yml
     with:
       platform: ${{ inputs.platform }}

From f45e9d0752a0cdab34528dd453227799d4653ce0 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Sat, 18 Apr 2026 15:51:48 +0800
Subject: [PATCH 05/25] remove custom lint

---
 .github/workflows/all_tests_common.yml | 15 ------
 .github/workflows/lint.yml             | 63 ++++++++++++++++++++++++++
 .github/workflows/lint_common.yml      | 47 -------------------
 3 files changed, 63 insertions(+), 62 deletions(-)
 create mode 100644 .github/workflows/lint.yml
 delete mode 100644 .github/workflows/lint_common.yml

diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index 51e37fc7fc..401d1e2cfd 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -79,18 +79,11 @@ jobs:
           BUILD_ENV=$(yq '.build_env // {} | tojson(0)' "$CONFIG_FILE")
           echo "build_env=$BUILD_ENV" >> $GITHUB_OUTPUT
 
-  lint:
-    name: lint
-    needs: 
-      - checkout_and_config
-    uses: ./.github/workflows/lint_common.yml
-
   unit_tests:
     name: unit_tests
     if: inputs.run_unit_tests
     needs: 
       - checkout_and_config
-      - lint
     strategy:
       fail-fast: false
       matrix:
@@ -127,7 +120,6 @@ jobs:
     if: inputs.run_integration_tests
     needs:
       - checkout_and_config
-      - lint
       - unit_tests_complete
     uses: ./.github/workflows/integration_tests_common.yml
     with:
@@ -162,7 +154,6 @@ jobs:
         shell: bash
     needs:
       - checkout_and_config
-      - lint
       - unit_tests_complete
       - integration_tests_complete
     runs-on: ubuntu-latest
@@ -173,12 +164,6 @@ jobs:
           # Check all test jobs (skip if not run)
           failed=false
 
-          if [ "${{ needs.lint.result }}" != "success" ] && \
-            [ "${{ needs.lint.result }}" != "skipped" ]; then
-            echo "❌ Lint failed"
-            failed=true
-          fi
-
           if [ "${{ needs.unit_tests_complete.result }}" != "success" ] && \
             [ "${{ needs.unit_tests_complete.result }}" != "skipped" ]; then
             echo "❌ Unit tests failed or cancelled: ${{ needs.unit_tests_complete.result }}"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000000..ee6433d484
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,63 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger lint tests on GitHub
+name: 'Lint'
+on:
+  pull_request:
+  workflow_dispatch:
+jobs:
+  pytorch_cpplint:
+    name: 'PyTorch C++'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          sudo apt-get install pip -y
+          export CPP_ONLY=1
+          export TE_PATH=.
+          bash ./qa/L0_pytorch_lint/test.sh
+  pytorch_pylint:
+    name: 'PyTorch Python'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          sudo apt-get install pip -y
+          pip install torch numpy
+          export PYTHON_ONLY=1
+          export TE_PATH=.
+          bash ./qa/L0_pytorch_lint/test.sh
+  jax_cpplint:
+    name: 'JAX C++'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          sudo apt-get install pip -y
+          export CPP_ONLY=1
+          export TE_PATH=.
+          bash ./qa/L0_jax_lint/test.sh
+  jax_pylint:
+    name: 'JAX Python'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          sudo apt-get install pip -y
+          export PYTHON_ONLY=1
+          export TE_PATH=.
+          bash ./qa/L0_jax_lint/test.sh
diff --git a/.github/workflows/lint_common.yml b/.github/workflows/lint_common.yml
deleted file mode 100644
index 850b93640b..0000000000
--- a/.github/workflows/lint_common.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-name: 'Lint'
-
-on:
-  workflow_call: 
-  workflow_dispatch:
-
-jobs:
-  pytorch_pylint:
-    name: 'pytorch_lint'
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Checkout (attempt 1)
-        id: checkout1
-        uses: actions/checkout@v4
-        continue-on-error: true
-        with:
-          fetch-depth: 0
-
-      - name: Checkout (attempt 2)
-        id: checkout2
-        if: steps.checkout1.outcome == 'failure'
-        uses: actions/checkout@v4
-        continue-on-error: true
-        with:
-          fetch-depth: 0
-
-      - name: Checkout (attempt 3)
-        if: steps.checkout2.outcome == 'failure'
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      
-      - name: Setup Python
-        run: |
-          sudo apt-get update -q
-          sudo apt-get install -y -q python3-pip
-          pip3 install torch numpy --quiet
-
-      - name: Run Lint
-        env:
-          TE_PATH: ${{ github.workspace }}
-          PYTHON_ONLY: '1'
-        run: bash ./qa/L0_pytorch_lint/test.sh
-        timeout-minutes: 15

From 25e1bf487bbcd76cb4f84fe27beeed478cd38a33 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Mon, 20 Apr 2026 15:09:49 +0800
Subject: [PATCH 06/25] remove unnecessary if condition

---
 .github/workflows/integration_tests_common.yml | 1 -
 .github/workflows/unit_tests_common.yml        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index fdc8e68f84..b37947eaf9 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -99,7 +99,6 @@ jobs:
           bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
 
       - name: Execute Tests
-        if: inputs.setup_script != ''
         env:
           TE_PATH: ${{ github.workspace }}
           TE_FL_PREFER: vendor
diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml
index 69b45aa5a5..b06f0413e7 100644
--- a/.github/workflows/unit_tests_common.yml
+++ b/.github/workflows/unit_tests_common.yml
@@ -105,7 +105,6 @@ jobs:
           bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
       
       - name: Execute Tests
-        if: inputs.setup_script != ''
         working-directory: ${{ github.workspace }}
         run: |
           set -euo pipefail

From d1711bcaa2cb41e4ade65f3989c81d984d807606 Mon Sep 17 00:00:00 2001
From: qqjxzxq <1376782660@qq.com>
Date: Tue, 21 Apr 2026 06:42:49 +0000
Subject: [PATCH 07/25] check conda & python

---
 .github/workflows/integration_tests_common.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index b37947eaf9..3ec406e7fb 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -107,12 +107,15 @@ jobs:
         run: |
           set -euo pipefail
 
-          # # Activate conda environment
-          # source /opt/miniconda3/etc/profile.d/conda.sh
-          # conda activate flagscale-train
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
           export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
 
           echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
+          python3 --version
+          pip list | grep -E "regex|six|torch" || true
+          
           bash ${{ matrix.test_group.path }}
         timeout-minutes: 30
         
\ No newline at end of file

From 4035436446c14761119be0bdd70a9262ec4832fe Mon Sep 17 00:00:00 2001
From: HermiaHuan <3081497279@qq.com>
Date: Tue, 21 Apr 2026 15:43:57 +0800
Subject: [PATCH 08/25] chore: clean debug leftovers and centralize metax
 ignore rules

---
 qa/L0_pytorch_debug_unittest/README.rst | 26 ++++++++++++++++
 qa/L0_pytorch_debug_unittest/test.sh    | 41 +++++++++++++++----------
 2 files changed, 51 insertions(+), 16 deletions(-)
 create mode 100644 qa/L0_pytorch_debug_unittest/README.rst

diff --git a/qa/L0_pytorch_debug_unittest/README.rst b/qa/L0_pytorch_debug_unittest/README.rst
new file mode 100644
index 0000000000..2ba6e9fb0c
--- /dev/null
+++ b/qa/L0_pytorch_debug_unittest/README.rst
@@ -0,0 +1,26 @@
+L0 PyTorch Debug Unittest
+=========================
+
+This directory contains the L0 PyTorch debug unittest runner.
+
+MetaX ignore rules
+------------------
+
+MetaX-specific ignored tests are maintained in one place in ``test.sh`` through
+the ``METAX_IGNORED_TESTS`` list.
+
+The main execution flow only calls a helper to decide whether a test should be
+skipped, instead of embedding platform-specific matching rules directly in the
+main logic.
+
+This keeps the script easier to maintain and makes it simpler to add new
+ignored cases later if needed.
+
+How to extend
+-------------
+
+If a new test needs to be skipped on MetaX:
+
+1. Add the full test path to ``METAX_IGNORED_TESTS`` in ``test.sh``.
+2. Avoid adding new platform-specific matching logic directly into the main
+   execution flow.
\ No newline at end of file
diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
index 916eecdcca..2ab7340986 100644
--- a/qa/L0_pytorch_debug_unittest/test.sh
+++ b/qa/L0_pytorch_debug_unittest/test.sh
@@ -7,11 +7,7 @@
 : ${TE_PATH:=/opt/transformerengine}
 : ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
 : ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/}
-# export PLATFORM="metax"
-# export TE_PATH=/workspace/TransformerEngine-FL
-# export NVTE_TEST_NVINSPECT_FEATURE_DIRS=$TE_PATH/transformer_engine/debug/features
-# export NVTE_TEST_NVINSPECT_CONFIGS_DIR=$TE_PATH/tests/pytorch/debug/test_configs/
-# export PYTHONPATH=$TE_PATH/tests/pytorch:$TE_PATH:$PYTHONPATH
+
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
@@ -25,24 +21,37 @@ FAIL=0
 # because it is not available on PyPI.
 pip install pytest==8.2.1
 
+METAX_IGNORED_TESTS=(
+    "$TE_PATH/tests/pytorch/test_numerics.py"
+    "$TE_PATH/tests/pytorch/test_sanity.py"
+)
+
+should_skip_on_metax() {
+    local test_path=$1
+
+    [ "$PLATFORM" = "metax" ] || return 1
+
+    local ignored_test
+    for ignored_test in "${METAX_IGNORED_TESTS[@]}"; do
+        if [ "$test_path" = "$ignored_test" ]; then
+            echo "[SKIP] Platform MetaX: Ignoring $test_path"
+            return 0
+        fi
+    done
+
+    return 1
+}
+
+
 run_test_step() {
     local xml_file=$1
     local test_path=$2
     local cmd=$3
 
-
-    if [ "$PLATFORM" = "metax" ]; then
-        case "$test_path" in
-            *tests/pytorch/test_numerics.py | *tests/pytorch/test_sanity.py)
-                echo "-------------------------------------------------------"
-                echo "[SKIP] Platform MetaX: Ignoring $test_path"
-                echo "-------------------------------------------------------"
-                return 0
-                ;;
-        esac
+    if should_skip_on_metax "$test_path"; then
+        return 0
     fi
 
-
     echo "-------------------------------------------------------"
     echo "[RUN] Executing: $test_path"
     eval "$cmd" || FAIL=1

From e8951243ff1397e5191c0e482aa0e4f2ce3b144b Mon Sep 17 00:00:00 2001
From: qqjxzxq <1376782660@qq.com>
Date: Tue, 21 Apr 2026 08:09:44 +0000
Subject: [PATCH 09/25] turn back

---
 .github/workflows/integration_tests_common.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index 3ec406e7fb..d4d2dd1b53 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -108,14 +108,14 @@ jobs:
           set -euo pipefail
 
           # Activate conda environment
-          source /opt/miniconda3/etc/profile.d/conda.sh
-          conda activate flagscale-train
+          # source /opt/miniconda3/etc/profile.d/conda.sh
+          # conda activate flagscale-train
           export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
 
           echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
-          python3 --version
-          pip list | grep -E "regex|six|torch" || true
-          
+          # python3 --version
+          # pip list | grep -E "regex|six|torch" || true
+
           bash ${{ matrix.test_group.path }}
         timeout-minutes: 30
         
\ No newline at end of file

From 5e281e8250eeef2790d494144eb3c0ae0097b3bb Mon Sep 17 00:00:00 2001
From: qqjxzxq <1376782660@qq.com>
Date: Wed, 22 Apr 2026 02:36:43 +0000
Subject: [PATCH 10/25] add network config

---
 .github/workflows/integration_tests_common.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index d4d2dd1b53..c24706c41c 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -104,6 +104,11 @@ jobs:
           TE_FL_PREFER: vendor
           MCORE_REPO_URL: https://github.com/BrianPei/Megatron-LM-FL.git
           MCORE_REF: dev
+          MASTER_ADDR: "127.0.0.1"
+          MASTER_PORT: "39821"
+          NCCL_SOCKET_IFNAME: "lo"
+          GLOO_SOCKET_IFNAME: "lo"
+          TP_SOCKET_IFNAME: "lo"
         run: |
           set -euo pipefail
 

From 77209759e37fe3bba0d6ddc591af24da79f8bcc7 Mon Sep 17 00:00:00 2001
From: qqjxzxq <1376782660@qq.com>
Date: Wed, 22 Apr 2026 03:03:47 +0000
Subject: [PATCH 11/25] set network again

---
 .github/workflows/integration_tests_common.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index c24706c41c..8e76ac6335 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -103,6 +103,7 @@ jobs:
           TE_PATH: ${{ github.workspace }}
           TE_FL_PREFER: vendor
           MCORE_REPO_URL: https://github.com/BrianPei/Megatron-LM-FL.git
+          # network set
           MCORE_REF: dev
           MASTER_ADDR: "127.0.0.1"
           MASTER_PORT: "39821"

From dc22a22b8d6db5ac6ce7d2d4539263a5f3f91423 Mon Sep 17 00:00:00 2001
From: HermiaHuan <3081497279@qq.com>
Date: Wed, 22 Apr 2026 11:03:37 +0800
Subject: [PATCH 12/25] revert: keep cudart workaround at test layer

---
 transformer_engine/common/__init__.py         | 34 +--------------
 .../plugin/core/backends/vendor/cuda/cuda.py  | 41 ++-----------------
 2 files changed, 5 insertions(+), 70 deletions(-)

diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index e0fb0b7209..53cf64b43c 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -367,38 +367,6 @@ def _load_nvrtc():
     return ctypes.CDLL(f"libnvrtc{_get_sys_extension()}", mode=ctypes.RTLD_GLOBAL)
 
 
-@functools.lru_cache(maxsize=None)
-def _load_cudart():
-    """Load CUDA runtime shared library."""
-    # Attempt to locate CUDA runtime in CUDA_HOME, CUDA_PATH or /usr/local/cuda
-    cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or "/usr/local/cuda"
-    libs = glob.glob(f"{cuda_home}/**/libcudart{_get_sys_extension()}*", recursive=True)
-    libs = list(filter(lambda x: not ("stub" in x), libs))
-    libs.sort(reverse=True, key=os.path.basename)
-    if libs:
-        return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
-
-    # Attempt to locate CUDA runtime in Python dist-packages
-    found, handle = _load_nvidia_cuda_library("cuda_runtime")
-    if found:
-        return handle
-
-    # Attempt to locate CUDA runtime via ldconfig
-    libs = subprocess.check_output(
-        f"ldconfig -p | grep 'libcudart{_get_sys_extension()}'", shell=True
-    )
-    libs = libs.decode("utf-8").split("\n")
-    sos = []
-    for lib in libs:
-        if "libcudart" in lib and "=>" in lib:
-            sos.append(lib.split(">")[1].strip())
-    if sos:
-        return ctypes.CDLL(sos[0], mode=ctypes.RTLD_GLOBAL)
-
-    # If all else fails, assume that it is in LD_LIBRARY_PATH and error out otherwise
-    return ctypes.CDLL(f"libcudart{_get_sys_extension()}", mode=ctypes.RTLD_GLOBAL)
-
-
 @functools.lru_cache(maxsize=None)
 def _load_curand():
     """Load cuRAND shared library."""
@@ -444,7 +412,7 @@ def _load_core_library():
     if not skip_cuda_build():
         _CUDNN_LIB_CTYPES = _load_cudnn()
         _NVRTC_LIB_CTYPES = _load_nvrtc()
-        _CUDART_LIB_CTYPES = _load_cudart()
+        _CUDART_LIB_CTYPES = _load_nvidia_cuda_library("cuda_runtime")
         _CURAND_LIB_CTYPES = _load_curand()
         _CUBLAS_LIB_CTYPES = _load_nvidia_cuda_library("cublas")
         _TE_LIB_CTYPES = _load_core_library()
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 2d0ae3c936..683e51d395 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -7,7 +7,6 @@
 import torch
 from ....ops import *
 
-_cuda_lib_handles: List[Any] = []
 
 
 def _load_cuda_libs():
@@ -16,7 +15,6 @@ def _load_cuda_libs():
     import subprocess
     from pathlib import Path
     import importlib.util
-    import sysconfig
     import platform
     import glob as glob_module
 
@@ -26,13 +24,6 @@ def get_ext():
 
     ext = get_ext()
 
-    python_nvidia_pkg = {
-        "cudnn": "cudnn",
-        "cudart": "cuda_runtime",
-        "nvrtc": "cuda_nvrtc",
-        "curand": "curand",
-    }
-
     def try_load_lib(name, search_patterns):
         for env_var in [f"{name.upper()}_HOME", f"{name.upper()}_PATH"]:
             path = os.environ.get(env_var)
@@ -45,20 +36,6 @@ def try_load_lib(name, search_patterns):
                     except:
                         pass
 
-        pkg_name = python_nvidia_pkg.get(name)
-        if pkg_name:
-            purelib = Path(sysconfig.get_path("purelib"))
-            libs = glob_module.glob(
-                str(purelib / "nvidia" / pkg_name / "**" / f"lib{name}{ext}*"),
-                recursive=True,
-            )
-            if libs:
-                libs.sort(reverse=True, key=os.path.basename)
-                try:
-                    return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
-                except:
-                    pass
-
         cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or "/usr/local/cuda"
         for pattern in search_patterns:
             libs = glob_module.glob(f"{cuda_home}/**/{pattern}", recursive=True)
@@ -84,19 +61,10 @@ def try_load_lib(name, search_patterns):
         except:
             return None
 
-    global _cuda_lib_handles
-
     try:
-        handles = []
-        for name, patterns in [
-            ("cudnn", [f"libcudnn{ext}*"]),
-            ("cudart", [f"libcudart{ext}*"]),
-            ("nvrtc", [f"libnvrtc{ext}*"]),
-            ("curand", [f"libcurand{ext}*"]),
-        ]:
-            handle = try_load_lib(name, patterns)
-            if handle is not None:
-                handles.append(handle)
+        try_load_lib("cudnn", [f"libcudnn{ext}*"])
+        try_load_lib("nvrtc", [f"libnvrtc{ext}*"])
+        try_load_lib("curand", [f"libcurand{ext}*"])
 
         te_path_override = os.environ.get("TE_LIB_PATH")
         if te_path_override:
@@ -107,8 +75,7 @@ def try_load_lib(name, search_patterns):
             if search_dir.exists():
                 matches = list(search_dir.glob(f"libtransformer_engine{ext}*"))
                 if matches:
-                    handles.append(ctypes.CDLL(str(matches[0]), mode=ctypes.RTLD_GLOBAL))
-                    _cuda_lib_handles = handles
+                    ctypes.CDLL(str(matches[0]), mode=ctypes.RTLD_GLOBAL)
                     return True
         return False
     except Exception as e:

From db6a45908c9597d53db49d78cce87d8a5798cbf6 Mon Sep 17 00:00:00 2001
From: HermiaHuan <3081497279@qq.com>
Date: Wed, 22 Apr 2026 15:37:25 +0800
Subject: [PATCH 13/25] chore: restore original cudart load ordering

---
 transformer_engine/common/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index 53cf64b43c..f67b5d2470 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -412,9 +412,9 @@ def _load_core_library():
     if not skip_cuda_build():
         _CUDNN_LIB_CTYPES = _load_cudnn()
         _NVRTC_LIB_CTYPES = _load_nvrtc()
-        _CUDART_LIB_CTYPES = _load_nvidia_cuda_library("cuda_runtime")
         _CURAND_LIB_CTYPES = _load_curand()
         _CUBLAS_LIB_CTYPES = _load_nvidia_cuda_library("cublas")
+        _CUDART_LIB_CTYPES = _load_nvidia_cuda_library("cuda_runtime")
         _TE_LIB_CTYPES = _load_core_library()
 
         # Needed to find the correct headers for NVRTC kernels.

From 9467cce74ab9296d785bb119604b919ea65b4f7f Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Wed, 22 Apr 2026 16:38:17 +0800
Subject: [PATCH 14/25] disable original qa-l1 & qa-l3 workflow

---
 .../qa-l0-te-cpp-unittest-pytorch-lint.yml    |  1 -
 .../workflows/qa-l1-te-cpp-pytorch-tests.yml  | 29 +++----------------
 .../qa-l3-te-pytorch-fa-versions-test.yml     | 13 +++------
 3 files changed, 8 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
index 7b072d1d5e..f214990581 100644
--- a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
+++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
@@ -7,7 +7,6 @@ on:
   pull_request:
     branches:
       - __disabled_do_not_remove__
-  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
index e7b2cf97c0..ac97441368 100644
--- a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
+++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
@@ -2,32 +2,11 @@ name: QA L1 - Comprehensive Integration Tests
 
 on:
   push:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
-      - 'qa/L1_cpp_distributed/**'
-      - 'tests/cpp_distributed/**'
-      - 'qa/L1_pytorch_thunder_integration/**'
-      - 'qa/L1_pytorch_distributed_unittest/**'
-      - 'tests/pytorch/distributed/**'
-      - 'tests/pytorch/attention/**'
-      - 'qa/L1_pytorch_onnx_unittest/**'
-      - 'tests/pytorch/test_onnx_export.py'
-
+    branches:
+      - __disabled_do_not_remove__
   pull_request:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
-      - 'qa/L1_cpp_distributed/**'
-      - 'tests/cpp_distributed/**'
-      - 'qa/L1_pytorch_thunder_integration/**'
-      - 'qa/L1_pytorch_distributed_unittest/**'
-      - 'tests/pytorch/distributed/**'
-      - 'tests/pytorch/attention/**'
-      - 'qa/L1_pytorch_onnx_unittest/**'
-      - 'tests/pytorch/test_onnx_export.py'
-      
-  workflow_dispatch:
+    branches:
+      - __disabled_do_not_remove__
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
index 9a881dd2d9..bb3e0a73fe 100644
--- a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
+++ b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
@@ -3,16 +3,11 @@ name: QA L3 - Attention Tests
 
 on:
   push:
-    branches: __disable__
-    paths:
-      - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml'
-      - 'tests/pytorch/attention/test_attention.py'
-
+    branches: 
+      - __disabled_do_not_remove__
   pull_request:
-    branches: __disable__
-    paths:
-      - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml'
-      - 'tests/pytorch/attention/test_attention.py'
+    branches:
+      - __disabled_do_not_remove__
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}

From 24320f7fed110a24c4969efbcf2cf714a5d07086 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Wed, 22 Apr 2026 17:55:07 +0800
Subject: [PATCH 15/25] fix format_check

---
 .../plugin/core/backends/vendor/cuda/cuda.py  | 221 ++++++++++++++----
 1 file changed, 171 insertions(+), 50 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 683e51d395..38f73c0070 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -8,7 +8,6 @@
 from ....ops import *
 
 
-
 def _load_cuda_libs():
     import ctypes
     import os
@@ -20,7 +19,9 @@ def _load_cuda_libs():
 
     def get_ext():
         system = platform.system()
-        return ".so" if system == "Linux" else ".dylib" if system == "Darwin" else ".dll"
+        return (
+            ".so" if system == "Linux" else ".dylib" if system == "Darwin" else ".dll"
+        )
 
     ext = get_ext()
 
@@ -36,7 +37,11 @@ def try_load_lib(name, search_patterns):
                     except:
                         pass
 
-        cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or "/usr/local/cuda"
+        cuda_home = (
+            os.environ.get("CUDA_HOME")
+            or os.environ.get("CUDA_PATH")
+            or "/usr/local/cuda"
+        )
         for pattern in search_patterns:
             libs = glob_module.glob(f"{cuda_home}/**/{pattern}", recursive=True)
             if libs:
@@ -47,7 +52,9 @@ def try_load_lib(name, search_patterns):
                     pass
 
         try:
-            result = subprocess.check_output(f"ldconfig -p | grep 'lib{name}{ext}'", shell=True)
+            result = subprocess.check_output(
+                f"ldconfig -p | grep 'lib{name}{ext}'", shell=True
+            )
             for line in result.decode().split("\n"):
                 if f"lib{name}" in line and "=>" in line:
                     so_path = line.split(">")[1].strip()
@@ -70,7 +77,9 @@ def try_load_lib(name, search_patterns):
         if te_path_override:
             te_path = Path(te_path_override)
         else:
-            te_path = Path(importlib.util.find_spec("transformer_engine").origin).parent.parent
+            te_path = Path(
+                importlib.util.find_spec("transformer_engine").origin
+            ).parent.parent
         for search_dir in [te_path, te_path / "transformer_engine"]:
             if search_dir.exists():
                 matches = list(search_dir.glob(f"libtransformer_engine{ext}*"))
@@ -154,7 +163,9 @@ def get_attention_backend(self, attention_params=None):
                      fused_attention_backend, use_unfused_attention, available_backends)
         """
         # Import the original get_attention_backend function
-        from transformer_engine.pytorch.attention.dot_product_attention import utils as dpa_utils
+        from transformer_engine.pytorch.attention.dot_product_attention import (
+            utils as dpa_utils,
+        )
 
         return dpa_utils._original_get_attention_backend(attention_params)
 
@@ -169,7 +180,11 @@ def quantize(
         tex = self._get_tex()
         # Normalize quantizer.dtype to this backend's `tex.DType`.
         try:
-            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+            if (
+                quantizer is not None
+                and hasattr(quantizer, "dtype")
+                and hasattr(tex, "DType")
+            ):
                 qdtype = quantizer.dtype
                 if qdtype is not None:
                     quantizer.dtype = tex.DType(int(qdtype))
@@ -196,7 +211,11 @@ def bgrad_quantize(
 
         # Normalize quantizer.dtype to this backend's `tex.DType`.
         try:
-            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+            if (
+                quantizer is not None
+                and hasattr(quantizer, "dtype")
+                and hasattr(tex, "DType")
+            ):
                 qdtype = quantizer.dtype
                 if qdtype is not None:
                     quantizer.dtype = tex.DType(int(qdtype))
@@ -233,8 +252,12 @@ def generic_gemm(
         tex = self._get_tex()
 
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
-        comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
-        output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
+        comm_type = (
+            tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
+        )
+        output_dtype = (
+            tex.DType(int(output_dtype)) if output_dtype is not None else None
+        )
         return tex.generic_gemm(
             A,
             transA,
@@ -318,15 +341,21 @@ def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
 
-    def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+    def dgeglu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
 
-    def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+    def dqgelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
 
-    def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+    def dqgeglu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
 
@@ -335,15 +364,21 @@ def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
 
-    def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+    def dreglu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
 
-    def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+    def dsrelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
 
-    def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+    def dsreglu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
 
@@ -352,7 +387,9 @@ def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
 
-    def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+    def dswiglu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
 
@@ -368,15 +405,21 @@ def clamped_dswiglu(
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
 
     # DBias + DAct fusions #
-    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+    def dbias_dgelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
 
-    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+    def dbias_dsilu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
 
-    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+    def dbias_drelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
 
@@ -480,7 +523,9 @@ def scaled_masked_softmax_backward(
         scale_factor: float,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+        return tex.scaled_masked_softmax_backward(
+            output_grad_, softmax_results_, scale_factor
+        )
 
     def scaled_upper_triang_masked_softmax_forward(
         self,
@@ -536,7 +581,15 @@ def layernorm_fwd(
         tex = self._get_tex()
         otype = tex.DType(int(otype)) if otype is not None else None
         return tex.layernorm_fwd(
-            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+            input,
+            weight,
+            bias,
+            eps,
+            ln_out,
+            quantizer,
+            otype,
+            sm_margin,
+            zero_centered_gamma,
         )
 
     def layernorm_bwd(
@@ -550,7 +603,9 @@ def layernorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.layernorm_bwd(dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+        return tex.layernorm_bwd(
+            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
+        )
 
     def rmsnorm_fwd(
         self,
@@ -592,7 +647,9 @@ def rmsnorm_bwd_add(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.rmsnorm_bwd_add(dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma)
+        return tex.rmsnorm_bwd_add(
+            dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma
+        )
 
     def multi_tensor_quantize(
         self,
@@ -696,13 +753,21 @@ def get_fused_attn_backend(
 
         q_dtype = tex.DType(int(q_dtype)) if q_dtype is not None else None
         kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
-        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
-        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        qkv_layout = (
+            tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        )
+        bias_type = (
+            tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        )
         attn_mask_type = (
-            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+            tex.NVTE_Mask_Type(int(attn_mask_type))
+            if attn_mask_type is not None
+            else None
         )
         softmax_type = (
-            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+            tex.NVTE_Softmax_Type(int(softmax_type))
+            if softmax_type is not None
+            else None
         )
 
         result = tex.get_fused_attn_backend(
@@ -746,7 +811,12 @@ def fused_amax_and_scale_update_after_reduction(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.fused_amax_and_scale_update_after_reduction(
-            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
+            amax_reduction_buffer,
+            amax_histories,
+            scales,
+            amax_compute_algo,
+            fp8_dtype,
+            margin,
         )
 
     def fp8_block_scaling_compute_partial_amax(
@@ -788,7 +858,9 @@ def fused_multi_row_padding(
         padded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_padding(input, output, input_row_list, padded_input_row_list)
+        return tex.fused_multi_row_padding(
+            input, output, input_row_list, padded_input_row_list
+        )
 
     def fused_multi_row_unpadding(
         self,
@@ -798,7 +870,9 @@ def fused_multi_row_unpadding(
         unpadded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(input, output, input_row_list, unpadded_input_row_list)
+        return tex.fused_multi_row_unpadding(
+            input, output, input_row_list, unpadded_input_row_list
+        )
 
     # attention kernels
     def fa_prepare_fwd(
@@ -850,13 +924,21 @@ def fused_attn_fwd(
     ) -> List[Any]:
         tex = self._get_tex()
 
-        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
-        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        qkv_layout = (
+            tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        )
+        bias_type = (
+            tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        )
         attn_mask_type = (
-            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+            tex.NVTE_Mask_Type(int(attn_mask_type))
+            if attn_mask_type is not None
+            else None
         )
         softmax_type = (
-            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+            tex.NVTE_Softmax_Type(int(softmax_type))
+            if softmax_type is not None
+            else None
         )
 
         return tex.fused_attn_fwd(
@@ -921,13 +1003,21 @@ def fused_attn_bwd(
     ) -> List[Any]:
         tex = self._get_tex()
 
-        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
-        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        qkv_layout = (
+            tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        )
+        bias_type = (
+            tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        )
         attn_mask_type = (
-            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+            tex.NVTE_Mask_Type(int(attn_mask_type))
+            if attn_mask_type is not None
+            else None
         )
         softmax_type = (
-            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+            tex.NVTE_Softmax_Type(int(softmax_type))
+            if softmax_type is not None
+            else None
         )
         dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
 
@@ -977,7 +1067,9 @@ def copy_to_kv_cache(
         is_non_paged: bool,
     ) -> None:
         tex = self._get_tex()
-        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        qkv_format = (
+            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        )
         return tex.copy_to_kv_cache(
             new_k,
             new_v,
@@ -1026,9 +1118,18 @@ def fused_rope_forward(
         cp_rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        qkv_format = (
+            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        )
         return tex.fused_rope_forward(
-            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            input,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_rope_backward(
@@ -1042,7 +1143,9 @@ def fused_rope_backward(
         cp_rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        qkv_format = (
+            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        )
         return tex.fused_rope_backward(
             output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
@@ -1060,7 +1163,9 @@ def fused_qkv_rope_forward(
         cp_rank: int,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        qkv_format = (
+            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        )
         return tex.fused_qkv_rope_forward(
             qkv_input,
             q_freqs,
@@ -1087,7 +1192,9 @@ def fused_qkv_rope_backward(
         cp_rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        qkv_format = (
+            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        )
         return tex.fused_qkv_rope_backward(
             q_grad_out,
             k_grad_out,
@@ -1269,7 +1376,9 @@ def thd_second_half_lse_correction(
         lse_packed: bool,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(lse, lse_per_step, cu_seqlens, lse_packed)
+        return tex.thd_second_half_lse_correction(
+            lse, lse_per_step, cu_seqlens, lse_packed
+        )
 
     def thd_read_second_half_lse(
         self,
@@ -1279,7 +1388,9 @@ def thd_read_second_half_lse(
         second_half_lse_seqlen: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(lse, cu_seqlens, lse_packed, second_half_lse_seqlen)
+        return tex.thd_read_second_half_lse(
+            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
+        )
 
     def thd_out_correction(
         self,
@@ -1293,7 +1404,13 @@ def thd_out_correction(
     ) -> None:
         tex = self._get_tex()
         return tex.thd_out_correction(
-            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
+            out,
+            out_per_step,
+            lse,
+            lse_per_step,
+            cu_seqlens,
+            only_second_half,
+            lse_packed,
         )
 
     def thd_grad_correction(
@@ -1305,7 +1422,9 @@ def thd_grad_correction(
         second_half: str,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(grad, grad_per_step, cu_seqlens, first_half, second_half)
+        return tex.thd_grad_correction(
+            grad, grad_per_step, cu_seqlens, first_half, second_half
+        )
 
     def thd_get_partitioned_indices(
         self,
@@ -1315,7 +1434,9 @@ def thd_get_partitioned_indices(
         rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, world_size, rank)
+        return tex.thd_get_partitioned_indices(
+            cu_seqlens, total_tokens, world_size, rank
+        )
 
     # nvshmem functions
     def init_nvshmem_backend(

From 26c6df985523ae94b18918963c5f034bf7548816 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Wed, 22 Apr 2026 18:09:12 +0800
Subject: [PATCH 16/25] fix: apply black formatting with correct CI flags

---
 .../plugin/core/backends/vendor/cuda/cuda.py  | 182 +++++-------------
 1 file changed, 44 insertions(+), 138 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 38f73c0070..4045997666 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -19,9 +19,7 @@ def _load_cuda_libs():
 
     def get_ext():
         system = platform.system()
-        return (
-            ".so" if system == "Linux" else ".dylib" if system == "Darwin" else ".dll"
-        )
+        return ".so" if system == "Linux" else ".dylib" if system == "Darwin" else ".dll"
 
     ext = get_ext()
 
@@ -37,11 +35,7 @@ def try_load_lib(name, search_patterns):
                     except:
                         pass
 
-        cuda_home = (
-            os.environ.get("CUDA_HOME")
-            or os.environ.get("CUDA_PATH")
-            or "/usr/local/cuda"
-        )
+        cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or "/usr/local/cuda"
         for pattern in search_patterns:
             libs = glob_module.glob(f"{cuda_home}/**/{pattern}", recursive=True)
             if libs:
@@ -52,9 +46,7 @@ def try_load_lib(name, search_patterns):
                     pass
 
         try:
-            result = subprocess.check_output(
-                f"ldconfig -p | grep 'lib{name}{ext}'", shell=True
-            )
+            result = subprocess.check_output(f"ldconfig -p | grep 'lib{name}{ext}'", shell=True)
             for line in result.decode().split("\n"):
                 if f"lib{name}" in line and "=>" in line:
                     so_path = line.split(">")[1].strip()
@@ -77,9 +69,7 @@ def try_load_lib(name, search_patterns):
         if te_path_override:
             te_path = Path(te_path_override)
         else:
-            te_path = Path(
-                importlib.util.find_spec("transformer_engine").origin
-            ).parent.parent
+            te_path = Path(importlib.util.find_spec("transformer_engine").origin).parent.parent
         for search_dir in [te_path, te_path / "transformer_engine"]:
             if search_dir.exists():
                 matches = list(search_dir.glob(f"libtransformer_engine{ext}*"))
@@ -180,11 +170,7 @@ def quantize(
         tex = self._get_tex()
         # Normalize quantizer.dtype to this backend's `tex.DType`.
         try:
-            if (
-                quantizer is not None
-                and hasattr(quantizer, "dtype")
-                and hasattr(tex, "DType")
-            ):
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
                 qdtype = quantizer.dtype
                 if qdtype is not None:
                     quantizer.dtype = tex.DType(int(qdtype))
@@ -211,11 +197,7 @@ def bgrad_quantize(
 
         # Normalize quantizer.dtype to this backend's `tex.DType`.
         try:
-            if (
-                quantizer is not None
-                and hasattr(quantizer, "dtype")
-                and hasattr(tex, "DType")
-            ):
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
                 qdtype = quantizer.dtype
                 if qdtype is not None:
                     quantizer.dtype = tex.DType(int(qdtype))
@@ -252,12 +234,8 @@ def generic_gemm(
         tex = self._get_tex()
 
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
-        comm_type = (
-            tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
-        )
-        output_dtype = (
-            tex.DType(int(output_dtype)) if output_dtype is not None else None
-        )
+        comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
+        output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
             A,
             transA,
@@ -341,21 +319,15 @@ def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
 
-    def dgeglu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> Any:
+    def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
 
-    def dqgelu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> Any:
+    def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
 
-    def dqgeglu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> Any:
+    def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
 
@@ -364,21 +336,15 @@ def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
 
-    def dreglu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> Any:
+    def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
 
-    def dsrelu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> Any:
+    def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
 
-    def dsreglu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> Any:
+    def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
 
@@ -387,9 +353,7 @@ def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
 
-    def dswiglu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> Any:
+    def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
 
@@ -405,21 +369,15 @@ def clamped_dswiglu(
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
 
     # DBias + DAct fusions #
-    def dbias_dgelu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> List[Any]:
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
 
-    def dbias_dsilu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> List[Any]:
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
 
-    def dbias_drelu(
-        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
-    ) -> List[Any]:
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
 
@@ -523,9 +481,7 @@ def scaled_masked_softmax_backward(
         scale_factor: float,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_backward(
-            output_grad_, softmax_results_, scale_factor
-        )
+        return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
 
     def scaled_upper_triang_masked_softmax_forward(
         self,
@@ -603,9 +559,7 @@ def layernorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.layernorm_bwd(
-            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
-        )
+        return tex.layernorm_bwd(dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
 
     def rmsnorm_fwd(
         self,
@@ -647,9 +601,7 @@ def rmsnorm_bwd_add(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.rmsnorm_bwd_add(
-            dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma
-        )
+        return tex.rmsnorm_bwd_add(dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma)
 
     def multi_tensor_quantize(
         self,
@@ -753,21 +705,13 @@ def get_fused_attn_backend(
 
         q_dtype = tex.DType(int(q_dtype)) if q_dtype is not None else None
         kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
-        qkv_layout = (
-            tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
-        )
-        bias_type = (
-            tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        )
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
         attn_mask_type = (
-            tex.NVTE_Mask_Type(int(attn_mask_type))
-            if attn_mask_type is not None
-            else None
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
         )
         softmax_type = (
-            tex.NVTE_Softmax_Type(int(softmax_type))
-            if softmax_type is not None
-            else None
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
         )
 
         result = tex.get_fused_attn_backend(
@@ -858,9 +802,7 @@ def fused_multi_row_padding(
         padded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_padding(
-            input, output, input_row_list, padded_input_row_list
-        )
+        return tex.fused_multi_row_padding(input, output, input_row_list, padded_input_row_list)
 
     def fused_multi_row_unpadding(
         self,
@@ -870,9 +812,7 @@ def fused_multi_row_unpadding(
         unpadded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(
-            input, output, input_row_list, unpadded_input_row_list
-        )
+        return tex.fused_multi_row_unpadding(input, output, input_row_list, unpadded_input_row_list)
 
     # attention kernels
     def fa_prepare_fwd(
@@ -924,21 +864,13 @@ def fused_attn_fwd(
     ) -> List[Any]:
         tex = self._get_tex()
 
-        qkv_layout = (
-            tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
-        )
-        bias_type = (
-            tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        )
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
         attn_mask_type = (
-            tex.NVTE_Mask_Type(int(attn_mask_type))
-            if attn_mask_type is not None
-            else None
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
         )
         softmax_type = (
-            tex.NVTE_Softmax_Type(int(softmax_type))
-            if softmax_type is not None
-            else None
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
         )
 
         return tex.fused_attn_fwd(
@@ -1003,21 +935,13 @@ def fused_attn_bwd(
     ) -> List[Any]:
         tex = self._get_tex()
 
-        qkv_layout = (
-            tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
-        )
-        bias_type = (
-            tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        )
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
         attn_mask_type = (
-            tex.NVTE_Mask_Type(int(attn_mask_type))
-            if attn_mask_type is not None
-            else None
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
         )
         softmax_type = (
-            tex.NVTE_Softmax_Type(int(softmax_type))
-            if softmax_type is not None
-            else None
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
         )
         dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
 
@@ -1067,9 +991,7 @@ def copy_to_kv_cache(
         is_non_paged: bool,
     ) -> None:
         tex = self._get_tex()
-        qkv_format = (
-            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
-        )
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.copy_to_kv_cache(
             new_k,
             new_v,
@@ -1118,9 +1040,7 @@ def fused_rope_forward(
         cp_rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        qkv_format = (
-            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
-        )
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_forward(
             input,
             freqs,
@@ -1143,9 +1063,7 @@ def fused_rope_backward(
         cp_rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        qkv_format = (
-            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
-        )
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
             output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
@@ -1163,9 +1081,7 @@ def fused_qkv_rope_forward(
         cp_rank: int,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        qkv_format = (
-            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
-        )
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_forward(
             qkv_input,
             q_freqs,
@@ -1192,9 +1108,7 @@ def fused_qkv_rope_backward(
         cp_rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        qkv_format = (
-            tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
-        )
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_backward(
             q_grad_out,
             k_grad_out,
@@ -1376,9 +1290,7 @@ def thd_second_half_lse_correction(
         lse_packed: bool,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(
-            lse, lse_per_step, cu_seqlens, lse_packed
-        )
+        return tex.thd_second_half_lse_correction(lse, lse_per_step, cu_seqlens, lse_packed)
 
     def thd_read_second_half_lse(
         self,
@@ -1388,9 +1300,7 @@ def thd_read_second_half_lse(
         second_half_lse_seqlen: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(
-            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
-        )
+        return tex.thd_read_second_half_lse(lse, cu_seqlens, lse_packed, second_half_lse_seqlen)
 
     def thd_out_correction(
         self,
@@ -1422,9 +1332,7 @@ def thd_grad_correction(
         second_half: str,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(
-            grad, grad_per_step, cu_seqlens, first_half, second_half
-        )
+        return tex.thd_grad_correction(grad, grad_per_step, cu_seqlens, first_half, second_half)
 
     def thd_get_partitioned_indices(
         self,
@@ -1434,9 +1342,7 @@ def thd_get_partitioned_indices(
         rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(
-            cu_seqlens, total_tokens, world_size, rank
-        )
+        return tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, world_size, rank)
 
     # nvshmem functions
     def init_nvshmem_backend(

From fc2b4a3f48c1c9023ab707304ecbaba5757d6cea Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Thu, 23 Apr 2026 09:31:21 +0800
Subject: [PATCH 17/25] remove torchrun standalone for integration_tests

---
 .github/workflows/integration_tests_common.yml | 12 ++++++------
 qa/L1_pytorch_mcore_integration/test.sh        |  3 ++-
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index 8e76ac6335..b7766e7404 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -103,13 +103,13 @@ jobs:
           TE_PATH: ${{ github.workspace }}
           TE_FL_PREFER: vendor
           MCORE_REPO_URL: https://github.com/BrianPei/Megatron-LM-FL.git
-          # network set
           MCORE_REF: dev
-          MASTER_ADDR: "127.0.0.1"
-          MASTER_PORT: "39821"
-          NCCL_SOCKET_IFNAME: "lo"
-          GLOO_SOCKET_IFNAME: "lo"
-          TP_SOCKET_IFNAME: "lo"
+          # network set
+          # MASTER_ADDR: "127.0.0.1"
+          # MASTER_PORT: "39821"
+          # NCCL_SOCKET_IFNAME: "lo"
+          # GLOO_SOCKET_IFNAME: "lo"
+          # TP_SOCKET_IFNAME: "lo"
         run: |
           set -euo pipefail
 
diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh
index 913e7e6790..0ca0557801 100644
--- a/qa/L1_pytorch_mcore_integration/test.sh
+++ b/qa/L1_pytorch_mcore_integration/test.sh
@@ -50,6 +50,8 @@ fi
 # Download or sync Megatron-LM-FL to the requested repo/ref.
 if [ ! -d "${MCORE_PATH}" ]; then
     pushd $(dirname ${MCORE_PATH})
+    git config --global --unset-all credential.helper 2>/dev/null || true
+    git config --system --unset-all credential.helper 2>/dev/null || true
     retry_command 3 5 git clone --depth 1 -b "${MCORE_REF}" "${MCORE_REPO_URL}" $(basename ${MCORE_PATH})
     popd
 fi
@@ -89,7 +91,6 @@ NCCL_ALGO=Ring
 CUBLAS_WORKSPACE_CONFIG=:4096:8
 
 torchrun
---standalone
 --nnodes=1
 --nproc_per_node=1
 

From 3cc6661aa94762578fc8c8d9d824573a55594ecb Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Thu, 23 Apr 2026 09:55:29 +0800
Subject: [PATCH 18/25] updated build & plugin runner label

---
 .github/workflows/build.yml           | 2 +-
 .github/workflows/te-plugin-tests.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6c9c967950..ef6e10be72 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,7 +10,7 @@ on:
 jobs:
   pytorch:
     name: 'PyTorch'
-    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    runs-on: [ nv-8g-cicd-te ]
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/te-plugin-tests.yml b/.github/workflows/te-plugin-tests.yml
index f487673444..8b73c17833 100644
--- a/.github/workflows/te-plugin-tests.yml
+++ b/.github/workflows/te-plugin-tests.yml
@@ -18,7 +18,7 @@ concurrency:
 
 jobs:
   run-plugin-tests:
-    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    runs-on: [ nv-8g-cicd-te ]
     defaults:
       run:
         shell: bash

From 9401c7d589dbef91e4796739dd28b8798f2beb91 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Thu, 23 Apr 2026 11:50:29 +0800
Subject: [PATCH 19/25] fix cuda build scripts

---
 .github/workflows/build.yml           | 41 +++++++++++++++++++++------
 .github/workflows/te-plugin-tests.yml |  2 +-
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ef6e10be72..b7cc465c3b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -3,6 +3,7 @@
 # See LICENSE for license information.
 
 # A workflow to trigger TE build on GitHub
+
 name: 'Build'
 on:
   pull_request:
@@ -16,22 +17,46 @@ jobs:
         shell: bash
     container:
       image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
-      options: --user root
+      ports:
+        - 80:80
+      options: >-
+        --gpus all 
+        --shm-size=500g 
+        --privileged 
+        --ipc=host 
+        --ulimit memlock=-1 
+        --ulimit stack=67108864 
+        --ulimit nofile=65535:65535 
+        --user root
+        --pull never
     steps:
       - name: 'Checkout'
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
+          fetch-depth: 0
           submodules: recursive
-      - name: 'Build'
-        run:
+          set-safe-directory: true
+
+      - name: 'Setup Environment'
+        run: |
           source /opt/miniconda3/etc/profile.d/conda.sh
           conda activate flagscale-train
-          pip install --no-build-isolation . -v --no-deps
+          echo "PATH=$PATH" >> $GITHUB_ENV
+
+      - name: 'Build'
+        run:
+          pip uninstall transformer_engine transformer_engine_torch -y || true
+          cd $GITHUB_WORKSPACE
+          pip install nvdlfw-inspect --quiet
+          pip install expecttest --quiet
+          pip install . -v --no-deps --no-build-isolation
         env:
           NVTE_FRAMEWORK: pytorch
-          TE_WITH_NCCL: 1
+          TE_WITH_NCCL: '1'
+          NVTE_WITH_CUDA: '1'
+          CUDA_HOME: /usr/local/cuda-12.8
+          NVCC: /usr/local/cuda-12.8/bin/nvcc
+
       - name: 'Sanity check'
         run: 
-          source /opt/miniconda3/etc/profile.d/conda.sh
-          conda activate flagscale-train
           python3 tests/pytorch/test_sanity_import.py
diff --git a/.github/workflows/te-plugin-tests.yml b/.github/workflows/te-plugin-tests.yml
index 8b73c17833..9b640fcce8 100644
--- a/.github/workflows/te-plugin-tests.yml
+++ b/.github/workflows/te-plugin-tests.yml
@@ -35,7 +35,7 @@ jobs:
         --ulimit stack=67108864 
         --ulimit nofile=65535:65535 
         --user root
-        --pull always
+        --pull never
     steps:
       - name: Checkout Code
         uses: actions/checkout@v6.0.1

From 9983ed63c0e1bf3f91cb10eea5ffe8f5bb590bac Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Thu, 23 Apr 2026 12:05:45 +0800
Subject: [PATCH 20/25] Add clean vscode-remote-container step on metax

---
 .github/workflows/build.yml                    | 7 ++++---
 .github/workflows/integration_tests_common.yml | 7 +++++++
 .github/workflows/unit_tests_common.yml        | 7 +++++++
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b7cc465c3b..8696eba00f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -44,11 +44,12 @@ jobs:
           echo "PATH=$PATH" >> $GITHUB_ENV
 
       - name: 'Build'
-        run:
+        run: |
           pip uninstall transformer_engine transformer_engine_torch -y || true
+          echo "GITHUB_WORKSPACE=$GITHUB_WORKSPACE"
           cd $GITHUB_WORKSPACE
-          pip install nvdlfw-inspect --quiet
-          pip install expecttest --quiet
+          pip install nvdlfw-inspect
+          pip install expecttest
           pip install . -v --no-deps --no-build-isolation
         env:
           NVTE_FRAMEWORK: pytorch
diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index b7766e7404..8f2ff6605e 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -86,6 +86,13 @@ jobs:
           submodules: recursive
           set-safe-directory: true
 
+      # Metax requires to clean vscode-remote-container
+      - name: Configure Clean Git Env on Metax
+        if: inputs.platform == 'metax'
+        run: |
+          git config --global --unset-all credential.helper 2>/dev/null || true
+          git config --system --unset-all credential.helper 2>/dev/null || true
+
       # Metax no need submodules
       - name: Checkout Source Code on Metax
         if: inputs.platform == 'metax'
diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml
index b06f0413e7..fc441c52c9 100644
--- a/.github/workflows/unit_tests_common.yml
+++ b/.github/workflows/unit_tests_common.yml
@@ -92,6 +92,13 @@ jobs:
           submodules: recursive
           set-safe-directory: true
 
+      # Metax requires to clean vscode-remote-container
+      - name: Configure Clean Git Env on Metax
+        if: inputs.platform == 'metax'
+        run: |
+          git config --global --unset-all credential.helper 2>/dev/null || true
+          git config --system --unset-all credential.helper 2>/dev/null || true
+
       # Metax no need submodules
       - name: Checkout Source Code on Metax
         if: inputs.platform == 'metax'

From 0dcd0cec68f47b6b003c0c5c3906c559bef2960d Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Thu, 23 Apr 2026 13:43:26 +0800
Subject: [PATCH 21/25] fix depedences installation on Metax runner

---
 .github/scripts/setup_metax.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/setup_metax.sh b/.github/scripts/setup_metax.sh
index b05e0190b1..a2d0b0a4cf 100755
--- a/.github/scripts/setup_metax.sh
+++ b/.github/scripts/setup_metax.sh
@@ -21,8 +21,12 @@ ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
 which nvcc || true
 
 echo "===== Step 3: Install Required System Tools ====="
-# Install essential build tools (avoid modifying Python dependencies)
-apt-get update -qq && apt-get install -y -qq git cmake ninja-build curl
+# Use apt to install git, curl
+sed -i 's|http://mirrors.aliyun.com/ubuntu|http://archive.ubuntu.com/ubuntu|g' /etc/apt/sources.list
+apt-get update -qq || true
+apt-get install -y -qq git curl
+# Install cmake and ninja via pip (more reliable than apt in this env)
+python3 -m pip install cmake ninja torch --no-cache-dir
 
 echo "===== Step 4: Remove Existing TransformerEngine ====="
 # Prevent conflicts with preinstalled or incompatible versions

From 6f130c46b58e2e99331bddb232f8c9e813554af4 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Thu, 23 Apr 2026 15:10:42 +0800
Subject: [PATCH 22/25] set git safe directory for build

---
 .github/workflows/build.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 8696eba00f..2ef6d1893d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -30,6 +30,9 @@ jobs:
         --user root
         --pull never
     steps:
+      - name: Configure Git Safe Directory on Cuda
+        run: /usr/bin/git config --global safe.directory '*'
+
       - name: 'Checkout'
         uses: actions/checkout@v4
         with:

From 78a999a0e4a2a3da94e76ef1161acf3c6026fe5f Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Thu, 23 Apr 2026 15:48:07 +0800
Subject: [PATCH 23/25] integration_tests job add strategy metrix

---
 .github/workflows/all_tests_common.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index 401d1e2cfd..606a0d3e86 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -121,6 +121,10 @@ jobs:
     needs:
       - checkout_and_config
       - unit_tests_complete
+    strategy:
+      fail-fast: false
+      matrix:
+        device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
     uses: ./.github/workflows/integration_tests_common.yml
     with:
       platform: ${{ inputs.platform }}

From c72c2f915a8eb8f3918b38191770e8e1830ab49e Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Fri, 24 Apr 2026 15:08:15 +0800
Subject: [PATCH 24/25] change integration repo to flagOS megatron-LM-FL

---
 .github/configs/cuda.yml                         |  6 +++---
 .github/configs/metax.yml                        |  4 ++--
 .github/workflows/integration_tests_common.yml   | 10 ++--------
 .github/workflows/qa-l1-te-cpp-pytorch-tests.yml |  4 ++--
 qa/L1_pytorch_mcore_integration/test.sh          |  4 ++--
 5 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index fcfc2f1be8..1c77fe6c25 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -1,11 +1,11 @@
 # CUDA Hardware Configuration for TransformerEngine-FL
-# Refactored for BAAI DGX A100 Nodes
+# Refactored for A100 Nodes
 # This file defines environment variables, volumes, and test filters for TE tests.
 
 hardware_name: cuda
 display_name: 'NVIDIA CUDA (A100)'
 
-# CI image for BAAI env
+# CI image for online env
 ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
 
 # Runner labels for self-hosted A100 node
@@ -16,7 +16,7 @@ ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:202
 #   - nvidia
 #   - gpu-8
 
-# Runner labels for BAAI env
+# Runner labels for online env
 runner_labels:
   - nv-8g-cicd-te
 
diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
index 07ae49925f..00b4e1df34 100644
--- a/.github/configs/metax.yml
+++ b/.github/configs/metax.yml
@@ -8,7 +8,7 @@ display_name: 'Metax Tests'
 # CI image for Metax dev env
 # ci_image: localhost:5000/megatron-lm-with-te:v1
 
-# CI image for BAAI env
+# CI image for online env
 ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
 
 # Runner labels for self-hosted Metax node
@@ -19,7 +19,7 @@ ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
 #   - metax
 #   - dev
 
-# Runner labels for BAAI env
+# Runner labels for online env
 runner_labels:
   - mx-4g-cicd-te
 
diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index 8f2ff6605e..2e75c983e7 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -109,14 +109,8 @@ jobs:
         env:
           TE_PATH: ${{ github.workspace }}
           TE_FL_PREFER: vendor
-          MCORE_REPO_URL: https://github.com/BrianPei/Megatron-LM-FL.git
-          MCORE_REF: dev
-          # network set
-          # MASTER_ADDR: "127.0.0.1"
-          # MASTER_PORT: "39821"
-          # NCCL_SOCKET_IFNAME: "lo"
-          # GLOO_SOCKET_IFNAME: "lo"
-          # TP_SOCKET_IFNAME: "lo"
+          MCORE_REPO_URL: https://github.com/flagos-ai/Megatron-LM-FL.git
+          MCORE_REF: main
         run: |
           set -euo pipefail
 
diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
index ac97441368..32a13813ff 100644
--- a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
+++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
@@ -151,8 +151,8 @@ jobs:
         env:
           TE_PATH: .
           TE_FL_PREFER: vendor
-          MCORE_REPO_URL: https://github.com/BrianPei/Megatron-LM-FL.git
-          MCORE_REF: dev
+          MCORE_REPO_URL: https://github.com/flagos-ai/Megatron-LM-FL.git
+          MCORE_REF: main
         run: |
           # Activate conda environment
           source /opt/miniconda3/etc/profile.d/conda.sh
diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh
index 0ca0557801..b4ccb8f9ad 100644
--- a/qa/L1_pytorch_mcore_integration/test.sh
+++ b/qa/L1_pytorch_mcore_integration/test.sh
@@ -30,8 +30,8 @@ retry_command() {
 # Paths
 : "${TE_PATH:=$(cd -- "${SCRIPT_DIR}/../.." && pwd)}"
 : "${MCORE_PATH:=/workspace/Megatron-LM-FL}"
-: "${MCORE_REPO_URL:=https://github.com/BrianPei/Megatron-LM-FL.git}"
-: "${MCORE_REF:=dev}"
+: "${MCORE_REPO_URL:=https://github.com/flagos-ai/Megatron-LM-FL.git}"
+: "${MCORE_REF:=main}"
 : "${OUTPUT_DIR:=${TE_PATH}/qa/L1_pytorch_mcore_integration/output}"
 : "${DATA_CACHE_PATH:=/tmp/data_cache}"
 

From b505a6ccb4b6a1cf4b60881034dca898c79d6a71 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Fri, 24 Apr 2026 16:16:34 +0800
Subject: [PATCH 25/25] excute tests step add activate conda

---
 .github/workflows/integration_tests_common.yml | 10 ++++++++--
 .github/workflows/unit_tests_common.yml        | 10 ++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
index 2e75c983e7..25f18c866d 100644
--- a/.github/workflows/integration_tests_common.yml
+++ b/.github/workflows/integration_tests_common.yml
@@ -115,8 +115,14 @@ jobs:
           set -euo pipefail
 
           # Activate conda environment
-          # source /opt/miniconda3/etc/profile.d/conda.sh
-          # conda activate flagscale-train
+          if ${{inputs.platform == 'metax'}}; then
+            source /opt/conda/etc/profile.d/conda.sh
+            conda activate base
+          else
+            source /opt/miniconda3/etc/profile.d/conda.sh
+            conda activate flagscale-train
+          fi
+          echo "PATH=$PATH" >> $GITHUB_ENV
           export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
 
           echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml
index fc441c52c9..10a070d9df 100644
--- a/.github/workflows/unit_tests_common.yml
+++ b/.github/workflows/unit_tests_common.yml
@@ -125,6 +125,16 @@ jobs:
           for k, v in env.items():
               print(f'{k}={v}')
           ")
+          
+          # Activate conda environment
+          if ${{inputs.platform == 'metax'}}; then
+            source /opt/conda/etc/profile.d/conda.sh
+            conda activate base
+          else
+            source /opt/miniconda3/etc/profile.d/conda.sh
+            conda activate flagscale-train
+          fi
+          echo "PATH=$PATH" >> $GITHUB_ENV
 
           export TE_PATH=$GITHUB_WORKSPACE
           export TE_LIB_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])")