diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml index 6975fab589..1c77fe6c25 100644 --- a/.github/configs/cuda.yml +++ b/.github/configs/cuda.yml @@ -1,26 +1,28 @@ # CUDA Hardware Configuration for TransformerEngine-FL -# Refactored for BAAI DGX A100 Nodes +# Refactored for A100 Nodes # This file defines environment variables, volumes, and test filters for TE tests. hardware_name: cuda display_name: 'NVIDIA CUDA (A100)' +# CI image for online env ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209 # Runner labels for self-hosted A100 node # runner_labels: -# - self-hosted -# - Linux -# - X64 -# - nvidia -# - gpu-8 +# - self-hosted +# - Linux +# - X64 +# - nvidia +# - gpu-8 + +# Runner labels for online env runner_labels: - nv-8g-cicd-te # Container volumes container_volumes: - /home/flagscale_cicd/flask/static:/workspace/report - # - /home/flagscale_cicd/data:/opt/data # Container options container_options: >- @@ -32,9 +34,8 @@ container_options: >- --ulimit stack=67108864 --user root -# Device types -device_types: - - a100 +# Platform-specific environment setup script +setup_script: .github/scripts/setup_cuda.sh # Build environment variables (platform-specific) build_env: @@ -47,6 +48,10 @@ build_env: CUDA_HOME: /usr/local/cuda-12.8 NVCC: /usr/local/cuda-12.8/bin/nvcc +# Device types to run tests on +device_types: + - a100 + # Test matrix configuration test_matrix: l0_pytorch: diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml index e3b10c892d..00b4e1df34 100644 --- a/.github/configs/metax.yml +++ b/.github/configs/metax.yml @@ -1,28 +1,33 @@ # Metax Hardware Configuration for TE-FL # This file defines CI/CD settings for Metax-based testing -# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml +# This file defines environment variables, volumes, and test filters for TE tests. 
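Editorial note: both platform configs share one schema, and the new `setup_script` / `build_env` / `device_types` keys are consumed by `all_tests_common.yml` through `yq` (the exact calls appear later in this diff). A minimal sketch of that lookup, assuming yq v4 with the `tojson` operator is available on the runner, as the workflow itself assumes:

```bash
#!/usr/bin/env bash
# Sketch of the "Load platform configuration" step in all_tests_common.yml.
set -euo pipefail
CONFIG_FILE=".github/configs/cuda.yml"

# Optional keys fall back to safe defaults, mirroring the workflow's yq calls.
SETUP_SCRIPT=$(yq '.setup_script // ""' "$CONFIG_FILE")       # .github/scripts/setup_cuda.sh
BUILD_ENV=$(yq '.build_env // {} | tojson(0)' "$CONFIG_FILE") # JSON object of env vars
DEVICE_TYPES=$(yq '.device_types | tojson(0)' "$CONFIG_FILE") # ["a100"]

echo "setup_script=$SETUP_SCRIPT" >> "$GITHUB_OUTPUT"
echo "build_env=$BUILD_ENV" >> "$GITHUB_OUTPUT"
echo "device_types=$DEVICE_TYPES" >> "$GITHUB_OUTPUT"
```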
hardware_name: metax display_name: 'Metax Tests' -ci_image: localhost:5000/megatron-lm-with-te:v1 -# ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839 +# CI image for Metax dev env +# ci_image: localhost:5000/megatron-lm-with-te:v1 -runner_labels: - - self-hosted - - Linux - - X64 - - metax - - dev +# CI image for online env +ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839 + +# Runner labels for self-hosted Metax node # runner_labels: -# - mx-4g-cicd-te +# - self-hosted +# - Linux +# - X64 +# - metax +# - dev + +# Runner labels for online env +runner_labels: + - mx-4g-cicd-te +# Container volumes container_volumes: - /nfs/metax_fs:/nfs/metax_fs - - /dev/dri:/dev/dri - - /dev/mxcd:/dev/mxcd - - /dev/infiniband:/dev/infiniband +# Container options container_options: >- --uts=host --ipc=host @@ -30,17 +35,16 @@ container_options: >- --group-add video --shm-size=100gb --ulimit memlock=-1 - --security-opt seccomp=unconfined - --security-opt apparmor=unconfined - --device=/dev/dri - --device=/dev/mxcd - --device=/dev/infiniband --user root --ulimit nofile=65535:65535 -e PLATFORM=metax -e TORCH_DISTRIBUTED_BACKEND=mccl -e LD_LIBRARY_PATH=/opt/maca/lib:/usr/local/lib:$LD_LIBRARY_PATH +# Platform-specific environment setup script +setup_script: .github/scripts/setup_metax.sh + +# Build environment variables (platform-specific) build_env: TE_FL_SKIP_CUDA: '1' NVTE_WITH_MACA: '1' @@ -62,10 +66,3 @@ test_matrix: # example: tests/unit_tests/test_example.py # - tests/unit_tests/test_inference.py # - tests/unit_tests/test_rl_utils.py - - # functional: - # train: - # - device: c500 - # task: train - # model: deepseek - # case: tp2_pp2_ep2 diff --git a/.github/scripts/setup_cuda.sh b/.github/scripts/setup_cuda.sh new file mode 100755 index 0000000000..f9e289c6d0 --- /dev/null +++ b/.github/scripts/setup_cuda.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# CUDA Platform Environment Setup Script +# Called by unit_tests_common.yml for CUDA platforms (A100, H100, etc.) +set -euo pipefail + +echo "===== Step 0: Activate Python environment =====" +source /opt/miniconda3/etc/profile.d/conda.sh +conda activate flagscale-train +echo "PATH=$PATH" >> $GITHUB_ENV +echo "Python: $(which python3) ($(python3 --version 2>&1))" + +echo "===== Step 1: Remove Existing TransformerEngine =====" +pip uninstall transformer_engine transformer_engine_torch -y || true + +echo "===== Step 2: Build & Install TransformerEngine =====" +cd $GITHUB_WORKSPACE + +pip install nvdlfw-inspect --quiet +pip install expecttest --quiet +pip install . -v --no-deps --no-build-isolation + +echo "===== Step 3: Verify Installation =====" +python3 tests/pytorch/test_sanity_import.py + +echo "===== Environment Setup Complete =====" diff --git a/.github/scripts/setup_metax.sh b/.github/scripts/setup_metax.sh new file mode 100755 index 0000000000..a2d0b0a4cf --- /dev/null +++ b/.github/scripts/setup_metax.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Metax Platform Environment Setup Script +# Called by unit_tests_common.yml for Metax platforms (C500, etc.) 
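Editorial note: both setup scripts are dispatched the same way — the common workflows read the path from the platform YAML and run it from the checked-out workspace (the step itself appears in `unit_tests_common.yml` below). A sketch of that dispatch, with the path value shown as an example:

```bash
# Sketch of the "Environment Setup" step in the common workflows. Adding a new
# platform means adding one config YAML plus one setup script; the workflows
# themselves stay unchanged.
SETUP_SCRIPT=".github/scripts/setup_metax.sh"   # from inputs.setup_script
if [ -n "$SETUP_SCRIPT" ]; then
  bash "$GITHUB_WORKSPACE/$SETUP_SCRIPT"
fi
```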
+set -euo pipefail + +echo "===== Step 0: Activate Python environment =====" +source /opt/conda/etc/profile.d/conda.sh +conda activate base +echo "PATH=$PATH" >> $GITHUB_ENV +echo "Python: $(which python3) ($(python3 --version 2>&1))" + +echo "===== Step 1: Base Environment Setup =====" +# Configure MACA toolchain paths +export PATH=/opt/maca/bin:$PATH +export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH +service ssh restart + +echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) =====" +# TransformerEngine expects nvcc, but MACA provides cucc +ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc +which nvcc || true + +echo "===== Step 3: Install Required System Tools =====" +# Use apt to install git, curl +sed -i 's|http://mirrors.aliyun.com/ubuntu|http://archive.ubuntu.com/ubuntu|g' /etc/apt/sources.list +apt-get update -qq || true +apt-get install -y -qq git curl +# Install cmake and ninja via pip (more reliable than apt in this env) +python3 -m pip install cmake ninja torch --no-cache-dir + +echo "===== Step 4: Remove Existing TransformerEngine =====" +# Prevent conflicts with preinstalled or incompatible versions +python3 -m pip uninstall transformer_engine -y || true +python3 -m pip install nvdlfw-inspect --no-deps || true + +echo "===== Step 5: Install TE-FL Plugin Layer =====" +# Install TransformerEngine-FL Python layer (plugin logic) +cd $GITHUB_WORKSPACE +TE_FL_SKIP_CUDA=1 python3 setup.py install + +echo "===== Step 6: Final Verification =====" +# Verify both TE Python API and backend are functional +python3 - <<'EOF' +import transformer_engine +import transformer_engine_torch as te +print("transformer_engine:", transformer_engine) +print("transformer_engine_torch:", te) +EOF + +echo "===== Environment Setup Complete =====" diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml index 2165de9b49..606a0d3e86 100644 --- a/.github/workflows/all_tests_common.yml +++ b/.github/workflows/all_tests_common.yml @@ -7,13 +7,20 @@ on: required: true type: string description: Platform name (e.g., cuda, default) - setup_commands: + run_unit_tests: required: false - type: string - default: '' + type: boolean + default: true + description: Whether to run unit tests in this workflow + run_integration_tests: + required: false + type: boolean + default: true + description: Whether to run integration tests in this workflow jobs: checkout_and_config: + name: checkout_and_config defaults: run: shell: bash @@ -24,19 +31,12 @@ jobs: container_volumes: ${{ steps.config.outputs.container_volumes }} container_options: ${{ steps.config.outputs.container_options }} device_types: ${{ steps.config.outputs.device_types }} - train_test_matrix: ${{ steps.config.outputs.train_test_matrix }} - ignored_tests: ${{ steps.config.outputs.ignored_tests }} + setup_script: ${{ steps.config.outputs.setup_script }} build_env: ${{ steps.config.outputs.build_env }} steps: - name: Checkout source code uses: actions/checkout@v4 - - name: Check if tests should run - id: should_run - run: | - - echo "should_run=true" >> $GITHUB_OUTPUT - - name: Load platform configuration id: config run: | @@ -71,26 +71,24 @@ jobs: DEVICE_TYPES=$(yq '.device_types | tojson(0)' "$CONFIG_FILE") echo "device_types=$DEVICE_TYPES" >> $GITHUB_OUTPUT - # Read test matrix for training - TRAIN_MATRIX=$(yq '.test_matrix.functional.train | tojson(0)' "$CONFIG_FILE") - echo "train_test_matrix=$TRAIN_MATRIX" >> $GITHUB_OUTPUT - - # Read ignored tests list from test_matrix.unit (default to 
empty array if not defined) - IGNORED_TESTS=$(yq '.test_matrix.unit.ignored_tests // [] | tojson(0)' "$CONFIG_FILE") - echo "ignored_tests=$IGNORED_TESTS" >> $GITHUB_OUTPUT + # Read setup script path + SETUP_SCRIPT=$(yq '.setup_script // ""' "$CONFIG_FILE") + echo "setup_script=$SETUP_SCRIPT" >> $GITHUB_OUTPUT # Read build environment variables (default to empty object if not defined) BUILD_ENV=$(yq '.build_env // {} | tojson(0)' "$CONFIG_FILE") echo "build_env=$BUILD_ENV" >> $GITHUB_OUTPUT unit_tests: - needs: checkout_and_config + name: unit_tests + if: inputs.run_unit_tests + needs: + - checkout_and_config strategy: fail-fast: false matrix: device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }} uses: ./.github/workflows/unit_tests_common.yml - name: unit_tests with: platform: ${{ inputs.platform }} device: ${{ matrix.device }} @@ -98,24 +96,61 @@ jobs: runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} container_options: ${{ needs.checkout_and_config.outputs.container_options }} - setup_commands: ${{ inputs.setup_commands }} - ignored_tests: ${{ needs.checkout_and_config.outputs.ignored_tests }} + setup_script: ${{ needs.checkout_and_config.outputs.setup_script }} build_env: ${{ needs.checkout_and_config.outputs.build_env }} - # arguments.py not compatible with megatron-core-fl - # functional_tests: - # needs: - # - checkout_and_config - # if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null - # uses: ./.github/workflows/functional_tests_common.yml - # with: - # platform: ${{ inputs.platform }} - # test_matrix: ${{ needs.checkout_and_config.outputs.train_test_matrix }} - # image: ${{ needs.checkout_and_config.outputs.ci_image }} - # runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} - # container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} - # container_options: ${{ needs.checkout_and_config.outputs.container_options }} + unit_tests_complete: + name: unit_tests_complete + needs: + - unit_tests + runs-on: ubuntu-latest + if: always() && inputs.run_unit_tests + steps: + - name: Check unit tests result + run: | + if [ "${{ needs.unit_tests.result }}" != "success" ] && \ + [ "${{ needs.unit_tests.result }}" != "skipped" ]; then + echo "❌ Unit tests failed: ${{ needs.unit_tests.result }}" + exit 1 + fi + echo "✅ Unit tests passed" + integration_tests: + name: integration_tests + if: inputs.run_integration_tests + needs: + - checkout_and_config + - unit_tests_complete + strategy: + fail-fast: false + matrix: + device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }} + uses: ./.github/workflows/integration_tests_common.yml + with: + platform: ${{ inputs.platform }} + device: ${{ matrix.device }} + image: ${{ needs.checkout_and_config.outputs.ci_image }} + runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} + container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} + container_options: ${{ needs.checkout_and_config.outputs.container_options }} + setup_script: ${{ needs.checkout_and_config.outputs.setup_script }} + build_env: ${{ needs.checkout_and_config.outputs.build_env }} + + integration_tests_complete: + name: integration_tests_complete + if: always() && inputs.run_integration_tests + needs: + - integration_tests + runs-on: ubuntu-latest + steps: + - name: Check integration tests result + run: | + if [ "${{ needs.integration_tests.result }}" != "success" ] && \ + [ "${{ 
needs.integration_tests.result }}" != "skipped" ]; then + echo "❌ Integration tests failed: ${{ needs.integration_tests.result }}" + exit 1 + fi + echo "✅ Integration tests passed" all_tests_complete: defaults: @@ -123,8 +158,8 @@ jobs: shell: bash needs: - checkout_and_config - - unit_tests - # - functional_tests + - unit_tests_complete + - integration_tests_complete runs-on: ubuntu-latest if: always() steps: @@ -133,15 +168,17 @@ jobs: # Check all test jobs (skip if not run) failed=false - if [ "${{ needs.unit_tests.result }}" != "success" ]; then - echo "❌ Unit tests failed" + if [ "${{ needs.unit_tests_complete.result }}" != "success" ] && \ + [ "${{ needs.unit_tests_complete.result }}" != "skipped" ]; then + echo "❌ Unit tests failed or cancelled: ${{ needs.unit_tests_complete.result }}" failed=true fi - # if [ "${{ needs.functional_tests.result }}" != "success" ]; then - # echo "❌ Training functional tests failed" - # failed=true - # fi + if [ "${{ needs.integration_tests_complete.result }}" != "success" ] && \ + [ "${{ needs.integration_tests_complete.result }}" != "skipped" ]; then + echo "❌ Integration tests failed or cancelled: ${{ needs.integration_tests_complete.result }}" + failed=true + fi if [ "$failed" = "true" ]; then exit 1 diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml index 0aa652f64b..cc7ade9f50 100644 --- a/.github/workflows/all_tests_cuda.yml +++ b/.github/workflows/all_tests_cuda.yml @@ -17,6 +17,8 @@ jobs: uses: ./.github/workflows/all_tests_common.yml with: platform: cuda + run_unit_tests: true + run_integration_tests: true all_tests: needs: run_tests diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml index d3e496c4b2..0af545e291 100644 --- a/.github/workflows/all_tests_metax.yml +++ b/.github/workflows/all_tests_metax.yml @@ -13,15 +13,12 @@ concurrency: jobs: run_tests: + # Package manager and environment settings are read from .github/configs/metax.yml uses: ./.github/workflows/all_tests_common.yml with: platform: metax - # Metax Environment Setup - setup_commands: | - export PATH=/opt/conda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/maca/lib:/opt/maca/lib:$LD_LIBRARY_PATH - which python3 - python3 -m pip --version + run_unit_tests: true + run_integration_tests: true all_tests: needs: run_tests diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6c9c967950..2ef6d1893d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -3,6 +3,7 @@ # See LICENSE for license information. 
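Editorial note: the `*_complete` jobs above collapse each reusable-workflow matrix into a single gate, and `skipped` counts as success so that disabling a suite via `run_unit_tests: false` does not fail `all_tests_complete`. A standalone sketch of that gate logic, with the result string standing in for `needs.<job>.result`:

```bash
#!/usr/bin/env bash
# Gate logic used by unit_tests_complete / integration_tests_complete /
# all_tests_complete. A job result is "success", "failure", "cancelled",
# or "skipped"; only success and skipped pass the gate.
check_gate() {
  local name=$1 result=$2
  if [ "$result" != "success" ] && [ "$result" != "skipped" ]; then
    echo "❌ $name failed or cancelled: $result"
    return 1
  fi
  echo "✅ $name passed"
}

check_gate "Unit tests" "skipped"                                  # ok: suite disabled upstream
check_gate "Integration tests" "failure" || echo "pipeline exits 1"
```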
# A workflow to trigger TE build on GitHub + name: 'Build' on: pull_request: @@ -10,28 +11,56 @@ on: jobs: pytorch: name: 'PyTorch' - runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ] + runs-on: [ nv-8g-cicd-te ] defaults: run: shell: bash container: image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209 - options: --user root + ports: + - 80:80 + options: >- + --gpus all + --shm-size=500g + --privileged + --ipc=host + --ulimit memlock=-1 + --ulimit stack=67108864 + --ulimit nofile=65535:65535 + --user root + --pull never steps: + - name: Configure Git Safe Directory on Cuda + run: /usr/bin/git config --global safe.directory '*' + - name: 'Checkout' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: + fetch-depth: 0 submodules: recursive - - name: 'Build' - run: + set-safe-directory: true + + - name: 'Setup Environment' + run: | source /opt/miniconda3/etc/profile.d/conda.sh conda activate flagscale-train - pip install --no-build-isolation . -v --no-deps + echo "PATH=$PATH" >> $GITHUB_ENV + + - name: 'Build' + run: | + pip uninstall transformer_engine transformer_engine_torch -y || true + echo "GITHUB_WORKSPACE=$GITHUB_WORKSPACE" + cd $GITHUB_WORKSPACE + pip install nvdlfw-inspect + pip install expecttest + pip install . -v --no-deps --no-build-isolation env: NVTE_FRAMEWORK: pytorch - TE_WITH_NCCL: 1 + TE_WITH_NCCL: '1' + NVTE_WITH_CUDA: '1' + CUDA_HOME: /usr/local/cuda-12.8 + NVCC: /usr/local/cuda-12.8/bin/nvcc + - name: 'Sanity check' run: - source /opt/miniconda3/etc/profile.d/conda.sh - conda activate flagscale-train python3 tests/pytorch/test_sanity_import.py diff --git a/.github/workflows/functional_tests_common.yml b/.github/workflows/functional_tests_common.yml deleted file mode 100644 index aa6b734778..0000000000 --- a/.github/workflows/functional_tests_common.yml +++ /dev/null @@ -1,190 +0,0 @@ -# Disabled for compatibility issues -name: Common Functional Tests - Training - -on: - workflow_call: - inputs: - platform: - required: true - type: string - description: Platform name (e.g., cuda, default) - test_matrix: - required: true - type: string - description: JSON array of test configurations - image: - required: true - type: string - runs_on: - required: true - type: string - container_volumes: - required: true - type: string - container_options: - required: true - type: string - -jobs: - functional_test_train: - defaults: - run: - shell: bash - env: - PROJECT_ROOT: ${{ github.workspace }} - runs-on: ${{ fromJson(inputs.runs_on) }} - strategy: - fail-fast: false - matrix: - test_config: ${{ fromJson(inputs.test_matrix) }} - container: - image: ${{ inputs.image }} - ports: - - 80 - volumes: ${{ fromJson(inputs.container_volumes) }} - options: ${{ inputs.container_options }} - - steps: - - name: Checkout source code - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - # - name: Set safe directory - # run: | - # git config --global --add safe.directory $PROJECT_ROOT - ## The above step is commented out because there is no git cli in the container, and it causes the step to fail. The safe directory is set in the next step with a conditional check. 
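Editorial note: the `Build` job above is now self-contained enough to reproduce outside CI. A hedged local sketch, assuming the same container image layout (flagscale-train conda env, CUDA 12.8 at the CI paths) and a checkout of this repo as the working directory:

```bash
#!/usr/bin/env bash
# Hypothetical local reproduction of the 'Build' job; paths and env names are
# taken from the workflow above and hold only inside the CI image.
set -euo pipefail
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train

export NVTE_FRAMEWORK=pytorch TE_WITH_NCCL=1 NVTE_WITH_CUDA=1
export CUDA_HOME=/usr/local/cuda-12.8 NVCC=$CUDA_HOME/bin/nvcc

pip uninstall -y transformer_engine transformer_engine_torch || true
pip install nvdlfw-inspect expecttest
pip install . -v --no-deps --no-build-isolation
python3 tests/pytorch/test_sanity_import.py
```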
- - name: Set safe directory - run: | - command -v git && git config --global --add safe.directory $PROJECT_ROOT || true - - - name: Activate Python environment - run: | - source /opt/conda/etc/profile.d/conda.sh - conda activate base - echo "PATH=$PATH" >> $GITHUB_ENV - - - name: Setup Python environment - env: - NVTE_WITH_MACA: '1' - NVTE_WITH_CUDA: '0' - NVCC: /opt/maca/bin/mcc - CUDA_HOME: /opt/maca - - PATH: /opt/maca/bin:${{ env.PATH }} - LD_LIBRARY_PATH: /opt/maca/lib:${{ env.LD_LIBRARY_PATH }} - run: | - set -euo pipefail - cd $PROJECT_ROOT - pip install -e . --no-deps --no-build-isolation - timeout-minutes: 60 - - - name: L0 Pytorch Wheel - id: L0_pytoech_wheel - # timeout-minutes: 50 - env: - TE_PATH: . - RUN_LOG: /logs/pytorch/wheel - run: | - echo "TE_PATH: ${TE_PATH}" - sed -i "s/^cd transformer_engine\/pytorch\s*$/pushd transformer_engine\/pytorch/" qa/L0_pytorch_wheel/test.sh - sed -i '44 s/^cd \s*\$TE_PATH\s*$/popd/' qa/L0_pytorch_wheel/test.sh - - cat qa/L0_pytorch_wheel/test.sh - # source /opt/miniconda3/etc/profile.d/conda.sh - # conda activate flagscale-train - pip uninstall -y transformer_engine - - set -euo pipefail - cd $PROJECT_ROOT - - PLATFORM='${{ inputs.platform }}' - DEVICE='${{ matrix.test_config.device }}' - TASK='${{ matrix.test_config.task }}' - MODEL='${{ matrix.test_config.model }}' - CASE='${{ matrix.test_config.case }}' - - echo "Running functional tests for training" - echo "Platform: $PLATFORM" - echo "Device: $DEVICE" - echo "Task: $TASK" - echo "Model: $MODEL" - echo "Case: ${CASE:-all}" - - # Set environment variables - export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-} - - set +e - bash qa/L0_pytorch_wheel/test.sh | tee ${RUN_LOG}/pytorch_wheel-${{ github.run_id }}.log - exit_code=$? - set -e - - if [ $exit_code -eq 0 ]; then - echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE" - else - echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)" - fi - - echo "exit_code=$exit_code" >> $GITHUB_OUTPUT - exit $exit_code - - - name: Upload Installation Logs - if: always() && steps.L0_pytoech_wheel.outcome == 'failure' - uses: actions/upload-artifact@v4 - with: - name: L0-pytorch-logs-${{ github.run_id }} - path: /logs/pytorch/wheel - retention-days: 7 - if-no-files-found: warn - - # - name: Run functional tests - # id: functional_test - # run: | - # set -euo pipefail - # cd $PROJECT_ROOT - - # PLATFORM='${{ inputs.platform }}' - # DEVICE='${{ matrix.test_config.device }}' - # TASK='${{ matrix.test_config.task }}' - # MODEL='${{ matrix.test_config.model }}' - # CASE='${{ matrix.test_config.case }}' - - # echo "Running functional tests for training" - # echo "Platform: $PLATFORM" - # echo "Device: $DEVICE" - # echo "Task: $TASK" - # echo "Model: $MODEL" - # echo "Case: ${CASE:-all}" - - # # Set environment variables - # export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-} - - # # Run functional tests via run_tests.sh with explicit platform/device/task/model/case - # set +e - # bash "$PROJECT_ROOT/tests/test_utils/runners/run_tests.sh" \ - # --platform "$PLATFORM" \ - # --device "$DEVICE" \ - # --type functional \ - # --task "$TASK" \ - # --model "$MODEL" \ - # --list "$CASE" - # exit_code=$? 
-          # set -e
-
-          # if [ $exit_code -eq 0 ]; then
-          #   echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
-          # else
-          #   echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
-          # fi
-
-          # echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
-          # exit $exit_code
-          # timeout-minutes: 60
-
-      # - name: Debug - keep container alive on failure
-      #   if: failure()
-      #   run: |
-      #     echo "Container sleeping for 60 minutes for debugging..."
-      #     echo "On host, run: docker ps then docker exec -it bash"
-      #     sleep 3600
-      #   timeout-minutes: 60
\ No newline at end of file
diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
new file mode 100644
index 0000000000..25f18c866d
--- /dev/null
+++ b/.github/workflows/integration_tests_common.yml
@@ -0,0 +1,134 @@
+name: Common Integration Tests
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+      device:
+        required: true
+        type: string
+      image:
+        required: true
+        type: string
+      runs_on:
+        required: true
+        type: string
+      container_volumes:
+        required: true
+        type: string
+      container_options:
+        required: true
+        type: string
+      # Platform-specific environment setup script path (from platform config)
+      setup_script:
+        required: false
+        type: string
+        default: ''
+      # Platform-specific build environment variables (JSON object from config)
+      build_env:
+        required: false
+        type: string
+        default: '{}'
+
+jobs:
+  integration_test:
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ fromJson(inputs.runs_on) }}
+    strategy:
+      fail-fast: false
+      matrix:
+        test_group:
+          - name: pytorch_mcore_integration
+            path: "qa/L1_pytorch_mcore_integration/test.sh"
+            test_type: "integration"
+    name: integration-${{ inputs.device }}-${{ matrix.test_group.name }}
+    container:
+      image: ${{ inputs.image }}
+      volumes: ${{ fromJson(inputs.container_volumes) }}
+      options: --pull never ${{ inputs.container_options }}
+
+    steps:
+      # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos
+      - name: Configure Git Safe Directory on Cuda
+        if: inputs.platform == 'cuda'
+        run: /usr/bin/git config --global safe.directory '*'
+
+      - name: Checkout Source Code on Cuda (attempt 1)
+        id: checkout1
+        if: inputs.platform == 'cuda'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 2)
+        id: checkout2
+        if: inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 3)
+        id: checkout3
+        if: inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      # Metax requires cleaning credential helpers left behind by vscode-remote-container
+      - name: Configure Clean Git Env on Metax
+        if: inputs.platform == 'metax'
+        run: |
+          git config --global --unset-all credential.helper 2>/dev/null || true
+          git config --system --unset-all credential.helper 2>/dev/null || true
+
+      # Metax does not need submodules
+      - name: Checkout Source Code on Metax
+        if: inputs.platform == 'metax'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Environment Setup
+        if: inputs.setup_script != ''
+        run: |
+          bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
+
+      - name: Execute
Tests + env: + TE_PATH: ${{ github.workspace }} + TE_FL_PREFER: vendor + MCORE_REPO_URL: https://github.com/flagos-ai/Megatron-LM-FL.git + MCORE_REF: main + run: | + set -euo pipefail + + # Activate conda environment + if ${{inputs.platform == 'metax'}}; then + source /opt/conda/etc/profile.d/conda.sh + conda activate base + else + source /opt/miniconda3/etc/profile.d/conda.sh + conda activate flagscale-train + fi + echo "PATH=$PATH" >> $GITHUB_ENV + export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine + + echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ===" + # python3 --version + # pip list | grep -E "regex|six|torch" || true + + bash ${{ matrix.test_group.path }} + timeout-minutes: 30 + \ No newline at end of file diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml index b026f9aa10..f214990581 100644 --- a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml +++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml @@ -2,21 +2,11 @@ name: QA L0 - Core Unit & Lint Tests on: push: - branches: main - paths: - - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml' - - 'qa/L0_pytorch_lint/**' - - 'transformer_engine/**' - - 'tests/pytorch/**' + branches: + - __disabled_do_not_remove__ pull_request: - branches: main - paths: - - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml' - - 'qa/L0_pytorch_lint/**' - - 'transformer_engine/**' - - 'tests/pytorch/**' - - workflow_dispatch: + branches: + - __disabled_do_not_remove__ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }} diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml index 51f071aa3b..32a13813ff 100644 --- a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml +++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml @@ -2,32 +2,11 @@ name: QA L1 - Comprehensive Integration Tests on: push: - branches: main - paths: - - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml' - - 'qa/L1_cpp_distributed/**' - - 'tests/cpp_distributed/**' - - 'qa/L1_pytorch_thunder_integration/**' - - 'qa/L1_pytorch_distributed_unittest/**' - - 'tests/pytorch/distributed/**' - - 'tests/pytorch/attention/**' - - 'qa/L1_pytorch_onnx_unittest/**' - - 'tests/pytorch/test_onnx_export.py' - + branches: + - __disabled_do_not_remove__ pull_request: - branches: main - paths: - - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml' - - 'qa/L1_cpp_distributed/**' - - 'tests/cpp_distributed/**' - - 'qa/L1_pytorch_thunder_integration/**' - - 'qa/L1_pytorch_distributed_unittest/**' - - 'tests/pytorch/distributed/**' - - 'tests/pytorch/attention/**' - - 'qa/L1_pytorch_onnx_unittest/**' - - 'tests/pytorch/test_onnx_export.py' - - workflow_dispatch: + branches: + - __disabled_do_not_remove__ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }} @@ -57,8 +36,8 @@ jobs: - name: Checkout Code uses: actions/checkout@v6.0.1 with: - repository: ${{ github.event.pull_request.head.repo.full_name }} - ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name || github.repository }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.ref || github.ref_name }} ssh-strict: true ssh-user: git persist-credentials: true @@ -166,3 +145,21 
@@ jobs: echo "=== Running L1 PyTorch ONNX Unit Tests ===" bash ./qa/L1_pytorch_onnx_unittest/test.sh # timeout-minutes: 30 + + + - name: Run L1 PyTorch Megatron-FL MCore Integration Test + env: + TE_PATH: . + TE_FL_PREFER: vendor + MCORE_REPO_URL: https://github.com/flagos-ai/Megatron-LM-FL.git + MCORE_REF: main + run: | + # Activate conda environment + source /opt/miniconda3/etc/profile.d/conda.sh + conda activate flagscale-train + + export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine + + echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ===" + bash ./qa/L1_pytorch_mcore_integration/test.sh + timeout-minutes: 30 diff --git a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml index 9a881dd2d9..bb3e0a73fe 100644 --- a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml +++ b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml @@ -3,16 +3,11 @@ name: QA L3 - Attention Tests on: push: - branches: __disable__ - paths: - - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml' - - 'tests/pytorch/attention/test_attention.py' - + branches: + - __disabled_do_not_remove__ pull_request: - branches: __disable__ - paths: - - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml' - - 'tests/pytorch/attention/test_attention.py' + branches: + - __disabled_do_not_remove__ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }} diff --git a/.github/workflows/te-plugin-tests.yml b/.github/workflows/te-plugin-tests.yml index f487673444..9b640fcce8 100644 --- a/.github/workflows/te-plugin-tests.yml +++ b/.github/workflows/te-plugin-tests.yml @@ -18,7 +18,7 @@ concurrency: jobs: run-plugin-tests: - runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ] + runs-on: [ nv-8g-cicd-te ] defaults: run: shell: bash @@ -35,7 +35,7 @@ jobs: --ulimit stack=67108864 --ulimit nofile=65535:65535 --user root - --pull always + --pull never steps: - name: Checkout Code uses: actions/checkout@v6.0.1 diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml index 615f7c9001..10a070d9df 100644 --- a/.github/workflows/unit_tests_common.yml +++ b/.github/workflows/unit_tests_common.yml @@ -1,6 +1,5 @@ name: Common Unit Tests - on: workflow_call: inputs: @@ -22,12 +21,8 @@ on: container_options: required: true type: string - ignored_tests: - required: false - type: string - default: '' - # New input for hardware-specific initialization (e.g., conda activate) - setup_commands: + # Platform-specific environment setup script path (from platform config) + setup_script: required: false type: string default: '' @@ -36,41 +31,9 @@ on: required: false type: string default: '{}' - # Whether to upload coverage report - upload_coverage: - description: "Whether to upload coverage report" - required: false - type: boolean - default: true jobs: - # 1. 
Change Detection - detect_changes: - runs-on: ubuntu-latest - outputs: - core: ${{ steps.filter.outputs.core }} - qa_l0: ${{ steps.filter.outputs.qa_l0 }} - steps: - - name: Checkout source code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Detect changed paths - id: filter - run: | - set -euo pipefail - BASE_REF="${{ github.event_name == 'pull_request' && format('origin/{0}', github.base_ref) || 'HEAD~1' }}" - [ "${{ github.event_name }}" == "pull_request" ] && git fetch origin ${{ github.base_ref }} --depth=1 - - CHANGED_FILES=$(git diff --name-only $BASE_REF...HEAD 2>/dev/null || git diff --name-only $BASE_REF HEAD) - - echo "core=$(echo "$CHANGED_FILES" | grep -qE "^tests/unit_tests/|^megatron/core/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT - echo "qa_l0=$(echo "$CHANGED_FILES" | grep -qE "^qa/L0_|^transformer_engine/|^tests/pytorch/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT - - # 2. Unified Test Execution unit_test: - needs: detect_changes defaults: run: shell: bash @@ -79,16 +42,15 @@ jobs: fail-fast: false matrix: test_group: - - name: pytorch_lint - path: "qa/L0_pytorch_lint/test.sh" - test_type: "lint" - name: pytorch_debug path: "qa/L0_pytorch_debug_unittest/test.sh" test_type: "debug" - name: pytorch_unittest path: "qa/L0_pytorch_unittest/test.sh" test_type: "unittest" - + - name: pytorch_distributed_unittest + path: "qa/L1_pytorch_distributed_unittest/test.sh" + test_type: "unittest" name: unit-${{ inputs.device }}-${{ matrix.test_group.name }} container: image: ${{ inputs.image }} @@ -96,33 +58,14 @@ jobs: options: --pull never ${{ inputs.container_options }} steps: - - name: Check if tests should run - id: should_run - run: | - echo "should_run=true" >> $GITHUB_OUTPUT - GROUP='${{ matrix.test_group.name }}' - # Force run if 'full ci' label exists - if [ "${{ contains(github.event.pull_request.labels.*.name, 'full ci') }}" == "true" ]; then - echo "should_run=true" >> $GITHUB_OUTPUT; exit 0 - fi - - if [[ "$GROUP" == "pytorch_"* ]]; then - CHANGED='${{ needs.detect_changes.outputs.qa_l0 }}' - else - CHANGED='${{ needs.detect_changes.outputs.core }}' - fi - - # For debugging, you can force this to true - echo "should_run=true" >> $GITHUB_OUTPUT - # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos - name: Configure Git Safe Directory on Cuda - if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' + if: inputs.platform == 'cuda' run: /usr/bin/git config --global safe.directory '*' - name: Checkout Source Code on Cuda (attempt 1) id: checkout1 - if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' + if: inputs.platform == 'cuda' uses: actions/checkout@v4 continue-on-error: true with: @@ -132,7 +75,7 @@ jobs: - name: Checkout Source Code on Cuda (attempt 2) id: checkout2 - if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure' + if: inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure' uses: actions/checkout@v4 continue-on-error: true with: @@ -142,116 +85,33 @@ jobs: - name: Checkout Source Code on Cuda (attempt 3) id: checkout3 - if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure' + if: inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure' uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive set-safe-directory: true + # Metax requires to clean 
vscode-remote-container + - name: Configure Clean Git Env on Metax + if: inputs.platform == 'metax' + run: | + git config --global --unset-all credential.helper 2>/dev/null || true + git config --system --unset-all credential.helper 2>/dev/null || true + # Metax no need submodules - name: Checkout Source Code on Metax - if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax' + if: inputs.platform == 'metax' uses: actions/checkout@v4 with: fetch-depth: 0 - - name: Environment Setup on Cuda - if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' + - name: Environment Setup + if: inputs.setup_script != '' run: | - set -euo pipefail - - echo "===== Step 0: Activate Python environment =====" - source /opt/miniconda3/etc/profile.d/conda.sh - conda activate flagscale-train - echo "PATH=$PATH" >> $GITHUB_ENV - echo "Python: $(which python3) ($(python3 --version 2>&1))" - - echo "===== Step 1: Remove Existing TransformerEngine =====" - pip uninstall transformer_engine transformer_engine_torch -y || true - - echo "===== Step 2: Build & Install TransformerEngine =====" - cd $GITHUB_WORKSPACE - - pip install nvdlfw-inspect --quiet - pip install expecttest --quiet - pip install . -v --no-deps --no-build-isolation - - echo "===== Step 3: Verify Installation =====" - python3 tests/pytorch/test_sanity_import.py - - echo "===== Environment Setup Complete ===== " - - - name: Environment Setup on Metax - if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax' - run: | - set -euo pipefail - - echo "===== Step 0: Activate Python environment =====" - source /opt/conda/etc/profile.d/conda.sh - conda activate base - echo "PATH=$PATH" >> $GITHUB_ENV - echo "Python: $(which python3) ($(python3 --version 2>&1))" - - echo "===== Step 1: Base Environment Setup =====" - # Configure MACA toolchain paths - export PATH=/opt/maca/bin:$PATH - export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH - service ssh restart - - echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) =====" - # TransformerEngine expects nvcc, but MACA provides cucc - ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc - which nvcc || true - - echo "===== Step 3: Install Required System Tools =====" - # Install essential build tools (avoid modifying Python dependencies) - apt-get update -qq && apt-get install -y -qq git cmake ninja-build curl - - echo "===== Step 4: Remove Existing TransformerEngine =====" - # Prevent conflicts with preinstalled or incompatible versions - python3 -m pip uninstall transformer_engine -y || true - python3 -m pip install nvdlfw-inspect --quiet - python3 -m pip install expecttest --quiet - - # echo "===== Step 5: Install Metax Binary Backend =====" - # # Install prebuilt Metax backend (required for MACA operators) - # WHL_PATH="/home/muxiuser/transformer_engine_metax-2.9.0-cp312-cp312-linux_x86_64.whl" - # if [ ! 
-f "$WHL_PATH" ]; then - # echo "ERROR: Wheel file not found at $WHL_PATH" - # echo "Please verify volume mount: -v /home/muxiuser:/home/muxiuser" - # exit 1 - # fi - - # # Use --no-deps to avoid overwriting Metax-optimized PyTorch - # python3 -m pip install "$WHL_PATH" --no-deps --force-reinstall - - # echo "===== Step 6: Verify Metax Backend =====" - # # Ensure transformer_engine_torch is correctly loaded - # python3 - <<'EOF' - # import transformer_engine_torch as te - # print("Backend loaded successfully:", te) - # EOF - - echo "===== Step 7: Install TE-FL Plugin Layer =====" - # Install TransformerEngine-FL Python layer (plugin logic) - # cd /workspace/TransformerEngine-FL - cd $GITHUB_WORKSPACE - TE_FL_SKIP_CUDA=1 python3 setup.py install - - echo "===== Step 8: Final Verification =====" - # Verify both TE Python API and backend are functional - python3 - <<'EOF' - import transformer_engine - import transformer_engine_torch as te - print("transformer_engine:", transformer_engine) - print("transformer_engine_torch:", te) - EOF - - echo "===== Environment Setup Complete ===== " + bash $GITHUB_WORKSPACE/${{ inputs.setup_script }} - name: Execute Tests - if: steps.should_run.outputs.should_run == 'true' working-directory: ${{ github.workspace }} run: | set -euo pipefail @@ -265,6 +125,16 @@ jobs: for k, v in env.items(): print(f'{k}={v}') ") + + # Activate conda environment + if ${{inputs.platform == 'metax'}}; then + source /opt/conda/etc/profile.d/conda.sh + conda activate base + else + source /opt/miniconda3/etc/profile.d/conda.sh + conda activate flagscale-train + fi + echo "PATH=$PATH" >> $GITHUB_ENV export TE_PATH=$GITHUB_WORKSPACE export TE_LIB_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])") @@ -284,19 +154,14 @@ jobs: # Coverage setup: install once + configure collection via PYTEST_ADDOPTS COVERAGE_ENABLED=false - if [ "${{ inputs.upload_coverage }}" = "true" ] && [ "${{ matrix.test_group.test_type }}" = "unittest" ]; then - if pip3 install coverage pytest-cov --quiet 2>/dev/null; then - export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report=" - COVERAGE_ENABLED=true - else - echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled" - fi + if pip3 install coverage pytest-cov --quiet 2>/dev/null; then + export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report=" + COVERAGE_ENABLED=true + else + echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled" fi - if [[ "${{ matrix.test_group.name }}" == *"lint"* ]]; then - export CPP_ONLY=0 - export PYTHON_ONLY=0 - elif [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then + if [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then # Fail fast on backend/API mismatch before running the full test group. # Skip for debug group (does not use FP8/optimizer symbols). 
python3 -c "import sys, importlib; import transformer_engine.common as _te_common; tex = importlib.import_module('transformer_engine_torch'); required=['multi_tensor_scale','multi_tensor_compute_scale_and_scale_inv']; missing=[n for n in required if not hasattr(tex, n)]; print('[TE check] module:', tex); print('[TE check] file:', getattr(tex, '__file__', 'N/A')); print('[TE check] missing:', ', '.join(missing) if missing else 'none'); sys.exit(1 if missing else 0)" @@ -313,12 +178,10 @@ jobs: --include="transformer_engine/*" 2>/dev/null \ || echo "WARNING: No coverage data found" fi - exit $exit_code timeout-minutes: 60 - name: Upload Coverage Report - if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest' uses: actions/upload-artifact@v4 continue-on-error: true with: @@ -327,7 +190,6 @@ jobs: coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json - name: Upload Coverage Report to FlagCICD - if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest' uses: flagos-ai/FlagOps/actions/post-pytest-report@v2 continue-on-error: true env: @@ -336,12 +198,4 @@ jobs: backend_url: 'http://flagcicd-inner.flagos.net:8000/metrics/' user_id: '000000000000000000' report_path: 'coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json' - fail_on_error: 'false' - - # - name: Debug - keep container alive on failure - # if: failure() - # run: | - # echo "Container sleeping for 200 minutes for debugging..." - # echo "On host, run: docker ps then docker exec -it bash" - # sleep 60000 - # timeout-minutes: 200 \ No newline at end of file + fail_on_error: 'false' \ No newline at end of file diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend index f0c638223e..7500fd8427 160000 --- a/3rdparty/cudnn-frontend +++ b/3rdparty/cudnn-frontend @@ -1 +1 @@ -Subproject commit f0c638223eac20a9676941a110c9ad9e9842941d +Subproject commit 7500fd8427a24a76fadac9f2108106fd22c62737 diff --git a/3rdparty/googletest b/3rdparty/googletest index a35bc7693c..94be250af7 160000 --- a/3rdparty/googletest +++ b/3rdparty/googletest @@ -1 +1 @@ -Subproject commit a35bc7693c117a048152beeb34f6aac354b9423f +Subproject commit 94be250af7e14c58dcbf476972d2d7141551ff67 diff --git a/qa/L0_pytorch_debug_unittest/README.rst b/qa/L0_pytorch_debug_unittest/README.rst new file mode 100644 index 0000000000..2ba6e9fb0c --- /dev/null +++ b/qa/L0_pytorch_debug_unittest/README.rst @@ -0,0 +1,26 @@ +L0 PyTorch Debug Unittest +========================= + +This directory contains the L0 PyTorch debug unittest runner. + +MetaX ignore rules +------------------ + +MetaX-specific ignored tests are maintained in one place in ``test.sh`` through +the ``METAX_IGNORED_TESTS`` list. + +The main execution flow only calls a helper to decide whether a test should be +skipped, instead of embedding platform-specific matching rules directly in the +main logic. + +This keeps the script easier to maintain and makes it simpler to add new +ignored cases later if needed. + +How to extend +------------- + +If a new test needs to be skipped on MetaX: + +1. Add the full test path to ``METAX_IGNORED_TESTS`` in ``test.sh``. +2. Avoid adding new platform-specific matching logic directly into the main + execution flow. 
\ No newline at end of file diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh index 5be88dfe4a..2ab7340986 100644 --- a/qa/L0_pytorch_debug_unittest/test.sh +++ b/qa/L0_pytorch_debug_unittest/test.sh @@ -7,6 +7,7 @@ : ${TE_PATH:=/opt/transformerengine} : ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features} : ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/} + : ${XML_LOG_DIR:=/logs} mkdir -p "$XML_LOG_DIR" @@ -20,24 +21,37 @@ FAIL=0 # because it is not available on PyPI. pip install pytest==8.2.1 +METAX_IGNORED_TESTS=( + "$TE_PATH/tests/pytorch/test_numerics.py" + "$TE_PATH/tests/pytorch/test_sanity.py" +) + +should_skip_on_metax() { + local test_path=$1 + + [ "$PLATFORM" = "metax" ] || return 1 + + local ignored_test + for ignored_test in "${METAX_IGNORED_TESTS[@]}"; do + if [ "$test_path" = "$ignored_test" ]; then + echo "[SKIP] Platform MetaX: Ignoring $test_path" + return 0 + fi + done + + return 1 +} + + run_test_step() { local xml_file=$1 local test_path=$2 local cmd=$3 - - if [ "$PLATFORM" = "metax" ]; then - case "$test_path" in - *"test_numerics.py" | *"test_api_features.py" | *"test_sanity.py") - echo "-------------------------------------------------------" - echo "[SKIP] Platform MetaX: Ignoring $test_path" - echo "-------------------------------------------------------" - return 0 - ;; - esac + if should_skip_on_metax "$test_path"; then + return 0 fi - echo "-------------------------------------------------------" echo "[RUN] Executing: $test_path" eval "$cmd" || FAIL=1 @@ -70,8 +84,6 @@ run_test_step "test_perf.xml" "$TE_PATH/tests/pytorch/debug/test_perf.py" \ "pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR" - - # Step 7: Sanity 2 run_test_step "test_sanity_2.xml" "$TE_PATH/tests/pytorch/test_sanity.py" \ "NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 \ diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh index 99a1370ac4..bc4362e23d 100644 --- a/qa/L0_pytorch_unittest/test.sh +++ b/qa/L0_pytorch_unittest/test.sh @@ -22,13 +22,11 @@ run_test_step() { local cmd=$3 local label=$4 - if [ "$PLATFORM" = "metax" ]; then case "$test_path" in *"test_numerics.py" | \ *"test_sanity.py" | \ *"test_parallel_cross_entropy.py" | \ - *"test_cuda_graphs.py" | \ *"test_fused_rope.py" | \ *"test_gqa.py" | \ *"test_fused_optimizer.py" | \ diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh index 04860a9729..46b54ed30d 100644 --- a/qa/L1_pytorch_distributed_unittest/test.sh +++ b/qa/L1_pytorch_distributed_unittest/test.sh @@ -15,29 +15,134 @@ function test_fail() { RET=0 FAILED_CASES="" +DEBUG_TESTS_READY=0 : ${TE_PATH:=/opt/transformerengine} : ${XML_LOG_DIR:=/logs} mkdir -p "$XML_LOG_DIR" +# The current CUDA 12.8 test container hits a fused-attention runtime loader +# issue, so keep the distributed numerics suite on the unfused attention path. +export NVTE_FLASH_ATTN="${NVTE_FLASH_ATTN:-0}" +export NVTE_FUSED_ATTN="${NVTE_FUSED_ATTN:-0}" +export NVTE_UNFUSED_ATTN="${NVTE_UNFUSED_ATTN:-1}" + +# Make CUDA runtime libraries discoverable for fused attention kernels. 
+if [ -z "${CUDA_HOME:-}" ]; then + if [ -d /usr/local/cuda ]; then + export CUDA_HOME=/usr/local/cuda + elif [ -d /usr/local/cuda-12.8 ]; then + export CUDA_HOME=/usr/local/cuda-12.8 + fi +fi +export CUDA_PATH="${CUDA_PATH:-${CUDA_HOME:-}}" + +CUDA_LIB_DIRS=() +for path in \ + "${CUDA_HOME:-}/lib64" \ + "${CUDA_HOME:-}/targets/x86_64-linux/lib" \ + "$(python3 - <<'PY' +import site +from pathlib import Path + +for root in site.getsitepackages(): + candidate = Path(root) / "torch" / "lib" + if candidate.exists(): + print(candidate) + break +PY +)" \ + "$(python3 - <<'PY' +import site +from pathlib import Path + +for root in site.getsitepackages(): + candidate = Path(root) / "nvidia" / "cuda_runtime" / "lib" + if candidate.exists(): + print(candidate) + break +PY +)"; do + if [ -n "$path" ] && [ -d "$path" ]; then + CUDA_LIB_DIRS+=("$path") + fi +done + +if [ "${#CUDA_LIB_DIRS[@]}" -gt 0 ]; then + CUDA_LIB_PATH="$(IFS=:; echo "${CUDA_LIB_DIRS[*]}")" + export LD_LIBRARY_PATH="${CUDA_LIB_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" +fi + +python3 - <<'PY' +import ctypes + +for name in ("libcudart.so", "libcudart.so.12"): + try: + ctypes.CDLL(name, mode=ctypes.RTLD_GLOBAL) + print(f"[CUDA] Preloaded {name}") + break + except OSError as exc: + print(f"[CUDA] Failed to preload {name}: {exc}") +PY + # It is not installed as a requirement, # because it is not available on PyPI. pip uninstall -y nvdlfw-inspect -pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git +if pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git && \ + python3 -c "import nvdlfw_inspect.api" >/dev/null 2>&1; then + DEBUG_TESTS_READY=1 +else + echo "Warning: nvdlfw_inspect is unavailable; debug numerics test will be skipped" +fi pip3 install pytest==8.2.1 || error_exit "Failed to install pytest" +run_test_step() { + local xml_file=$1 + local test_path=$2 + local cmd=$3 + local label=$4 + + if [ "$PLATFORM" = "metax" ]; then + case "$test_path" in + *"test_numerics.py" | \ + *"test_numerics_exact.py" | \ + *"test_torch_fsdp2.py" | \ + *"test_cast_master_weights_to_fp8.py") + echo "-------------------------------------------------------" + echo "[SKIP] Platform MetaX: Ignoring $label" + echo "-------------------------------------------------------" + return 0 + ;; + esac + fi + + echo "-------------------------------------------------------" + echo "[RUN] Executing: $label" + eval "$cmd" || test_fail "$label" +} + # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py" -python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py" -python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py || test_fail "test_numerics_exact.py" +run_test_step "pytest_test_numerics.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \ +"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \ +"test_numerics.py" +run_test_step "pytest_test_numerics_exact.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics_exact.py" \ +"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py" \ +"test_numerics_exact.py" # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml 
$TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || test_fail "test_fusible_ops.py" -python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k "not (test_distributed)" || test_fail "test_torch_fsdp2.py" +run_test_step "pytest_test_torch_fsdp2.xml" "$TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py" \ +"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k 'not (test_distributed)'" \ +"test_torch_fsdp2.py" # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py" # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py" # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py" -python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py || test_fail "test_cp_utils.py" -python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py" +run_test_step "pytest_test_cp_utils.xml" "$TE_PATH/tests/pytorch/attention/test_cp_utils.py" \ +"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py" \ +"test_cp_utils.py" +run_test_step "pytest_test_cast_master_weights_to_fp8.xml" "$TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py" \ +"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py" \ +"test_cast_master_weights_to_fp8.py" # debug tests @@ -50,7 +155,13 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_ # pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py" # standard numerics tests with initialized debug -NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py" +if [ "$DEBUG_TESTS_READY" -eq 1 ]; then + run_test_step "pytest_test_numerics_2.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \ + "NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \ + "test_numerics.py (debug)" +else + echo "Skipping debug test_numerics.py because nvdlfw_inspect is unavailable" +fi if [ "$RET" -ne 0 ]; then echo "Error in the following test cases:$FAILED_CASES" diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh index 
a5130a52d3..b4ccb8f9ad 100644 --- a/qa/L1_pytorch_mcore_integration/test.sh +++ b/qa/L1_pytorch_mcore_integration/test.sh @@ -4,69 +4,149 @@ set -e +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) + +retry_command() { + local attempts=$1 + local delay_seconds=$2 + shift 2 + + local attempt + for attempt in $(seq 1 "${attempts}"); do + if "$@"; then + return 0 + fi + if [ "${attempt}" -lt "${attempts}" ]; then + echo "Command failed (attempt ${attempt}/${attempts}): $*" + echo "Retrying in ${delay_seconds}s..." + sleep "${delay_seconds}" + fi + done + + echo "Command failed after ${attempts} attempts: $*" + return 1 +} + # Paths -: ${TE_PATH:=/opt/transformerengine} -: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM} +: "${TE_PATH:=$(cd -- "${SCRIPT_DIR}/../.." && pwd)}" +: "${MCORE_PATH:=/workspace/Megatron-LM-FL}" +: "${MCORE_REPO_URL:=https://github.com/flagos-ai/Megatron-LM-FL.git}" +: "${MCORE_REF:=main}" +: "${OUTPUT_DIR:=${TE_PATH}/qa/L1_pytorch_mcore_integration/output}" +: "${DATA_CACHE_PATH:=/tmp/data_cache}" # Check whether FP8 is supported -DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g') -if [[ ${DEVICE_ARCH} -ge 89 ]]; then - WITH_FP8=1 +WITH_FP8= +if command -v nvidia-smi &>/dev/null; then + DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g') + if [[ ${DEVICE_ARCH} -ge 89 ]]; then + WITH_FP8=1 + fi +elif command -v mx-smi &>/dev/null; then + # Metax hardware does not support FP8; leave WITH_FP8 unset + : fi -# Download Megatron-LM if needed +# Download or sync Megatron-LM-FL to the requested repo/ref. if [ ! -d "${MCORE_PATH}" ]; then pushd $(dirname ${MCORE_PATH}) - git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM + git config --global --unset-all credential.helper 2>/dev/null || true + git config --system --unset-all credential.helper 2>/dev/null || true + retry_command 3 5 git clone --depth 1 -b "${MCORE_REF}" "${MCORE_REPO_URL}" $(basename ${MCORE_PATH}) popd fi -# Create mock vocab -VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json -printf "" > ${VOCAB_FILE} -printf "{" >> ${VOCAB_FILE} -printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE} -seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE} -printf "}" >> ${VOCAB_FILE} +if [ -d "${MCORE_PATH}/.git" ]; then + git -C "${MCORE_PATH}" remote set-url origin "${MCORE_REPO_URL}" + retry_command 3 5 git -C "${MCORE_PATH}" fetch --depth 1 origin "${MCORE_REF}" + git -C "${MCORE_PATH}" checkout -B "${MCORE_REF}" "FETCH_HEAD" +fi + +# Megatron-LM-FL tokenizer imports happen at module import time, so direct +# source execution needs these Python deps available before pretrain_gpt.py +# starts. +python3 - <<'PY' || python3 -m pip install --disable-pip-version-check six regex +import regex +import six +print(f"six available: {six.__version__}") +print(f"regex available: {regex.__version__}") +PY + +CHECKPOINT_DIR=${OUTPUT_DIR}/checkpoints +TENSORBOARD_DIR=${OUTPUT_DIR}/tensorboard +mkdir -p "${CHECKPOINT_DIR}" "${TENSORBOARD_DIR}" "${DATA_CACHE_PATH}" /tmp/checkpoints + +echo "Using Megatron-LM-FL repo: ${MCORE_REPO_URL}" +echo "Using Megatron-LM-FL ref: ${MCORE_REF}" +git -C "${MCORE_PATH}" rev-parse --short HEAD -# Megatron-LM invocation +# Megatron-LM-FL invocation. Keep the argument shape aligned with the +# previously validated tp1/pp1 mock-data GPT functional case while letting CI +# exit after a few steps. 
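Editorial note on the invocation below: the FP8 flag is appended through bash's `:+` alternate-value expansion, so hardware without FP8 support (WITH_FP8 left unset) simply omits the argument. A two-line demo of the mechanism:

```bash
# ":+" expands to the alternate text only when the variable is set and non-empty.
WITH_FP8=
echo "args:${WITH_FP8:+ --fp8-format hybrid}"   # args:  (flag omitted)
WITH_FP8=1
echo "args:${WITH_FP8:+ --fp8-format hybrid}"   # args: --fp8-format hybrid
```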
-# Megatron-LM invocation
+# Megatron-LM-FL invocation. Keep the argument shape aligned with the
+# previously validated tp1/pp1 mock-data GPT functional case, while letting
+# CI exit after a few training steps.
 COMMAND="
 NVTE_TORCH_COMPILE=0
 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
-NVTE_FLASH_ATTN=1
-NVTE_FWD_LAYERNORM_SM_MARGIN=0
-NVTE_BWD_LAYERNORM_SM_MARGIN=0
 CUDA_DEVICE_MAX_CONNECTIONS=1
-NVTE_BIAS_GELU_NVFUSION=0
-NVTE_BIAS_DROPOUT_FUSION=0
+NCCL_ALGO=Ring
+CUBLAS_WORKSPACE_CONFIG=:4096:8
 
-python3
--m torch.distributed.launch
---use_env
+torchrun
 --nnodes=1
 --nproc_per_node=1
 
 ${MCORE_PATH}/pretrain_gpt.py
 --tensor-model-parallel-size 1
 --pipeline-model-parallel-size 1
---use-cpu-initialization
---num-layers 2
---hidden-size 128
+--num-layers 12
+--hidden-size 512
 --num-attention-heads 8
---seq-length 128
---max-position-embeddings 128
---micro-batch-size 1
---global-batch-size 8
---train-iters 10
+--log-params-norm
+--log-num-zeros-in-grad
+--log-validation-ppl-to-tensorboard
+--log-timers-to-tensorboard
+--seq-length 1024
+--max-position-embeddings 1024
+--micro-batch-size 4
+--global-batch-size 32
+--train-iters 50
 --eval-iters 10
---lr 1e-4
+--timing-log-level 0
+--lr-decay-iters 320000
+--save ${CHECKPOINT_DIR}
+--split 949,50,1
+--tokenizer-type NullTokenizer
+--vocab-size 8192
 --mock-data
---vocab-file ${VOCAB_FILE}
---merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
+--distributed-backend nccl
+--lr 0.00015
+--lr-decay-style cosine
+--min-lr 1.0e-5
+--weight-decay 1e-2
+--clip-grad 1.0
+--lr-warmup-fraction .01
+--log-interval 1
+--save-interval 10000
+--eval-interval 1000
 --transformer-impl transformer_engine
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--deterministic-mode
+--no-gradient-accumulation-fusion
+--attention-softmax-in-fp32
+--use-mcore-models
+--ckpt-format torch_dist
+--dist-ckpt-optim-fully-reshardable
+--dist-ckpt-strictness log_all
+--data-cache-path ${DATA_CACHE_PATH}
+--bf16
+--attention-backend unfused
+--log-memory-to-tensorboard
+--tensorboard-dir ${TENSORBOARD_DIR}
+--exit-interval 4
 ${WITH_FP8:+--fp8-format hybrid}
 "
 COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
 
-# Launch Megatron-LM
+# Launch Megatron-LM-FL
 bash -c "${COMMAND}"
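The launcher change above replaces the deprecated `python3 -m torch.distributed.launch --use_env` entry point with `torchrun`, which reads rank information from the environment by default. For a single-node, single-process run the two are interchangeable; a side-by-side sketch (the script name is hypothetical):

```bash
# Legacy launcher (removed above); --use_env passes LOCAL_RANK via the
# environment instead of the --local_rank argument.
python3 -m torch.distributed.launch --use_env --nnodes=1 --nproc_per_node=1 train.py

# Modern equivalent used by the updated script; env-var passing is the default.
torchrun --nnodes=1 --nproc_per_node=1 train.py
```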
diff --git a/qa/L1_pytorch_mcore_integration/test_bak.sh b/qa/L1_pytorch_mcore_integration/test_bak.sh
new file mode 100644
index 0000000000..ec0b47b695
--- /dev/null
+++ b/qa/L1_pytorch_mcore_integration/test_bak.sh
@@ -0,0 +1,79 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+# Paths
+: ${TE_PATH:=/opt/transformerengine}
+: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
+
+# Check whether FP8 is supported
+DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+    WITH_FP8=1
+fi
+
+# Download Megatron-LM if needed
+if [ ! -d "${MCORE_PATH}" ]; then
+    pushd $(dirname ${MCORE_PATH})
+    git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    popd
+fi
+
+# The Megatron tokenizer import chain pulls in bert_tokenization at module
+# import time, which unconditionally depends on `six`.
+python3 - <<'PY' || python3 -m pip install --disable-pip-version-check six
+import six
+print(f"six available: {six.__version__}")
+PY
+
+# Create mock vocab
+VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
+printf "" > ${VOCAB_FILE}
+printf "{" >> ${VOCAB_FILE}
+printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
+seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
+printf "}" >> ${VOCAB_FILE}
+
+# Megatron-LM invocation
+COMMAND="
+NVTE_TORCH_COMPILE=0
+NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+NVTE_FLASH_ATTN=1
+NVTE_FWD_LAYERNORM_SM_MARGIN=0
+NVTE_BWD_LAYERNORM_SM_MARGIN=0
+CUDA_DEVICE_MAX_CONNECTIONS=1
+NVTE_BIAS_GELU_NVFUSION=0
+NVTE_BIAS_DROPOUT_FUSION=0
+
+python3
+-m torch.distributed.launch
+--use_env
+--nnodes=1
+--nproc_per_node=1
+
+${MCORE_PATH}/pretrain_gpt.py
+--tensor-model-parallel-size 1
+--pipeline-model-parallel-size 1
+--use-cpu-initialization
+--num-layers 2
+--hidden-size 128
+--num-attention-heads 8
+--seq-length 128
+--max-position-embeddings 128
+--micro-batch-size 1
+--global-batch-size 8
+--train-iters 10
+--eval-iters 10
+--lr 1e-4
+--mock-data
+--vocab-file ${VOCAB_FILE}
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
+--transformer-impl transformer_engine
+${WITH_FP8:+--fp8-format hybrid}
+"
+COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
+
+# Launch Megatron-LM
+bash -c "${COMMAND}"
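Both test.sh and test_bak.sh rely on the same probe-or-install idiom: a `python3 - <<'PY' ... PY` heredoc attempts the imports, and the `||` branch runs `pip install` only when that probe exits non-zero, keeping the common already-installed case fast. A generic sketch (the package name is a placeholder):

```bash
# Probe for a module first; install only on failure, so the common case
# (dependency already present) stays fast and network-free.
python3 - <<'PY' || python3 -m pip install --disable-pip-version-check some-package
import some_package  # hypothetical module; replace with the real dependency
PY
```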
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 4309cc4a2e..4045997666 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -14,7 +14,6 @@ def _load_cuda_libs():
     import subprocess
     from pathlib import Path
     import importlib.util
-    import sysconfig
     import platform
     import glob as glob_module
 
@@ -154,7 +153,9 @@ def get_attention_backend(self, attention_params=None):
             fused_attention_backend, use_unfused_attention, available_backends)
         """
         # Import the original get_attention_backend function
-        from transformer_engine.pytorch.attention.dot_product_attention import utils as dpa_utils
+        from transformer_engine.pytorch.attention.dot_product_attention import (
+            utils as dpa_utils,
+        )
         return dpa_utils._original_get_attention_backend(attention_params)
 
@@ -536,7 +537,15 @@ def layernorm_fwd(
         tex = self._get_tex()
         otype = tex.DType(int(otype)) if otype is not None else None
         return tex.layernorm_fwd(
-            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+            input,
+            weight,
+            bias,
+            eps,
+            ln_out,
+            quantizer,
+            otype,
+            sm_margin,
+            zero_centered_gamma,
         )
 
     def layernorm_bwd(
@@ -746,7 +755,12 @@ def fused_amax_and_scale_update_after_reduction(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.fused_amax_and_scale_update_after_reduction(
-            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
+            amax_reduction_buffer,
+            amax_histories,
+            scales,
+            amax_compute_algo,
+            fp8_dtype,
+            margin,
         )
 
     def fp8_block_scaling_compute_partial_amax(
@@ -1028,7 +1042,14 @@ def fused_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_forward(
-            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            input,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_rope_backward(
@@ -1293,7 +1314,13 @@ def thd_out_correction(
     ) -> None:
         tex = self._get_tex()
         return tex.thd_out_correction(
-            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
+            out,
+            out_per_step,
+            lse,
+            lse_per_step,
+            cu_seqlens,
+            only_second_half,
+            lse_packed,
         )
 
     def thd_grad_correction(