Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
260dd5a
CI: refactor workflows, add multi-platform support, fix QA scripts (p…
BrianPei Apr 17, 2026
95c00ff
remove duplicate scripts
BrianPei Apr 17, 2026
263a761
restore nv original ymls
BrianPei Apr 18, 2026
f4ae7cd
modify all_tests to pipeline workflow
BrianPei Apr 18, 2026
f45e9d0
remove custom lint
BrianPei Apr 18, 2026
25e1bf4
remove unnecessary if condition
BrianPei Apr 20, 2026
d1711bc
check conda & python
qqjxzxq Apr 21, 2026
4035436
chore: clean debug leftovers and centralize metax ignore rules
HermiaHuan Apr 21, 2026
e895124
turn back
qqjxzxq Apr 21, 2026
5e281e8
add network config
qqjxzxq Apr 22, 2026
7720975
set network again
qqjxzxq Apr 22, 2026
dc22a22
revert: keep cudart workaround at test layer
HermiaHuan Apr 22, 2026
db6a459
chore: restore original cudart load ordering
HermiaHuan Apr 22, 2026
9467cce
disable original qa-l1 & qa-l3 workflow
BrianPei Apr 22, 2026
24320f7
fix format_check
BrianPei Apr 22, 2026
26c6df9
fix: apply black formatting with correct CI flags
BrianPei Apr 22, 2026
fc2b4a3
remove torchrun standalone for integration_tests
BrianPei Apr 23, 2026
3cc6661
updated build & plugin runner label
BrianPei Apr 23, 2026
9401c7d
fix cuda build scripts
BrianPei Apr 23, 2026
9983ed6
Add clean vscode-remote-container step on metax
BrianPei Apr 23, 2026
0dcd0ce
fix dependencies installation on Metax runner
BrianPei Apr 23, 2026
6f130c4
set git safe directory for build
BrianPei Apr 23, 2026
78a999a
integration_tests job add strategy matrix
BrianPei Apr 23, 2026
c72c2f9
change integration repo to flagOS megatron-LM-FL
BrianPei Apr 24, 2026
b505a6c
execute tests step add activate conda
BrianPei Apr 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions .github/configs/cuda.yml
Original file line number Diff line number Diff line change
@@ -1,26 +1,28 @@
# CUDA Hardware Configuration for TransformerEngine-FL
# Refactored for BAAI DGX A100 Nodes
# Refactored for A100 Nodes
# This file defines environment variables, volumes, and test filters for TE tests.

hardware_name: cuda
display_name: 'NVIDIA CUDA (A100)'

# CI image for online env
ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209

# Runner labels for self-hosted A100 node
# runner_labels:
# - self-hosted
# - Linux
# - X64
# - nvidia
# - gpu-8
# - self-hosted
# - Linux
# - X64
# - nvidia
# - gpu-8

# Runner labels for online env
runner_labels:
- nv-8g-cicd-te

# Container volumes
container_volumes:
- /home/flagscale_cicd/flask/static:/workspace/report
# - /home/flagscale_cicd/data:/opt/data

# Container options
container_options: >-
Expand All @@ -32,9 +34,8 @@ container_options: >-
--ulimit stack=67108864
--user root

# Device types
device_types:
- a100
# Platform-specific environment setup script
setup_script: .github/scripts/setup_cuda.sh

# Build environment variables (platform-specific)
build_env:
Expand All @@ -47,6 +48,10 @@ build_env:
CUDA_HOME: /usr/local/cuda-12.8
NVCC: /usr/local/cuda-12.8/bin/nvcc

# Device types to run tests on
device_types:
- a100

# Test matrix configuration
test_matrix:
l0_pytorch:
Expand Down
47 changes: 22 additions & 25 deletions .github/configs/metax.yml
Original file line number Diff line number Diff line change
@@ -1,46 +1,50 @@
# Metax Hardware Configuration for TE-FL
# This file defines CI/CD settings for Metax-based testing
# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml
# This file defines environment variables, volumes, and test filters for TE tests.

hardware_name: metax
display_name: 'Metax Tests'

ci_image: localhost:5000/megatron-lm-with-te:v1
# ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
# CI image for Metax dev env
# ci_image: localhost:5000/megatron-lm-with-te:v1

runner_labels:
- self-hosted
- Linux
- X64
- metax
- dev
# CI image for online env
ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839

# Runner labels for self-hosted Metax node
# runner_labels:
# - mx-4g-cicd-te
# - self-hosted
# - Linux
# - X64
# - metax
# - dev

# Runner labels for online env
runner_labels:
- mx-4g-cicd-te

# Container volumes
container_volumes:
- /nfs/metax_fs:/nfs/metax_fs
- /dev/dri:/dev/dri
- /dev/mxcd:/dev/mxcd
- /dev/infiniband:/dev/infiniband

# Container options
container_options: >-
--uts=host
--ipc=host
--privileged=true
--group-add video
--shm-size=100gb
--ulimit memlock=-1
--security-opt seccomp=unconfined
--security-opt apparmor=unconfined
--device=/dev/dri
--device=/dev/mxcd
--device=/dev/infiniband
--user root
--ulimit nofile=65535:65535
-e PLATFORM=metax
-e TORCH_DISTRIBUTED_BACKEND=mccl
-e LD_LIBRARY_PATH=/opt/maca/lib:/usr/local/lib:$LD_LIBRARY_PATH

# Platform-specific environment setup script
setup_script: .github/scripts/setup_metax.sh

# Build environment variables (platform-specific)
build_env:
TE_FL_SKIP_CUDA: '1'
NVTE_WITH_MACA: '1'
Expand All @@ -62,10 +66,3 @@ test_matrix:
# example: tests/unit_tests/test_example.py
# - tests/unit_tests/test_inference.py
# - tests/unit_tests/test_rl_utils.py

# functional:
# train:
# - device: c500
# task: train
# model: deepseek
# case: tp2_pp2_ep2
25 changes: 25 additions & 0 deletions .github/scripts/setup_cuda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# CUDA Platform Environment Setup Script
# Called by unit_tests_common.yml for CUDA platforms (A100, H100, etc.)
# Expects GITHUB_ENV and GITHUB_WORKSPACE to be provided by the Actions runner.
set -euo pipefail

echo "===== Step 0: Activate Python environment ====="
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
# Quote GITHUB_ENV: under `set -u` an unset value aborts, but an unquoted
# path containing spaces would word-split and redirect to the wrong file.
echo "PATH=$PATH" >> "$GITHUB_ENV"
echo "Python: $(which python3) ($(python3 --version 2>&1))"

echo "===== Step 1: Remove Existing TransformerEngine ====="
# `|| true` keeps the script alive when the packages are not installed yet.
pip uninstall transformer_engine transformer_engine_torch -y || true

echo "===== Step 2: Build & Install TransformerEngine ====="
cd "$GITHUB_WORKSPACE"

pip install nvdlfw-inspect --quiet
pip install expecttest --quiet
# Build the checked-out source in place; --no-build-isolation reuses the
# activated env's torch/CUDA toolchain instead of a fresh PEP 517 build env.
pip install . -v --no-deps --no-build-isolation

echo "===== Step 3: Verify Installation ====="
python3 tests/pytorch/test_sanity_import.py

echo "===== Environment Setup Complete ====="
50 changes: 50 additions & 0 deletions .github/scripts/setup_metax.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Metax Platform Environment Setup Script
# Called by unit_tests_common.yml for Metax platforms (C500, etc.)
# Expects GITHUB_ENV and GITHUB_WORKSPACE to be provided by the Actions runner.
set -euo pipefail

echo "===== Step 0: Activate Python environment ====="
source /opt/conda/etc/profile.d/conda.sh
conda activate base
# Quote GITHUB_ENV so the redirect target cannot word-split under spaces.
echo "PATH=$PATH" >> "$GITHUB_ENV"
echo "Python: $(which python3) ($(python3 --version 2>&1))"

echo "===== Step 1: Base Environment Setup ====="
# Configure MACA toolchain paths.
# ${LD_LIBRARY_PATH:-} guards against `set -u` aborting the whole script
# when the container did not pre-set LD_LIBRARY_PATH.
export PATH=/opt/maca/bin:$PATH
export LD_LIBRARY_PATH=/opt/maca/lib:${LD_LIBRARY_PATH:-}
service ssh restart

echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
# TransformerEngine expects nvcc, but MACA provides cucc
ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
which nvcc || true

echo "===== Step 3: Install Required System Tools ====="
# Swap the Aliyun mirror for archive.ubuntu.com (reachable from this runner);
# `|| true` on update tolerates transient index failures.
sed -i 's|http://mirrors.aliyun.com/ubuntu|http://archive.ubuntu.com/ubuntu|g' /etc/apt/sources.list
apt-get update -qq || true
apt-get install -y -qq git curl
# Install cmake and ninja via pip (more reliable than apt in this env)
python3 -m pip install cmake ninja torch --no-cache-dir

echo "===== Step 4: Remove Existing TransformerEngine ====="
# Prevent conflicts with preinstalled or incompatible versions
python3 -m pip uninstall transformer_engine -y || true
python3 -m pip install nvdlfw-inspect --no-deps || true

echo "===== Step 5: Install TE-FL Plugin Layer ====="
# Install TransformerEngine-FL Python layer (plugin logic)
cd "$GITHUB_WORKSPACE"
TE_FL_SKIP_CUDA=1 python3 setup.py install

echo "===== Step 6: Final Verification ====="
# Verify both TE Python API and backend are importable.
python3 - <<'EOF'
import transformer_engine
import transformer_engine_torch as te
print("transformer_engine:", transformer_engine)
print("transformer_engine_torch:", te)
EOF

echo "===== Environment Setup Complete ====="
Loading
Loading