Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
260dd5a
CI: refactor workflows, add multi-platform support, fix QA scripts (p…
BrianPei Apr 17, 2026
95c00ff
remove duplicate scripts
BrianPei Apr 17, 2026
263a761
restore nv original ymls
BrianPei Apr 18, 2026
f4ae7cd
modify all_tests to pipeline workflow
BrianPei Apr 18, 2026
f45e9d0
remove custom lint
BrianPei Apr 18, 2026
25e1bf4
remove unnecessary if condition
BrianPei Apr 20, 2026
d1711bc
check conda & python
qqjxzxq Apr 21, 2026
4035436
chore: clean debug leftovers and centralize metax ignore rules
HermiaHuan Apr 21, 2026
e895124
turn back
qqjxzxq Apr 21, 2026
5e281e8
add network config
qqjxzxq Apr 22, 2026
7720975
set network again
qqjxzxq Apr 22, 2026
dc22a22
revert: keep cudart workaround at test layer
HermiaHuan Apr 22, 2026
db6a459
chore: restore original cudart load ordering
HermiaHuan Apr 22, 2026
9467cce
disable original qa-l1 & qa-l3 workflow
BrianPei Apr 22, 2026
24320f7
fix format_check
BrianPei Apr 22, 2026
26c6df9
fix: apply black formatting with correct CI flags
BrianPei Apr 22, 2026
fc2b4a3
remove torchrun standalone for integration_tests
BrianPei Apr 23, 2026
3cc6661
updated build & plugin runner label
BrianPei Apr 23, 2026
9401c7d
fix cuda build scripts
BrianPei Apr 23, 2026
9983ed6
Add clean vscode-remote-container step on metax
BrianPei Apr 23, 2026
0dcd0ce
fix dependencies installation on Metax runner
BrianPei Apr 23, 2026
6f130c4
set git safe directory for build
BrianPei Apr 23, 2026
78a999a
integration_tests job add strategy matrix
BrianPei Apr 23, 2026
c72c2f9
change integration repo to flagOS megatron-LM-FL
BrianPei Apr 24, 2026
b505a6c
execute tests step add activate conda
BrianPei Apr 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions .github/configs/cuda.yml
Original file line number Diff line number Diff line change
@@ -1,26 +1,28 @@
# CUDA Hardware Configuration for TransformerEngine-FL
# Refactored for BAAI DGX A100 Nodes
# Refactored for A100 Nodes
# This file defines environment variables, volumes, and test filters for TE tests.

hardware_name: cuda
display_name: 'NVIDIA CUDA (A100)'

# CI image for online env
ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209

# Runner labels for self-hosted A100 node
# runner_labels:
# - self-hosted
# - Linux
# - X64
# - nvidia
# - gpu-8
# - self-hosted
# - Linux
# - X64
# - nvidia
# - gpu-8

# Runner labels for online env
runner_labels:
- nv-8g-cicd-te

# Container volumes
container_volumes:
- /home/flagscale_cicd/flask/static:/workspace/report
# - /home/flagscale_cicd/data:/opt/data

# Container options
container_options: >-
Expand All @@ -32,9 +34,8 @@ container_options: >-
--ulimit stack=67108864
--user root

# Device types
device_types:
- a100
# Platform-specific environment setup script
setup_script: .github/scripts/setup_cuda.sh

# Build environment variables (platform-specific)
build_env:
Expand All @@ -47,6 +48,10 @@ build_env:
CUDA_HOME: /usr/local/cuda-12.8
NVCC: /usr/local/cuda-12.8/bin/nvcc

# Device types to run tests on
device_types:
- a100

# Test matrix configuration
test_matrix:
l0_pytorch:
Expand Down
47 changes: 22 additions & 25 deletions .github/configs/metax.yml
Original file line number Diff line number Diff line change
@@ -1,46 +1,50 @@
# Metax Hardware Configuration for TE-FL
# This file defines CI/CD settings for Metax-based testing
# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml
# This file defines environment variables, volumes, and test filters for TE tests.

hardware_name: metax
display_name: 'Metax Tests'

ci_image: localhost:5000/megatron-lm-with-te:v1
# ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
# CI image for Metax dev env
# ci_image: localhost:5000/megatron-lm-with-te:v1

runner_labels:
- self-hosted
- Linux
- X64
- metax
- dev
# CI image for online env
ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839

# Runner labels for self-hosted Metax node
# runner_labels:
# - mx-4g-cicd-te
# - self-hosted
# - Linux
# - X64
# - metax
# - dev

# Runner labels for online env
runner_labels:
- mx-4g-cicd-te

# Container volumes
container_volumes:
- /nfs/metax_fs:/nfs/metax_fs
- /dev/dri:/dev/dri
- /dev/mxcd:/dev/mxcd
- /dev/infiniband:/dev/infiniband

# Container options
container_options: >-
--uts=host
--ipc=host
--privileged=true
--group-add video
--shm-size=100gb
--ulimit memlock=-1
--security-opt seccomp=unconfined
--security-opt apparmor=unconfined
--device=/dev/dri
--device=/dev/mxcd
--device=/dev/infiniband
--user root
--ulimit nofile=65535:65535
-e PLATFORM=metax
-e TORCH_DISTRIBUTED_BACKEND=mccl
-e LD_LIBRARY_PATH=/opt/maca/lib:/usr/local/lib:$LD_LIBRARY_PATH

# Platform-specific environment setup script
setup_script: .github/scripts/setup_metax.sh

# Build environment variables (platform-specific)
build_env:
TE_FL_SKIP_CUDA: '1'
NVTE_WITH_MACA: '1'
Expand All @@ -62,10 +66,3 @@ test_matrix:
# example: tests/unit_tests/test_example.py
# - tests/unit_tests/test_inference.py
# - tests/unit_tests/test_rl_utils.py

# functional:
# train:
# - device: c500
# task: train
# model: deepseek
# case: tp2_pp2_ep2
25 changes: 25 additions & 0 deletions .github/scripts/setup_cuda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# CUDA Platform Environment Setup Script
# Called by unit_tests_common.yml for CUDA platforms (A100, H100, etc.)
# Expects GITHUB_ENV and GITHUB_WORKSPACE to be provided by the Actions runner.
set -euo pipefail

echo "===== Step 0: Activate Python environment ====="
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
# Quote GITHUB_ENV: under `set -u` an unset value aborts, but an unquoted
# path containing spaces would word-split and redirect to the wrong file.
echo "PATH=$PATH" >> "$GITHUB_ENV"
echo "Python: $(which python3) ($(python3 --version 2>&1))"

echo "===== Step 1: Remove Existing TransformerEngine ====="
# `|| true` keeps the script alive when the packages are not installed yet.
pip uninstall transformer_engine transformer_engine_torch -y || true

echo "===== Step 2: Build & Install TransformerEngine ====="
cd "$GITHUB_WORKSPACE"

pip install nvdlfw-inspect --quiet
pip install expecttest --quiet
# Build the checked-out source in place; --no-build-isolation reuses the
# activated env's torch/CUDA toolchain instead of a fresh PEP 517 build env.
pip install . -v --no-deps --no-build-isolation

echo "===== Step 3: Verify Installation ====="
python3 tests/pytorch/test_sanity_import.py

echo "===== Environment Setup Complete ====="
50 changes: 50 additions & 0 deletions .github/scripts/setup_metax.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Metax Platform Environment Setup Script
# Called by unit_tests_common.yml for Metax platforms (C500, etc.)
# Expects GITHUB_ENV and GITHUB_WORKSPACE to be provided by the Actions runner.
set -euo pipefail

echo "===== Step 0: Activate Python environment ====="
source /opt/conda/etc/profile.d/conda.sh
conda activate base
# Quote GITHUB_ENV so the redirect target cannot word-split under spaces.
echo "PATH=$PATH" >> "$GITHUB_ENV"
echo "Python: $(which python3) ($(python3 --version 2>&1))"

echo "===== Step 1: Base Environment Setup ====="
# Configure MACA toolchain paths.
# ${LD_LIBRARY_PATH:-} guards against `set -u` aborting the whole script
# when the container did not pre-set LD_LIBRARY_PATH.
export PATH=/opt/maca/bin:$PATH
export LD_LIBRARY_PATH=/opt/maca/lib:${LD_LIBRARY_PATH:-}
service ssh restart

echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
# TransformerEngine expects nvcc, but MACA provides cucc
ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
which nvcc || true

echo "===== Step 3: Install Required System Tools ====="
# Swap the Aliyun mirror for archive.ubuntu.com (reachable from this runner);
# `|| true` on update tolerates transient index failures.
sed -i 's|http://mirrors.aliyun.com/ubuntu|http://archive.ubuntu.com/ubuntu|g' /etc/apt/sources.list
apt-get update -qq || true
apt-get install -y -qq git curl
# Install cmake and ninja via pip (more reliable than apt in this env)
python3 -m pip install cmake ninja torch --no-cache-dir

echo "===== Step 4: Remove Existing TransformerEngine ====="
# Prevent conflicts with preinstalled or incompatible versions
python3 -m pip uninstall transformer_engine -y || true
python3 -m pip install nvdlfw-inspect --no-deps || true

echo "===== Step 5: Install TE-FL Plugin Layer ====="
# Install TransformerEngine-FL Python layer (plugin logic)
cd "$GITHUB_WORKSPACE"
TE_FL_SKIP_CUDA=1 python3 setup.py install

echo "===== Step 6: Final Verification ====="
# Verify both TE Python API and backend are importable.
python3 - <<'EOF'
import transformer_engine
import transformer_engine_torch as te
print("transformer_engine:", transformer_engine)
print("transformer_engine_torch:", te)
EOF

echo "===== Environment Setup Complete ====="
Loading
Loading