From 3a9b4b044afd023d2e2c194d7b6f7618915b0ac2 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 28 Apr 2026 02:04:38 +0000 Subject: [PATCH 1/3] Automate LKG updates in nightly regression workflow - Granted `contents: write` and `pull-requests: write` permissions in the caller workflow. - Updated `.github/workflows/tpu-nightly-regression.yml` to extract exactly installed `vllm` and `tpu-inference` commits and automatically create a PR if those dependencies differ from the ones in the requirements files. Co-authored-by: sizhit2 <32147610+sizhit2@users.noreply.github.com> --- ...uild_and_test_tunix_nightly_regression.yml | 3 +- .github/workflows/tpu-nightly-regression.yml | 87 +++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test_tunix_nightly_regression.yml b/.github/workflows/build_and_test_tunix_nightly_regression.yml index 20abaabb6..bc1fac376 100644 --- a/.github/workflows/build_and_test_tunix_nightly_regression.yml +++ b/.github/workflows/build_and_test_tunix_nightly_regression.yml @@ -32,7 +32,8 @@ concurrency: cancel-in-progress: false permissions: - contents: read + contents: write + pull-requests: write jobs: build_tunix_package: name: Build tunix package diff --git a/.github/workflows/tpu-nightly-regression.yml b/.github/workflows/tpu-nightly-regression.yml index 15537db4f..63f611e24 100644 --- a/.github/workflows/tpu-nightly-regression.yml +++ b/.github/workflows/tpu-nightly-regression.yml @@ -35,6 +35,9 @@ env: jobs: run_prod: + outputs: + vllm_commit: ${{ steps.extract_commits.outputs.vllm_commit }} + tpu_inference_commit: ${{ steps.extract_commits.outputs.tpu_inference_commit }} runs-on: [linux-x86-ct5lp-224-8tpu] environment: testing container: @@ -220,3 +223,87 @@ jobs: fi echo "🎉 All RL scripts completed successfully." + - name: Extract dependency commits + id: extract_commits + run: | + cat << 'EOF' > get_commits.py + import sys + + try: + from importlib.metadata import requires, version + except ImportError: + import pkg_resources + + def get_commit(package_name): + try: + import importlib.metadata + dist = importlib.metadata.distribution(package_name) + # the dist.read_text('direct_url.json') usually contains the exact git commit for git dependencies + import json + direct_url = dist.read_text('direct_url.json') + if direct_url: + data = json.loads(direct_url) + if 'vcs_info' in data and 'commit_id' in data['vcs_info']: + return data['vcs_info']['commit_id'] + except Exception as e: + pass + return None + + vllm_commit = get_commit("vllm") + tpu_inference_commit = get_commit("tpu-inference") + + with open(r"${{ github.output }}", "a") as f: + if vllm_commit: + f.write(f"vllm_commit={vllm_commit}\n") + if tpu_inference_commit: + f.write(f"tpu_inference_commit={tpu_inference_commit}\n") + EOF + sed -i "s|\${{ github.output }}|$GITHUB_OUTPUT|g" get_commits.py + python3 get_commits.py + + update_lkg_pr: + needs: run_prod + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Update requirement files + run: | + VLLM_COMMIT="${{ needs.run_prod.outputs.vllm_commit }}" + TPU_INFERENCE_COMMIT="${{ needs.run_prod.outputs.tpu_inference_commit }}" + + if [ -n "$VLLM_COMMIT" ]; then + sed -i "s|vllm-project/vllm\.git@[a-f0-9]\+|vllm-project/vllm.git@${VLLM_COMMIT}|g" requirements/requirements.txt + fi + + if [ -n "$TPU_INFERENCE_COMMIT" ]; then + sed -i "s|vllm-project/tpu-inference\.git@[a-f0-9]\+|vllm-project/tpu-inference.git@${TPU_INFERENCE_COMMIT}|g" requirements/special_requirements.txt + fi + + - name: Create Pull Request + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + if git diff --exit-code; then + echo "No changes to commit." + exit 0 + fi + + BRANCH_NAME="update-lkg-commits-$(date +%s)" + git checkout -b "$BRANCH_NAME" + git add requirements/requirements.txt requirements/special_requirements.txt + git commit -m "chore: update vLLM and tpu-inference LKG commits" + git push origin "$BRANCH_NAME" + + gh pr create \ + --title "Update vLLM and tpu-inference LKG commits" \ + --body "This PR updates the vLLM and tpu-inference pinned commits to the latest versions that passed the Tunix Nightly Regression tests." \ + --base main \ + --head "$BRANCH_NAME" From e027f1465efe0a770106e5477e11ac502c075a9d Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 29 Apr 2026 21:28:23 +0000 Subject: [PATCH 2/3] ci: add automated LKG capture to nightly regression tests Adds `run_latest` and `update_lkg_pr` jobs to the nightly regression workflow. It pulls the known LKG commit for `vllm` from `tpu-inference`'s upstream and the HEAD commit for `tpu-inference`, runs tests, and updates `requirements.txt` via PR. The `run_prod` job is kept untouched. Co-authored-by: sizhit2 <32147610+sizhit2@users.noreply.github.com> --- .github/workflows/tpu-nightly-regression.yml | 250 +++++++++++++++++-- 1 file changed, 225 insertions(+), 25 deletions(-) diff --git a/.github/workflows/tpu-nightly-regression.yml b/.github/workflows/tpu-nightly-regression.yml index 63f611e24..680434945 100644 --- a/.github/workflows/tpu-nightly-regression.yml +++ b/.github/workflows/tpu-nightly-regression.yml @@ -35,9 +35,6 @@ env: jobs: run_prod: - outputs: - vllm_commit: ${{ steps.extract_commits.outputs.vllm_commit }} - tpu_inference_commit: ${{ steps.extract_commits.outputs.tpu_inference_commit }} runs-on: [linux-x86-ct5lp-224-8tpu] environment: testing container: @@ -223,34 +220,64 @@ jobs: fi echo "🎉 All RL scripts completed successfully." + run_latest: + outputs: + vllm_commit: ${{ steps.extract_commits.outputs.vllm_commit }} + tpu_inference_commit: ${{ steps.extract_commits.outputs.tpu_inference_commit }} + runs-on: [linux-x86-ct5lp-224-8tpu] + environment: testing + container: + image: vllm/vllm-tpu:nightly + options: --privileged + env: + CLOUD_TPU_ACCELERATOR: v5e-8 + JAX_PLATFORMS: tpu,cpu + steps: + + # Cache Hugging Face hub + - name: Cache HF hub + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }} + restore-keys: | + hf-${{ runner.os }}- + + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Extract dependency commits id: extract_commits run: | cat << 'EOF' > get_commits.py - import sys + import urllib.request + import json - try: - from importlib.metadata import requires, version - except ImportError: - import pkg_resources + def get_head_commit(repo): + url = f"https://api.github.com/repos/{repo}/commits/main" + req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) + try: + with urllib.request.urlopen(req) as response: + data = json.loads(response.read().decode()) + return data['sha'] + except Exception as e: + print(f"Error fetching {repo}: {e}") + return None - def get_commit(package_name): + def get_vllm_lkg_commit(): + url = "https://raw.githubusercontent.com/vllm-project/tpu-inference/main/.buildkite/vllm_lkg.version" + req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) try: - import importlib.metadata - dist = importlib.metadata.distribution(package_name) - # the dist.read_text('direct_url.json') usually contains the exact git commit for git dependencies - import json - direct_url = dist.read_text('direct_url.json') - if direct_url: - data = json.loads(direct_url) - if 'vcs_info' in data and 'commit_id' in data['vcs_info']: - return data['vcs_info']['commit_id'] + with urllib.request.urlopen(req) as response: + return response.read().decode().strip() except Exception as e: - pass - return None + print(f"Error fetching vllm lkg: {e}") + return None - vllm_commit = get_commit("vllm") - tpu_inference_commit = get_commit("tpu-inference") + vllm_commit = get_vllm_lkg_commit() + tpu_inference_commit = get_head_commit("vllm-project/tpu-inference") with open(r"${{ github.output }}", "a") as f: if vllm_commit: @@ -261,8 +288,181 @@ jobs: sed -i "s|\${{ github.output }}|$GITHUB_OUTPUT|g" get_commits.py python3 get_commits.py + - name: Install tunix dependencies + run: | + # Update requirement files with specific commits + VLLM_COMMIT="${{ steps.extract_commits.outputs.vllm_commit }}" + TPU_INFERENCE_COMMIT="${{ steps.extract_commits.outputs.tpu_inference_commit }}" + + if [ -n "$VLLM_COMMIT" ]; then + sed -i "s|vllm-project/vllm\.git@[a-f0-9]\+|vllm-project/vllm.git@${VLLM_COMMIT}|g" requirements/requirements.txt + fi + + if [ -n "$TPU_INFERENCE_COMMIT" ]; then + sed -i "s|vllm-project/tpu-inference\.git@[a-f0-9]\+|vllm-project/tpu-inference.git@${TPU_INFERENCE_COMMIT}|g" requirements/special_requirements.txt + fi + + pip install -e .[prod,test] + + - name: Verify TPU availability + run: | + python -c " + import jax + print(f'JAX version: {jax.__version__}') + print(f'JAX devices: {jax.devices()}') + + # Check if we have TPU devices specifically + devices = jax.devices() + has_tpu = len(devices) > 0 and all(device.platform == 'tpu' for device in devices) + print(f'TPU available: {has_tpu}') + + if not has_tpu: + print('ERROR: No TPU devices found! Expected TPU devices but got:', [device.platform for device in devices]) + exit(1) + else: + print(f'SUCCESS: Found {len(devices)} TPU device(s)') + " + + - name: Run regression scripts + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + id: regression_tests + run: | + # Download GSM8K dataset + mkdir -p /tmp/grpo_test/rl/grpo/data + + FAILED=0 + echo "📦 Executing: examples/deepscaler/math_eval_nb.py..." + python examples/deepscaler/math_eval_nb.py || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in colocated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in 2 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --cluster-setup=disaggregated-2-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in 3 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --cluster-setup=disaggregated-3-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in colocated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in 2 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --cluster-setup=disaggregated-2-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in 3 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --cluster-setup=disaggregated-3-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm server mode in 2 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --rollout-server-mode=True --cluster-setup=disaggregated-2-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm server mode in 3 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --rollout-server-mode=True --cluster-setup=disaggregated-3-way || FAILED=1 + + # SGLang Tests + unset JAX_PLATFORMS + pip list | egrep 'jax|flax|libtpu' + cd .. + git clone https://github.com/sgl-project/sglang-jax.git && cd sglang-jax/python && pip install -e . && cd ../.. + pip install jax==0.8.1 flax==0.12.0 libtpu==0.0.24 + pip list | egrep 'jax|flax|libtpu' + cd tunix + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in colocated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in 2 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax --cluster-setup=disaggregated-2-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in 3 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax --cluster-setup=disaggregated-3-way || FAILED=1 + + # echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax with LoRA ..." + # python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine sglang_jax --enable-lora --lora-target-modules all || FAILED=1 + + + if [ "$FAILED" -ne 0 ]; then + echo "One or more scripts failed!" + exit 1 + fi + + - name: Run SFT shell scripts + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + SCRIPT_DIR="./tunix/examples/sft/mtnt" + MAX_STEPS=5 + EVAL_EVERY_N_STEPS=1 + + # Check if directory exists + if [ ! -d "$SCRIPT_DIR" ]; then + echo "Directory $SCRIPT_DIR does not exist" + exit 1 + fi + + echo "🔍 Finding scripts in $SCRIPT_DIR" + for script in "$SCRIPT_DIR"/*.sh; do + if [ -f "$script" ]; then + echo "📦 Executing: $script" + echo "MAX_STEPS=$MAX_STEPS, EVAL_EVERY_N_STEPS=$EVAL_EVERY_N_STEPS" + chmod +x "$script" + if bash "$script" \ + --training_config.max_steps "$MAX_STEPS" \ + --training_config.eval_every_n_steps "$EVAL_EVERY_N_STEPS"; then + echo "✅ Successfully completed: $script" + else + exit_code=$? + echo "❌ Failed to complete: $script (Exit Code: $exit_code)" >&2 + exit "$exit_code" + fi + fi + done + echo "🎉 All SFT scripts completed successfully." + + - name: Run RL shell scripts + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + SCRIPT_DIR="./tunix/examples/rl/grpo/gsm8k" + EXCLUDE_DIR="verl_compatible" + MAX_STEPS=5 + EVAL_EVERY_N_STEPS=1 + + if [ ! -d "$SCRIPT_DIR" ]; then + echo "Directory $SCRIPT_DIR does not exist" >&2 + exit 1 + fi + + echo "🔍 Finding scripts in $SCRIPT_DIR, excluding $EXCLUDE_DIR" + final_exit_code=0 + + while IFS= read -r script; do + if [ -f "$script" ]; then + echo "📦 Executing: $script" + echo "MAX_STEPS=$MAX_STEPS, EVAL_EVERY_N_STEPS=$EVAL_EVERY_N_STEPS" + chmod +x "$script" + if ! bash "$script" \ + --rl_training_config.max_steps "$MAX_STEPS" \ + --rl_training_config.eval_every_n_steps "$EVAL_EVERY_N_STEPS"; then + exit_code=$? + echo "❌ Failed to complete: $script (Exit Code: $exit_code)" >&2 + final_exit_code=$exit_code + # Stop processing further scripts after the first failure + break + else + echo "✅ Successfully completed: $script" + fi + fi + done < <(find "$SCRIPT_DIR" -name "*.sh" -type f | grep -v "$SCRIPT_DIR/$EXCLUDE_DIR/") + + if [ "$final_exit_code" -ne 0 ]; then + echo "🚫 One or more RL scripts failed. Exiting with code $final_exit_code." >&2 + exit "$final_exit_code" + fi + echo "🎉 All RL scripts completed successfully." + update_lkg_pr: - needs: run_prod + needs: run_latest runs-on: ubuntu-latest permissions: contents: write @@ -273,8 +473,8 @@ jobs: - name: Update requirement files run: | - VLLM_COMMIT="${{ needs.run_prod.outputs.vllm_commit }}" - TPU_INFERENCE_COMMIT="${{ needs.run_prod.outputs.tpu_inference_commit }}" + VLLM_COMMIT="${{ needs.run_latest.outputs.vllm_commit }}" + TPU_INFERENCE_COMMIT="${{ needs.run_latest.outputs.tpu_inference_commit }}" if [ -n "$VLLM_COMMIT" ]; then sed -i "s|vllm-project/vllm\.git@[a-f0-9]\+|vllm-project/vllm.git@${VLLM_COMMIT}|g" requirements/requirements.txt From de62d11f2f7732286218a48d2b665dec1a04ce49 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 29 Apr 2026 22:54:17 +0000 Subject: [PATCH 3/3] ci: automate LKG commits generation for vllm and tpu-inference Moves test steps into a composite action and adds a job to update LKGs with git ls-remote and buildkite curl endpoint. Co-authored-by: sizhit2 <32147610+sizhit2@users.noreply.github.com> --- .../actions/run_regression_tests/action.yml | 150 +++++++++ .github/workflows/tpu-nightly-regression.yml | 300 +----------------- 2 files changed, 166 insertions(+), 284 deletions(-) create mode 100644 .github/actions/run_regression_tests/action.yml diff --git a/.github/actions/run_regression_tests/action.yml b/.github/actions/run_regression_tests/action.yml new file mode 100644 index 000000000..7dacd03a1 --- /dev/null +++ b/.github/actions/run_regression_tests/action.yml @@ -0,0 +1,150 @@ +name: "Run Regression Tests" +description: "Runs Tunix regression tests, SFT shell scripts, and RL shell scripts." +inputs: + hf_token: + description: "HuggingFace token for model downloads" + required: true + +runs: + using: "composite" + steps: + - name: Run regression scripts + env: + HF_TOKEN: ${{ inputs.hf_token }} + id: regression_tests + shell: bash + run: | + # Download GSM8K dataset + mkdir -p /tmp/grpo_test/rl/grpo/data + + FAILED=0 + echo "📦 Executing: examples/deepscaler/math_eval_nb.py..." + python examples/deepscaler/math_eval_nb.py || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in colocated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in 2 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --cluster-setup=disaggregated-2-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in 3 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --cluster-setup=disaggregated-3-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in colocated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in 2 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --cluster-setup=disaggregated-2-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in 3 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --cluster-setup=disaggregated-3-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm server mode in 2 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --rollout-server-mode=True --cluster-setup=disaggregated-2-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm server mode in 3 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --rollout-server-mode=True --cluster-setup=disaggregated-3-way || FAILED=1 + + # SGLang Tests + unset JAX_PLATFORMS + pip list | egrep 'jax|flax|libtpu' + cd .. + git clone https://github.com/sgl-project/sglang-jax.git && cd sglang-jax/python && pip install -e . && cd ../.. + pip install jax==0.8.1 flax==0.12.0 libtpu==0.0.24 + pip list | egrep 'jax|flax|libtpu' + cd tunix + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in colocated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in 2 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax --cluster-setup=disaggregated-2-way || FAILED=1 + + echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in 3 way disaggregated mode ..." + python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax --cluster-setup=disaggregated-3-way || FAILED=1 + + # echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax with LoRA ..." + # python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine sglang_jax --enable-lora --lora-target-modules all || FAILED=1 + + + if [ "$FAILED" -ne 0 ]; then + echo "One or more scripts failed!" + exit 1 + fi + + - name: Run SFT shell scripts + env: + HF_TOKEN: ${{ inputs.hf_token }} + shell: bash + run: | + SCRIPT_DIR="./tunix/examples/sft/mtnt" + MAX_STEPS=5 + EVAL_EVERY_N_STEPS=1 + + # Check if directory exists + if [ ! -d "$SCRIPT_DIR" ]; then + echo "Directory $SCRIPT_DIR does not exist" + exit 1 + fi + + echo "🔍 Finding scripts in $SCRIPT_DIR" + for script in "$SCRIPT_DIR"/*.sh; do + if [ -f "$script" ]; then + echo "📦 Executing: $script" + echo "MAX_STEPS=$MAX_STEPS, EVAL_EVERY_N_STEPS=$EVAL_EVERY_N_STEPS" + chmod +x "$script" + if bash "$script" \ + --training_config.max_steps "$MAX_STEPS" \ + --training_config.eval_every_n_steps "$EVAL_EVERY_N_STEPS"; then + echo "✅ Successfully completed: $script" + else + exit_code=$? + echo "❌ Failed to complete: $script (Exit Code: $exit_code)" >&2 + exit "$exit_code" + fi + fi + done + echo "🎉 All SFT scripts completed successfully." + + - name: Run RL shell scripts + env: + HF_TOKEN: ${{ inputs.hf_token }} + shell: bash + run: | + SCRIPT_DIR="./tunix/examples/rl/grpo/gsm8k" + EXCLUDE_DIR="verl_compatible" + MAX_STEPS=5 + EVAL_EVERY_N_STEPS=1 + + if [ ! -d "$SCRIPT_DIR" ]; then + echo "Directory $SCRIPT_DIR does not exist" >&2 + exit 1 + fi + + echo "🔍 Finding scripts in $SCRIPT_DIR, excluding $EXCLUDE_DIR" + final_exit_code=0 + + while IFS= read -r script; do + if [ -f "$script" ]; then + echo "📦 Executing: $script" + echo "MAX_STEPS=$MAX_STEPS, EVAL_EVERY_N_STEPS=$EVAL_EVERY_N_STEPS" + chmod +x "$script" + if ! bash "$script" \ + --rl_training_config.max_steps "$MAX_STEPS" \ + --rl_training_config.eval_every_n_steps "$EVAL_EVERY_N_STEPS"; then + exit_code=$? + echo "❌ Failed to complete: $script (Exit Code: $exit_code)" >&2 + final_exit_code=$exit_code + # Stop processing further scripts after the first failure + break + else + echo "✅ Successfully completed: $script" + fi + fi + done < <(find "$SCRIPT_DIR" -name "*.sh" -type f | grep -v "$SCRIPT_DIR/$EXCLUDE_DIR/") + + if [ "$final_exit_code" -ne 0 ]; then + echo "🚫 One or more RL scripts failed. Exiting with code $final_exit_code." >&2 + exit "$final_exit_code" + fi + echo "🎉 All RL scripts completed successfully." diff --git a/.github/workflows/tpu-nightly-regression.yml b/.github/workflows/tpu-nightly-regression.yml index 680434945..46b047827 100644 --- a/.github/workflows/tpu-nightly-regression.yml +++ b/.github/workflows/tpu-nightly-regression.yml @@ -82,143 +82,10 @@ jobs: print(f'SUCCESS: Found {len(devices)} TPU device(s)') " - - name: Run regression scripts - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - id: regression_tests - run: | - # Download GSM8K dataset - mkdir -p /tmp/grpo_test/rl/grpo/data - - FAILED=0 - echo "📦 Executing: examples/deepscaler/math_eval_nb.py..." - python examples/deepscaler/math_eval_nb.py || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in colocated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in 2 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --cluster-setup=disaggregated-2-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in 3 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --cluster-setup=disaggregated-3-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in colocated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in 2 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --cluster-setup=disaggregated-2-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in 3 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --cluster-setup=disaggregated-3-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm server mode in 2 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --rollout-server-mode=True --cluster-setup=disaggregated-2-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm server mode in 3 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --rollout-server-mode=True --cluster-setup=disaggregated-3-way || FAILED=1 - - # SGLang Tests - unset JAX_PLATFORMS - pip list | egrep 'jax|flax|libtpu' - cd .. - git clone https://github.com/sgl-project/sglang-jax.git && cd sglang-jax/python && pip install -e . && cd ../.. - pip install jax==0.8.1 flax==0.12.0 libtpu==0.0.24 - pip list | egrep 'jax|flax|libtpu' - cd tunix - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in colocated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in 2 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax --cluster-setup=disaggregated-2-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in 3 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax --cluster-setup=disaggregated-3-way || FAILED=1 - - # echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax with LoRA ..." - # python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine sglang_jax --enable-lora --lora-target-modules all || FAILED=1 - - - if [ "$FAILED" -ne 0 ]; then - echo "One or more scripts failed!" - exit 1 - fi - - - name: Run SFT shell scripts - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - SCRIPT_DIR="./tunix/examples/sft/mtnt" - MAX_STEPS=5 - EVAL_EVERY_N_STEPS=1 - - # Check if directory exists - if [ ! -d "$SCRIPT_DIR" ]; then - echo "Directory $SCRIPT_DIR does not exist" - exit 1 - fi - - echo "🔍 Finding scripts in $SCRIPT_DIR" - for script in "$SCRIPT_DIR"/*.sh; do - if [ -f "$script" ]; then - echo "📦 Executing: $script" - echo "MAX_STEPS=$MAX_STEPS, EVAL_EVERY_N_STEPS=$EVAL_EVERY_N_STEPS" - chmod +x "$script" - if bash "$script" \ - --training_config.max_steps "$MAX_STEPS" \ - --training_config.eval_every_n_steps "$EVAL_EVERY_N_STEPS"; then - echo "✅ Successfully completed: $script" - else - exit_code=$? - echo "❌ Failed to complete: $script (Exit Code: $exit_code)" >&2 - exit "$exit_code" - fi - fi - done - echo "🎉 All SFT scripts completed successfully." - - - name: Run RL shell scripts - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - SCRIPT_DIR="./tunix/examples/rl/grpo/gsm8k" - EXCLUDE_DIR="verl_compatible" - MAX_STEPS=5 - EVAL_EVERY_N_STEPS=1 - - if [ ! -d "$SCRIPT_DIR" ]; then - echo "Directory $SCRIPT_DIR does not exist" >&2 - exit 1 - fi - - echo "🔍 Finding scripts in $SCRIPT_DIR, excluding $EXCLUDE_DIR" - final_exit_code=0 - - while IFS= read -r script; do - if [ -f "$script" ]; then - echo "📦 Executing: $script" - echo "MAX_STEPS=$MAX_STEPS, EVAL_EVERY_N_STEPS=$EVAL_EVERY_N_STEPS" - chmod +x "$script" - if ! bash "$script" \ - --rl_training_config.max_steps "$MAX_STEPS" \ - --rl_training_config.eval_every_n_steps "$EVAL_EVERY_N_STEPS"; then - exit_code=$? - echo "❌ Failed to complete: $script (Exit Code: $exit_code)" >&2 - final_exit_code=$exit_code - # Stop processing further scripts after the first failure - break - else - echo "✅ Successfully completed: $script" - fi - fi - done < <(find "$SCRIPT_DIR" -name "*.sh" -type f | grep -v "$SCRIPT_DIR/$EXCLUDE_DIR/") - - if [ "$final_exit_code" -ne 0 ]; then - echo "🚫 One or more RL scripts failed. Exiting with code $final_exit_code." >&2 - exit "$final_exit_code" - fi - echo "🎉 All RL scripts completed successfully." + - name: Run regression tests (shared) + uses: ./.github/actions/run_regression_tests + with: + hf_token: ${{ secrets.HF_TOKEN }} run_latest: outputs: @@ -254,16 +121,15 @@ jobs: cat << 'EOF' > get_commits.py import urllib.request import json + import os + import subprocess - def get_head_commit(repo): - url = f"https://api.github.com/repos/{repo}/commits/main" - req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) + def get_head_commit(repo_url): try: - with urllib.request.urlopen(req) as response: - data = json.loads(response.read().decode()) - return data['sha'] + result = subprocess.run(["git", "ls-remote", repo_url, "HEAD"], capture_output=True, text=True, check=True) + return result.stdout.split()[0] except Exception as e: - print(f"Error fetching {repo}: {e}") + print(f"Error fetching {repo_url}: {e}") return None def get_vllm_lkg_commit(): @@ -277,15 +143,14 @@ jobs: return None vllm_commit = get_vllm_lkg_commit() - tpu_inference_commit = get_head_commit("vllm-project/tpu-inference") + tpu_inference_commit = get_head_commit("https://github.com/vllm-project/tpu-inference.git") - with open(r"${{ github.output }}", "a") as f: + with open(os.environ["GITHUB_OUTPUT"], "a") as f: if vllm_commit: f.write(f"vllm_commit={vllm_commit}\n") if tpu_inference_commit: f.write(f"tpu_inference_commit={tpu_inference_commit}\n") EOF - sed -i "s|\${{ github.output }}|$GITHUB_OUTPUT|g" get_commits.py python3 get_commits.py - name: Install tunix dependencies @@ -323,143 +188,10 @@ jobs: print(f'SUCCESS: Found {len(devices)} TPU device(s)') " - - name: Run regression scripts - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - id: regression_tests - run: | - # Download GSM8K dataset - mkdir -p /tmp/grpo_test/rl/grpo/data - - FAILED=0 - echo "📦 Executing: examples/deepscaler/math_eval_nb.py..." - python examples/deepscaler/math_eval_nb.py || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in colocated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in 2 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --cluster-setup=disaggregated-2-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vanilla rollout engine in 3 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --cluster-setup=disaggregated-3-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in colocated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in 2 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --cluster-setup=disaggregated-2-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm rollout engine in 3 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --cluster-setup=disaggregated-3-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm server mode in 2 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --rollout-server-mode=True --cluster-setup=disaggregated-2-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with vllm server mode in 3 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=vllm --rollout-server-mode=True --cluster-setup=disaggregated-3-way || FAILED=1 - - # SGLang Tests - unset JAX_PLATFORMS - pip list | egrep 'jax|flax|libtpu' - cd .. - git clone https://github.com/sgl-project/sglang-jax.git && cd sglang-jax/python && pip install -e . && cd ../.. - pip install jax==0.8.1 flax==0.12.0 libtpu==0.0.24 - pip list | egrep 'jax|flax|libtpu' - cd tunix - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in colocated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in 2 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax --cluster-setup=disaggregated-2-way || FAILED=1 - - echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax in 3 way disaggregated mode ..." - python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine=sglang_jax --cluster-setup=disaggregated-3-way || FAILED=1 - - # echo "📦 Executing: scripts/grpo_demo_llama3_qwen2.py with sglang_jax with LoRA ..." - # python scripts/grpo_demo_llama3_qwen2.py --root-dir=/tmp/grpo_test --num-batches=20 --rollout-engine sglang_jax --enable-lora --lora-target-modules all || FAILED=1 - - - if [ "$FAILED" -ne 0 ]; then - echo "One or more scripts failed!" - exit 1 - fi - - - name: Run SFT shell scripts - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - SCRIPT_DIR="./tunix/examples/sft/mtnt" - MAX_STEPS=5 - EVAL_EVERY_N_STEPS=1 - - # Check if directory exists - if [ ! -d "$SCRIPT_DIR" ]; then - echo "Directory $SCRIPT_DIR does not exist" - exit 1 - fi - - echo "🔍 Finding scripts in $SCRIPT_DIR" - for script in "$SCRIPT_DIR"/*.sh; do - if [ -f "$script" ]; then - echo "📦 Executing: $script" - echo "MAX_STEPS=$MAX_STEPS, EVAL_EVERY_N_STEPS=$EVAL_EVERY_N_STEPS" - chmod +x "$script" - if bash "$script" \ - --training_config.max_steps "$MAX_STEPS" \ - --training_config.eval_every_n_steps "$EVAL_EVERY_N_STEPS"; then - echo "✅ Successfully completed: $script" - else - exit_code=$? - echo "❌ Failed to complete: $script (Exit Code: $exit_code)" >&2 - exit "$exit_code" - fi - fi - done - echo "🎉 All SFT scripts completed successfully." - - - name: Run RL shell scripts - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - SCRIPT_DIR="./tunix/examples/rl/grpo/gsm8k" - EXCLUDE_DIR="verl_compatible" - MAX_STEPS=5 - EVAL_EVERY_N_STEPS=1 - - if [ ! -d "$SCRIPT_DIR" ]; then - echo "Directory $SCRIPT_DIR does not exist" >&2 - exit 1 - fi - - echo "🔍 Finding scripts in $SCRIPT_DIR, excluding $EXCLUDE_DIR" - final_exit_code=0 - - while IFS= read -r script; do - if [ -f "$script" ]; then - echo "📦 Executing: $script" - echo "MAX_STEPS=$MAX_STEPS, EVAL_EVERY_N_STEPS=$EVAL_EVERY_N_STEPS" - chmod +x "$script" - if ! bash "$script" \ - --rl_training_config.max_steps "$MAX_STEPS" \ - --rl_training_config.eval_every_n_steps "$EVAL_EVERY_N_STEPS"; then - exit_code=$? - echo "❌ Failed to complete: $script (Exit Code: $exit_code)" >&2 - final_exit_code=$exit_code - # Stop processing further scripts after the first failure - break - else - echo "✅ Successfully completed: $script" - fi - fi - done < <(find "$SCRIPT_DIR" -name "*.sh" -type f | grep -v "$SCRIPT_DIR/$EXCLUDE_DIR/") - - if [ "$final_exit_code" -ne 0 ]; then - echo "🚫 One or more RL scripts failed. Exiting with code $final_exit_code." >&2 - exit "$final_exit_code" - fi - echo "🎉 All RL scripts completed successfully." + - name: Run regression tests (shared) + uses: ./.github/actions/run_regression_tests + with: + hf_token: ${{ secrets.HF_TOKEN }} update_lkg_pr: needs: run_latest