Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 10 additions & 39 deletions .github/scripts/common.py → .github/scripts/ci_common.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,23 @@
from __future__ import annotations

# Standard-library imports, grouped and alphabetized per PEP 8.
# Fix: the previous block imported `json` and `urllib.parse` twice.
import json
import os
import re
import subprocess
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path


# Test files with this marker are intentionally scheduled on CPU-only runners.
# Matches a line consisting solely of "# GPU=-1" (optionally followed by
# trailing whitespace) anywhere in the file, thanks to re.MULTILINE.
GPU_DISABLED_MARKER = re.compile(r"^# GPU=-1\s*$", re.MULTILINE)


def now_ms() -> int:
    """Return the current Unix time in whole milliseconds (truncated)."""
    nanoseconds = time.time_ns()
    return nanoseconds // 1_000_000


def fetch_text(url: str, *, timeout: float, suppress_error: bool = False) -> str:
    """Fetch *url* and return the body decoded as UTF-8 (bad bytes replaced).

    Network, timeout, and OS errors propagate by default; when
    *suppress_error* is true they are printed and "" is returned instead.
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            payload = response.read()
    except (urllib.error.URLError, TimeoutError, OSError) as exc:
        if not suppress_error:
            raise
        print(f"Request failed for {url}: {exc}")
        return ""
    # errors="replace" makes decoding infallible, so it can live outside the try.
    return payload.decode("utf-8", errors="replace")


def fetch_with_retry(url: str, *, timeout: float, retries: int, retry_delay: float) -> str:
    """Fetch *url* via fetch_text, retrying up to *retries* extra times.

    Sleeps *retry_delay* seconds between attempts. On success returns the
    body; once every attempt has failed, prints the final error and
    returns "".
    """
    last_error: Exception | None = None
    attempts = retries + 1
    for attempt in range(attempts):
        try:
            return fetch_text(url, timeout=timeout)
        except (urllib.error.URLError, TimeoutError, OSError) as exc:
            last_error = exc
            # Only pause when another attempt is still coming.
            if attempt + 1 < attempts:
                time.sleep(retry_delay)
    if last_error is not None:
        print(f"Request failed after retries: {last_error}")
    return ""


def normalize_base_url(base_url: str) -> str:
    """Drop every trailing "/" so path segments can be appended safely."""
    normalized = base_url
    while normalized.endswith("/"):
        normalized = normalized[:-1]
    return normalized

Expand Down Expand Up @@ -89,14 +66,14 @@ def request_json_with_retry(


def append_github_env(name: str, value: str) -> None:
    """Append name=value to the file referenced by $GITHUB_ENV.

    No-op when GITHUB_ENV is unset (e.g. running outside GitHub Actions).
    Fix: removed a stale duplicate call to the renamed `_append_github_file`,
    which no longer exists and would raise NameError.
    """
    append_github_file(os.environ.get("GITHUB_ENV"), name, value)


def append_github_output(name: str, value: str) -> None:
    """Append name=value to the file referenced by $GITHUB_OUTPUT.

    No-op when GITHUB_OUTPUT is unset (e.g. running outside GitHub Actions).
    Fix: removed a stale duplicate call to the renamed `_append_github_file`,
    which no longer exists and would raise NameError.
    """
    append_github_file(os.environ.get("GITHUB_OUTPUT"), name, value)


def _append_github_file(target: str | None, name: str, value: str) -> None:
def append_github_file(target: str | None, name: str, value: str) -> None:
if not target:
return
with open(target, "a", encoding="utf-8") as fh:
Expand Down Expand Up @@ -126,18 +103,12 @@ def test_requires_gpu(test_file: str) -> bool:
return GPU_DISABLED_MARKER.search(contents) is None


def quote_url_value(value: str) -> str:
    """Percent-encode *value* for a URL, escaping every reserved character."""
    # safe="" means even "/" is encoded, unlike quote()'s default.
    encoded = urllib.parse.quote(value, safe="")
    return encoded


def build_server_info() -> dict[str, str]:
from device_smi import Device

os_info = Device("os")
cpu_model = Device("cpu").model
platform_name = (
os.environ.get("GPU_PLATFORM")
or cpu_model
)
platform_name = os.environ.get("GPU_PLATFORM") or cpu_model
return {
"platform": platform_name,
"arch": os_info.arch,
Expand Down
78 changes: 62 additions & 16 deletions .github/scripts/allocate_gpu.py → .github/scripts/ci_gpu.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from __future__ import annotations

import argparse
import subprocess
import sys
import time
import urllib.error

from common import (
from ci_common import (
append_github_env,
build_get_request,
build_job_request,
build_server_info,
extract_gpu_ids,
format_info_url,
Expand Down Expand Up @@ -43,21 +47,7 @@ def print_status(base_url: str, runner_name: str) -> None:
print(status)


def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument("--base-url", required=True)
parser.add_argument("--run-id", required=True)
parser.add_argument("--test", required=True)
parser.add_argument("--runner", required=True)
parser.add_argument("--count", required=True)
parser.add_argument("--sleep-sec", type=float, default=5)
parser.add_argument("--timeout-sec", type=int, default=18000)
parser.add_argument("--request-timeout", type=float, default=10)
parser.add_argument("--retries", type=int, default=3)
parser.add_argument("--retry-delay", type=float, default=1)
parser.add_argument("--require-single", action="store_true")
args = parser.parse_args()

def allocate_gpu(args: argparse.Namespace) -> int:
start_s = time.time()
endpoint = f"{normalize_base_url(args.base_url)}/get"

Expand Down Expand Up @@ -121,5 +111,61 @@ def main() -> int:
return 0


def release_gpu(args: argparse.Namespace) -> int:
    """Release a previously allocated GPU back to the scheduler.

    Deliberately best-effort: release runs as a CI cleanup step, so request
    failures and mismatched responses are printed but still return 0 rather
    than failing the job.
    """
    request_body = build_job_request(
        runner_name=args.runner,
        run_id=args.run_id,
        test_name=args.test,
    )
    url = f"{normalize_base_url(args.base_url)}/release"
    print(url)

    try:
        response = request_json(url, method="POST", body=request_body, timeout=args.timeout)
    except (urllib.error.URLError, TimeoutError, OSError, ValueError) as exc:
        print(f"Failed to release GPU: {exc}")
        return 0

    resp = extract_gpu_ids(response)
    print(f"response: {resp}")
    # "-1" is accepted as a valid "nothing held" answer from the scheduler.
    if args.gpu_id and resp not in {args.gpu_id, "-1"}:
        print(f"Error: response ({resp}) != expected ({args.gpu_id})")
        return 0
    # Fix: the success path previously fell off the end, returning None
    # despite the declared `-> int` return type.
    return 0


def main() -> int:
    """Parse CLI arguments and dispatch to the allocate/release handler.

    Returns the handler's exit status, propagated to sys.exit by the caller.
    """
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command", required=True)

    allocate = subparsers.add_parser("allocate")
    for required_flag in ("--base-url", "--run-id", "--test", "--runner", "--count"):
        allocate.add_argument(required_flag, required=True)
    allocate.add_argument("--sleep-sec", type=float, default=5)
    allocate.add_argument("--timeout-sec", type=int, default=18000)
    allocate.add_argument("--request-timeout", type=float, default=10)
    allocate.add_argument("--retries", type=int, default=3)
    allocate.add_argument("--retry-delay", type=float, default=1)
    allocate.add_argument("--require-single", action="store_true")

    release = subparsers.add_parser("release")
    release.add_argument("--base-url", required=True)
    release.add_argument("--run-id", required=True)
    release.add_argument("--gpu-id", default="")
    release.add_argument("--timestamp")
    release.add_argument("--test", required=True)
    release.add_argument("--runner", required=True)
    release.add_argument("--timeout", type=float, default=10)

    args = parser.parse_args()
    handlers = {
        "allocate": allocate_gpu,
        "release": release_gpu,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # Unreachable while the dict mirrors the registered subparsers.
        raise AssertionError(f"Unhandled command: {args.command}")
    return handler(args)


# Script entry point: the subcommand's return value becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
37 changes: 0 additions & 37 deletions .github/scripts/ci_loop_versions.py

This file was deleted.

14 changes: 14 additions & 0 deletions .github/scripts/ci_prepare_checkout.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Prepare the CI checkout: mark the workspace as a safe git directory and,
# when a PR number is supplied, fetch and check out that PR's head.
set -euo pipefail

pr="${1:-0}"

# Containers often run as a different uid than the checkout owner.
git config --global --add safe.directory "$(pwd)"

# No PR requested (missing arg or literal "0"): keep the default checkout.
if [[ -z "$pr" || "$pr" == "0" ]]; then
  exit 0
fi

echo "pr number $pr"
git fetch origin "pull/${pr}/head:pr-${pr}"
git checkout "pr-${pr}"
35 changes: 35 additions & 0 deletions .github/scripts/ci_restore_uv_cache.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Restore the uv package cache from a pre-built .tar.xz archive, skipping the
# slow decompression when the archive is unchanged since the last restore.
set -euo pipefail

archive="${1:-/opt/dist/uv.tar.xz}"
cache_dir="${2:-/opt/uv/cache}"
scratch="${cache_dir}/tmp"
stamp="${cache_dir}/lastmodified"

# A missing archive is not an error — e.g. a freshly provisioned runner.
if [[ ! -f "$archive" ]]; then
  echo "uv cache archive not found: $archive"
  exit 0
fi

# Compare the archive's mtime with the stamp written by the previous restore.
archive_mtime="$(stat -c %Y "$archive")"
previous_mtime="0"
if [[ -f "$stamp" ]]; then
  previous_mtime="$(<"$stamp")"
fi

if [[ "$archive_mtime" == "$previous_mtime" ]]; then
  echo "uv cache archive unchanged, skip decompress"
  exit 0
fi

# Extract into a scratch dir first so a failed extraction cannot leave the
# live cache half-written, then swap the extracted tree into place.
echo "decompressing $archive into $cache_dir..."
mkdir -p "$scratch"
rm -rf "${scratch:?}/"*
tar -xJf "$archive" -C "$scratch"
rm -rf "$cache_dir/uv"
mv "$scratch/uv" "$cache_dir/uv"
printf '%s\n' "$archive_mtime" > "$stamp"

ls -ahl "$cache_dir"
echo "=========="
ls -ahl "$cache_dir/uv"
Loading