diff --git a/.github/scripts/common.py b/.github/scripts/ci_common.py similarity index 79% rename from .github/scripts/common.py rename to .github/scripts/ci_common.py index 4b55f83..be473c8 100644 --- a/.github/scripts/common.py +++ b/.github/scripts/ci_common.py @@ -1,14 +1,16 @@ +from __future__ import annotations + +import json import os import re import subprocess import time -import urllib.parse import urllib.error +import urllib.parse import urllib.request -import json from pathlib import Path - +# Test files with this marker are intentionally scheduled on CPU-only runners. GPU_DISABLED_MARKER = re.compile(r"^# GPU=-1\s*$", re.MULTILINE) @@ -16,31 +18,6 @@ def now_ms() -> int: return time.time_ns() // 1_000_000 -def fetch_text(url: str, *, timeout: float, suppress_error: bool = False) -> str: - try: - with urllib.request.urlopen(url, timeout=timeout) as response: - return response.read().decode("utf-8", errors="replace") - except (urllib.error.URLError, TimeoutError, OSError) as exc: - if suppress_error: - print(f"Request failed for {url}: {exc}") - return "" - raise - - -def fetch_with_retry(url: str, *, timeout: float, retries: int, retry_delay: float) -> str: - last_error: Exception | None = None - for attempt in range(retries + 1): - try: - return fetch_text(url, timeout=timeout) - except (urllib.error.URLError, TimeoutError, OSError) as exc: - last_error = exc - if attempt < retries: - time.sleep(retry_delay) - if last_error is not None: - print(f"Request failed after retries: {last_error}") - return "" - - def normalize_base_url(base_url: str) -> str: return base_url.rstrip("/") @@ -89,14 +66,14 @@ def request_json_with_retry( def append_github_env(name: str, value: str) -> None: - _append_github_file(os.environ.get("GITHUB_ENV"), name, value) + append_github_file(os.environ.get("GITHUB_ENV"), name, value) def append_github_output(name: str, value: str) -> None: - _append_github_file(os.environ.get("GITHUB_OUTPUT"), name, value) + append_github_file(os.environ.get("GITHUB_OUTPUT"), name, value) -def _append_github_file(target: str | None, name: str, value: str) -> None: +def append_github_file(target: str | None, name: str, value: str) -> None: if not target: return with open(target, "a", encoding="utf-8") as fh: @@ -126,18 +103,12 @@ def test_requires_gpu(test_file: str) -> bool: return GPU_DISABLED_MARKER.search(contents) is None -def quote_url_value(value: str) -> str: - return urllib.parse.quote(value, safe="") - - def build_server_info() -> dict[str, str]: from device_smi import Device + os_info = Device("os") cpu_model = Device("cpu").model - platform_name = ( - os.environ.get("GPU_PLATFORM") - or cpu_model - ) + platform_name = os.environ.get("GPU_PLATFORM") or cpu_model return { "platform": platform_name, "arch": os_info.arch, diff --git a/.github/scripts/allocate_gpu.py b/.github/scripts/ci_gpu.py similarity index 57% rename from .github/scripts/allocate_gpu.py rename to .github/scripts/ci_gpu.py index 0ea1929..7d85049 100644 --- a/.github/scripts/allocate_gpu.py +++ b/.github/scripts/ci_gpu.py @@ -1,11 +1,15 @@ +from __future__ import annotations + import argparse import subprocess import sys import time +import urllib.error -from common import ( +from ci_common import ( append_github_env, build_get_request, + build_job_request, build_server_info, extract_gpu_ids, format_info_url, @@ -43,21 +47,7 @@ def print_status(base_url: str, runner_name: str) -> None: print(status) -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", required=True) - parser.add_argument("--run-id", required=True) - parser.add_argument("--test", required=True) - parser.add_argument("--runner", required=True) - parser.add_argument("--count", required=True) - parser.add_argument("--sleep-sec", type=float, default=5) - parser.add_argument("--timeout-sec", type=int, default=18000) - parser.add_argument("--request-timeout", type=float, default=10) - parser.add_argument("--retries", type=int, default=3) - parser.add_argument("--retry-delay", type=float, default=1) - parser.add_argument("--require-single", action="store_true") - args = parser.parse_args() - +def allocate_gpu(args: argparse.Namespace) -> int: start_s = time.time() endpoint = f"{normalize_base_url(args.base_url)}/get" @@ -121,5 +111,61 @@ def main() -> int: return 0 +def release_gpu(args: argparse.Namespace) -> int: + request_body = build_job_request( + runner_name=args.runner, + run_id=args.run_id, + test_name=args.test, + ) + url = f"{normalize_base_url(args.base_url)}/release" + print(url) + + try: + response = request_json(url, method="POST", body=request_body, timeout=args.timeout) + except (urllib.error.URLError, TimeoutError, OSError, ValueError) as exc: + print(f"Failed to release GPU: {exc}") + return 0 + + resp = extract_gpu_ids(response) + print(f"response: {resp}") + if args.gpu_id and resp not in {args.gpu_id, "-1"}: + print(f"Error: response ({resp}) != expected ({args.gpu_id})") + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="command", required=True) + + allocate_parser = subparsers.add_parser("allocate") + allocate_parser.add_argument("--base-url", required=True) + allocate_parser.add_argument("--run-id", required=True) + allocate_parser.add_argument("--test", required=True) + allocate_parser.add_argument("--runner", required=True) + allocate_parser.add_argument("--count", required=True) + allocate_parser.add_argument("--sleep-sec", type=float, default=5) + allocate_parser.add_argument("--timeout-sec", type=int, default=18000) + allocate_parser.add_argument("--request-timeout", type=float, default=10) + allocate_parser.add_argument("--retries", type=int, default=3) + allocate_parser.add_argument("--retry-delay", type=float, default=1) + allocate_parser.add_argument("--require-single", action="store_true") + + release_parser = subparsers.add_parser("release") + release_parser.add_argument("--base-url", required=True) + release_parser.add_argument("--run-id", required=True) + release_parser.add_argument("--gpu-id", default="") + release_parser.add_argument("--timestamp") + release_parser.add_argument("--test", required=True) + release_parser.add_argument("--runner", required=True) + release_parser.add_argument("--timeout", type=float, default=10) + + args = parser.parse_args() + if args.command == "allocate": + return allocate_gpu(args) + if args.command == "release": + return release_gpu(args) + raise AssertionError(f"Unhandled command: {args.command}") + + if __name__ == "__main__": sys.exit(main()) diff --git a/.github/scripts/ci_loop_versions.py b/.github/scripts/ci_loop_versions.py deleted file mode 100644 index c65b14d..0000000 --- a/.github/scripts/ci_loop_versions.py +++ /dev/null @@ -1,37 +0,0 @@ -import argparse -import json - -import requests -from packaging.specifiers import SpecifierSet -from packaging.version import Version - - -def get_versions(package: str, version_spec: str) -> list[str]: - specifier = SpecifierSet(version_spec) - - url = f"https://pypi.org/pypi/{package}/json" - resp = requests.get(url, timeout=30) - resp.raise_for_status() - data = resp.json() - - all_versions = data["releases"].keys() - - matched = sorted( - (Version(v) for v in all_versions if Version(v) in specifier), - reverse=True, - ) - return [str(v) for v in matched] - - -def main(): - parser = argparse.ArgumentParser(description="List matching PyPI versions as JSON") - parser.add_argument("package", help="package name, e.g. setuptools") - parser.add_argument("version", help='version spec, e.g. ">=77.0.1,<83"') - args = parser.parse_args() - - versions = get_versions(args.package, args.version) - print(json.dumps(versions)) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/ci_prepare_checkout.sh b/.github/scripts/ci_prepare_checkout.sh new file mode 100644 index 0000000..2062f99 --- /dev/null +++ b/.github/scripts/ci_prepare_checkout.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +pr_number="${1:-0}" + +git config --global --add safe.directory "$(pwd)" + +if [[ -z "$pr_number" || "$pr_number" == "0" ]]; then + exit 0 +fi + +echo "pr number $pr_number" +git fetch origin "pull/${pr_number}/head:pr-${pr_number}" +git checkout "pr-${pr_number}" diff --git a/.github/scripts/ci_restore_uv_cache.sh b/.github/scripts/ci_restore_uv_cache.sh new file mode 100644 index 0000000..3e12d3e --- /dev/null +++ b/.github/scripts/ci_restore_uv_cache.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -euo pipefail + +tar_file="${1:-/opt/dist/uv.tar.xz}" +cache_dir="${2:-/opt/uv/cache}" +tmp_dir="${cache_dir}/tmp" +last_file="${cache_dir}/lastmodified" + +if [[ ! -f "$tar_file" ]]; then + echo "uv cache archive not found: $tar_file" + exit 0 +fi + +tar_mtime="$(stat -c %Y "$tar_file")" +last_mtime="0" +if [[ -f "$last_file" ]]; then + last_mtime="$(<"$last_file")" +fi + +if [[ "$tar_mtime" == "$last_mtime" ]]; then + echo "uv cache archive unchanged, skip decompress" + exit 0 +fi + +echo "decompressing $tar_file into $cache_dir..." +mkdir -p "$tmp_dir" +rm -rf "${tmp_dir:?}/"* +tar -xJf "$tar_file" -C "$tmp_dir" +rm -rf "$cache_dir/uv" +mv "$tmp_dir/uv" "$cache_dir/uv" +printf '%s\n' "$tar_mtime" > "$last_file" + +ls -ahl "$cache_dir" +echo "==========" +ls -ahl "$cache_dir/uv" diff --git a/.github/scripts/run_tests.py b/.github/scripts/ci_tests.py similarity index 50% rename from .github/scripts/run_tests.py rename to .github/scripts/ci_tests.py index 19c89bf..e3c6be9 100644 --- a/.github/scripts/run_tests.py +++ b/.github/scripts/ci_tests.py @@ -1,4 +1,7 @@ +from __future__ import annotations + import argparse +import json import os import shutil import signal @@ -7,17 +10,129 @@ import threading import time import urllib.error +from dataclasses import asdict, dataclass from pathlib import Path -from common import ( + +from ci_common import ( append_github_env, + append_github_output, build_job_request, extract_gpu_ids, normalize_base_url, normalize_test_file, request_json, + test_requires_gpu, to_safe_name, ) +# GPTQModel engine tests need a pinned CPU wheel because upstream CPU artifacts are version-sensitive. +TORCHAO_CPU_WHEEL = ( + "https://download.pytorch.org/whl/cpu/" + "torchao-0.17.0%2Bcpu-py3-none-any.whl" + "#sha256=6c0ce8b506c72be4efb1f0c6fd1679cb58145efebb20d51ac1adf7a7b3ebb872" +) +# CI uses a non-zero exit rewrite so GitHub summaries clearly distinguish test process failures. +ERROR_EXIT_CODE = 22 + + +# Unit test metadata is computed once per matrix entry and exported to GitHub env/output files. +@dataclass(frozen=True) +class UnitTestConfig: + """Describe the Python/runtime settings needed by one test file.""" + + test_file: str + safe_name: str + requires_gpu: bool + python_version: str + uv_python: str + + +def resolve_unit_test_config(test_file: str) -> UnitTestConfig: + normalized = normalize_test_file(test_file) + python_version = "3.14t" + uv_python = "3.14t" + + if normalized == "tests/test_tensorrt_llm_engine.py": + python_version = "3.12" + uv_python = "3.12" + + return UnitTestConfig( + test_file=normalized, + safe_name=to_safe_name(normalized), + requires_gpu=test_requires_gpu(normalized), + python_version=python_version, + uv_python=uv_python, + ) + + +def export_unit_test_metadata(test_file: str) -> None: + config = resolve_unit_test_config(test_file) + + append_github_env("SAFE_NAME", config.safe_name) + append_github_env("TEST_REQUIRES_GPU", str(config.requires_gpu).lower()) + append_github_env("PYTHON_VERSION", config.python_version) + append_github_env("UV_PYTHON", config.uv_python) + + append_github_output("safe-name", config.safe_name) + append_github_output("requires-gpu", str(config.requires_gpu).lower()) + append_github_output("python-version", config.python_version) + append_github_output("uv-python", config.uv_python) + + print(json.dumps(asdict(config), ensure_ascii=False, indent=2)) + + +def run(cmd: list[str]) -> None: + print(f"+ {' '.join(cmd)}") + subprocess.check_call(cmd) + + +def uv_install(*packages: str, upgrade: bool = False) -> None: + if not packages: + return + cmd = ["uv", "pip", "install"] + if upgrade: + cmd.append("-U") + cmd.extend(packages) + run(cmd) + + +def install_flash_attn(uv_python: str, runner: str) -> None: + if uv_python == "3.14t": + uv_install(f"http://{runner}/files/flash_attn/flash_attn-2.8.4-cp314-cp314t-linux_x86_64.whl") + return + + if uv_python == "3.12": + append_github_env("EVALUTION_SKIP_GIL_CHECK", "1") + uv_install("tensorrt_llm", upgrade=True) + uv_install(f"http://{runner}/files/flash_attn/flash_attn-2.8.4-cp312-cp312-linux_x86_64.whl") + return + + uv_install("flash-attn") + run(["uv", "pip", "show", "flash-attn"]) + + +def install_test_specific_deps(test_file: str) -> None: + if test_file != "tests/test_gptqmodel_engine.py": + return + + uv_install("accelerate", upgrade=True) + uv_install(TORCHAO_CPU_WHEEL, upgrade=True) + + print("== installing gptqmodel ==") + uv_install("gptqmodel", upgrade=True) + run(["uv", "pip", "show", "gptqmodel"]) + + +def install_test_deps(test_file: str, runner: str, uv_python: str, install_project: bool) -> None: + normalized = normalize_test_file(test_file) + + if install_project: + uv_install(".") + uv_install("pytest", "datasets", "rouge_score", "sglang", "pybase64", upgrade=True) + + install_flash_attn(uv_python, runner) + install_test_specific_deps(normalized) + def kill_process_group(proc: subprocess.Popen[str]) -> None: try: @@ -96,18 +211,7 @@ def log_python_and_pytest_resolution() -> None: print(f"pytest shebang={first_line}") -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", required=True) - parser.add_argument("--run-id", required=True) - parser.add_argument("--test-file", required=True) - parser.add_argument("--runner", required=True) - parser.add_argument("--gpu-id", default="") - parser.add_argument("--monitor-interval-sec", type=int, default=60) - parser.add_argument("--artifacts-dir", default="artifacts") - parser.add_argument("--clear-cuda", action="store_true") - args = parser.parse_args() - +def run_test(args: argparse.Namespace) -> int: env = os.environ.copy() if args.clear_cuda: env["CUDA_VISIBLE_DEVICES"] = "" @@ -172,13 +276,13 @@ def main() -> int: monitor_thread.join(timeout=5) if monitor_state["forced_exit_code"]: - append_github_env("ERROR", "22") - return 22 + append_github_env("ERROR", str(ERROR_EXIT_CODE)) + return ERROR_EXIT_CODE if return_code != 0: - append_github_env("ERROR", "22") + append_github_env("ERROR", str(ERROR_EXIT_CODE)) print(f"pipe status wrong: {return_code}") - return 22 + return ERROR_EXIT_CODE execution_time = int(time.time() - start_time) print(f"{execution_time // 60}m {execution_time % 60}s") @@ -193,5 +297,40 @@ def main() -> int: return 0 +def main() -> int: + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="command", required=True) + + set_metadata_parser = subparsers.add_parser("set-metadata") + set_metadata_parser.add_argument("--test-file", required=True) + + install_deps_parser = subparsers.add_parser("install-deps") + install_deps_parser.add_argument("--test-file", required=True) + install_deps_parser.add_argument("--runner", default=os.environ.get("RUNNER", "10.0.13.31")) + install_deps_parser.add_argument("--uv-python", default=os.environ.get("UV_PYTHON", "")) + install_deps_parser.add_argument("--install-project", action="store_true") + + run_parser = subparsers.add_parser("run") + run_parser.add_argument("--base-url", required=True) + run_parser.add_argument("--run-id", required=True) + run_parser.add_argument("--test-file", required=True) + run_parser.add_argument("--runner", required=True) + run_parser.add_argument("--gpu-id", default="") + run_parser.add_argument("--monitor-interval-sec", type=int, default=60) + run_parser.add_argument("--artifacts-dir", default="artifacts") + run_parser.add_argument("--clear-cuda", action="store_true") + + args = parser.parse_args() + if args.command == "set-metadata": + export_unit_test_metadata(args.test_file) + return 0 + if args.command == "install-deps": + install_test_deps(args.test_file, args.runner, args.uv_python, args.install_project) + return 0 + if args.command == "run": + return run_test(args) + raise AssertionError(f"Unhandled command: {args.command}") + + if __name__ == "__main__": sys.exit(main()) diff --git a/.github/scripts/ci_workflow.py b/.github/scripts/ci_workflow.py new file mode 100644 index 0000000..8767cb8 --- /dev/null +++ b/.github/scripts/ci_workflow.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +import argparse +import json +import re +from dataclasses import dataclass +from pathlib import Path + +from ci_common import normalize_test_file, test_requires_gpu + + +# Keep the reusable workflow matrix payload constrained to the single field the runner needs. +@dataclass(frozen=True) +class TestMatrixEntry: + """Represent one test case entry consumed by the reusable unit-test workflow.""" + + test_file: str + + def as_dict(self) -> dict[str, str]: + return {"test_file": self.test_file} + + +# Test discovery only schedules pytest files from the repo's test tree. +TEST_FILE_GLOB = "test_*.py" + + +def compile_regex(pattern: str) -> re.Pattern[str] | None: + if not pattern: + return None + return re.compile(pattern) + + +def normalize_rel_test_path(path: Path) -> str: + return normalize_test_file(path.as_posix()) + + +def matches_test_regex(compiled: re.Pattern[str] | None, rel_path: str) -> bool: + if compiled is None: + return True + candidates = { + rel_path, + rel_path.removeprefix("tests/"), + Path(rel_path).name, + Path(rel_path).stem, + } + return any(compiled.search(candidate) for candidate in candidates) + + +def is_model_test(rel_path: str) -> bool: + return rel_path.startswith("tests/models/") + + +def sort_key(rel_path: str) -> tuple[int, str]: + return (len(Path(rel_path).parts), rel_path) + + +def list_tests( + *, + tests_root: str | Path, + test_regex: str, +) -> tuple[list[str], list[str], list[str]]: + root = Path(tests_root) + compiled_regex = compile_regex(test_regex) + + cpu_tests: list[str] = [] + torch_tests: list[str] = [] + model_tests: list[str] = [] + + for path in sorted(root.rglob(TEST_FILE_GLOB)): + rel_path = normalize_rel_test_path(path) + if not matches_test_regex(compiled_regex, rel_path): + continue + if is_model_test(rel_path): + model_tests.append(rel_path) + continue + if test_requires_gpu(rel_path): + torch_tests.append(rel_path) + continue + cpu_tests.append(rel_path) + + return ( + sorted(cpu_tests, key=sort_key), + sorted(torch_tests, key=sort_key), + sorted(model_tests, key=sort_key), + ) + + +def build_group_matrix(tests: list[str]) -> list[dict[str, str]]: + return [TestMatrixEntry(test_file=test_file).as_dict() for test_file in tests] + + +def build_test_plan(*, tests_root: str | Path, test_regex: str) -> dict[str, list[dict[str, str]] | list[str]]: + cpu_tests, torch_tests, model_tests = list_tests( + tests_root=tests_root, + test_regex=test_regex, + ) + return { + "cpu_files": cpu_tests, + "torch_files": torch_tests, + "model_files": model_tests, + "cpu_matrix": build_group_matrix(cpu_tests), + "torch_matrix": build_group_matrix(torch_tests), + "model_matrix": build_group_matrix(model_tests), + } + + +def list_matching_versions(package: str, version_spec: str) -> list[str]: + # Defer optional dependencies so local test discovery can run on the stock runner image. + import requests + from packaging.specifiers import SpecifierSet + from packaging.version import Version + + specifier = SpecifierSet(version_spec) + response = requests.get(f"https://pypi.org/pypi/{package}/json", timeout=30) + response.raise_for_status() + data = response.json() + matched = sorted( + (Version(version) for version in data["releases"].keys() if Version(version) in specifier), + reverse=True, + ) + return [str(version) for version in matched] + + +def cmd_list_tests(args: argparse.Namespace) -> int: + print( + json.dumps( + build_test_plan( + tests_root=args.tests_root, + test_regex=args.test_regex, + ) + ) + ) + return 0 + + +def cmd_loop_versions(args: argparse.Namespace) -> int: + print(json.dumps(list_matching_versions(args.package, args.version))) + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="command", required=True) + + list_parser = subparsers.add_parser("list-tests") + list_parser.add_argument("--tests-root", default="tests") + list_parser.add_argument("--test-regex", default="") + + loop_versions_parser = subparsers.add_parser("loop-versions") + loop_versions_parser.add_argument("package") + loop_versions_parser.add_argument("version") + + args = parser.parse_args() + if args.command == "list-tests": + return cmd_list_tests(args) + if args.command == "loop-versions": + return cmd_loop_versions(args) + raise AssertionError(f"Unhandled command: {args.command}") + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.github/scripts/ci_write_runner_outputs.sh b/.github/scripts/ci_write_runner_outputs.sh new file mode 100644 index 0000000..4995555 --- /dev/null +++ b/.github/scripts/ci_write_runner_outputs.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +set -euo pipefail + +runner_ip="${1:?runner ip is required}" +github_run_id="${2:?github run id is required}" +artifact_id="${3:-}" +max_parallel="${4:-}" + +if [[ -z "${GITHUB_OUTPUT:-}" ]]; then + echo "GITHUB_OUTPUT is required" >&2 + exit 1 +fi + +run_id="$github_run_id" +if [[ -n "$artifact_id" ]]; then + run_id="$artifact_id" +fi + +echo "ip=$runner_ip" >> "$GITHUB_OUTPUT" +echo "ip: $runner_ip" +echo "run_id=$run_id" >> "$GITHUB_OUTPUT" +echo "artifact_id=$run_id" + +if [[ -n "$max_parallel" ]]; then + max_parallel_json="{\"size\": ${max_parallel:-20}}" + echo "max-parallel=$max_parallel_json" >> "$GITHUB_OUTPUT" + echo "max-parallel=$max_parallel_json" +fi diff --git a/.github/scripts/install_unit_test_deps.py b/.github/scripts/install_unit_test_deps.py deleted file mode 100644 index 803566a..0000000 --- a/.github/scripts/install_unit_test_deps.py +++ /dev/null @@ -1,76 +0,0 @@ -import argparse -import os -import subprocess - -from common import append_github_env, normalize_test_file - - -TORCHAO_CPU_WHEEL = ( - "https://download.pytorch.org/whl/cpu/" - "torchao-0.17.0%2Bcpu-py3-none-any.whl" - "#sha256=6c0ce8b506c72be4efb1f0c6fd1679cb58145efebb20d51ac1adf7a7b3ebb872" -) - - -def run(cmd: list[str]) -> None: - print(f"+ {' '.join(cmd)}") - subprocess.check_call(cmd) - - -def uv_install(*packages: str, upgrade: bool = False) -> None: - if not packages: - return - cmd = ["uv", "pip", "install"] - if upgrade: - cmd.append("-U") - cmd.extend(packages) - run(cmd) - - -def install_flash_attn(uv_python: str, runner: str) -> None: - if uv_python == "3.14t": - uv_install(f"http://{runner}/files/flash_attn/flash_attn-2.8.4-cp314-cp314t-linux_x86_64.whl") - return - - if uv_python == "3.12": - append_github_env("EVALUTION_SKIP_GIL_CHECK", "1") - uv_install("tensorrt_llm", upgrade=True) - uv_install(f"http://{runner}/files/flash_attn/flash_attn-2.8.4-cp312-cp312-linux_x86_64.whl") - return - - uv_install("flash-attn") - run(["uv", "pip", "show", "flash-attn"]) - - -def install_test_specific_deps(test_file: str) -> None: - if test_file != "tests/test_gptqmodel_engine.py": - return - - uv_install("accelerate", upgrade=True) - uv_install(TORCHAO_CPU_WHEEL, upgrade=True) - - print("== installing gptqmodel ==") - uv_install("gptqmodel", upgrade=True) - run(["uv", "pip", "show", "gptqmodel"]) - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--test-file", required=True) - parser.add_argument("--runner", default=os.environ.get("RUNNER", "10.0.13.31")) - parser.add_argument("--uv-python", default=os.environ.get("UV_PYTHON", "")) - parser.add_argument("--install-project", action="store_true") - args = parser.parse_args() - - test_file = normalize_test_file(args.test_file) - - if args.install_project: - uv_install(".") - uv_install("pytest", "datasets", "rouge_score", "sglang", "pybase64", upgrade=True) - - install_flash_attn(args.uv_python, args.runner) - install_test_specific_deps(test_file) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/list_test_files.py b/.github/scripts/list_test_files.py deleted file mode 100644 index 3715ffe..0000000 --- a/.github/scripts/list_test_files.py +++ /dev/null @@ -1,58 +0,0 @@ -import argparse -import json -import re -from pathlib import Path - -from common import normalize_test_file - - -def sort_key(path: Path, root: Path) -> tuple[int, str]: - rel = path.relative_to(root) - return (len(rel.parts), path.as_posix()) - - -def list_test_files(tests_root: str = "tests", test_regex: str = "") -> list[str]: - root = Path(tests_root) - regex = re.compile(test_regex) if test_regex else None - files: list[str] = [] - for path in sorted(root.rglob("test_*.py"), key=lambda item: sort_key(item, root)): - rel = normalize_test_file(path.as_posix()) - if regex and not regex.search(rel): - continue - files.append(rel) - return files - - -def split_evenly(files: list[str], group_count: int) -> list[list[str]]: - if group_count <= 0: - raise ValueError("group_count must be greater than 0") - - base_size, remainder = divmod(len(files), group_count) - groups: list[list[str]] = [] - start = 0 - for index in range(group_count): - size = base_size + (1 if index < remainder else 0) - end = start + size - groups.append(files[start:end]) - start = end - return groups - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--tests-root", default="tests") - parser.add_argument("--test-regex", default="") - parser.add_argument("--group-count", type=int, default=1) - args = parser.parse_args() - - files = list_test_files(args.tests_root, args.test_regex) - if args.group_count == 1: - print(json.dumps(files, ensure_ascii=False)) - return - - for group in split_evenly(files, args.group_count): - print(json.dumps(group, ensure_ascii=False)) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/prepare_unit_test.py b/.github/scripts/prepare_unit_test.py deleted file mode 100644 index 9596b78..0000000 --- a/.github/scripts/prepare_unit_test.py +++ /dev/null @@ -1,29 +0,0 @@ -import argparse -import json - -from common import append_github_env, append_github_output -from unit_test_config import resolve_unit_test_config - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--test-file", required=True) - args = parser.parse_args() - - config = resolve_unit_test_config(args.test_file) - - append_github_env("SAFE_NAME", config.safe_name) - append_github_env("TEST_REQUIRES_GPU", str(config.requires_gpu).lower()) - append_github_env("PYTHON_VERSION", config.python_version) - append_github_env("UV_PYTHON", config.uv_python) - - append_github_output("safe-name", config.safe_name) - append_github_output("requires-gpu", str(config.requires_gpu).lower()) - append_github_output("python-version", config.python_version) - append_github_output("uv-python", config.uv_python) - - print(json.dumps(config.__dict__, ensure_ascii=False, indent=2)) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/release_gpu.py b/.github/scripts/release_gpu.py deleted file mode 100644 index 2cf6a0b..0000000 --- a/.github/scripts/release_gpu.py +++ /dev/null @@ -1,41 +0,0 @@ -import argparse -import sys -import urllib.error - -from common import build_job_request, extract_gpu_ids, normalize_base_url, request_json - - -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", required=True) - parser.add_argument("--run-id", required=True) - parser.add_argument("--gpu-id", default="") - parser.add_argument("--timestamp") - parser.add_argument("--test", required=True) - parser.add_argument("--runner", required=True) - parser.add_argument("--timeout", type=float, default=10) - args = parser.parse_args() - - request_body = build_job_request( - runner_name=args.runner, - run_id=args.run_id, - test_name=args.test, - ) - url = f"{normalize_base_url(args.base_url)}/release" - print(url) - - try: - response = request_json(url, method="POST", body=request_body, timeout=args.timeout) - except (urllib.error.URLError, TimeoutError, OSError, ValueError) as exc: - print(f"Failed to release GPU: {exc}") - return 0 - - resp = extract_gpu_ids(response) - print(f"response: {resp}") - if args.gpu_id and resp not in {args.gpu_id, "-1"}: - print(f"Error: response ({resp}) != expected ({args.gpu_id})") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.github/scripts/unit_test_config.py b/.github/scripts/unit_test_config.py deleted file mode 100644 index 63b4f04..0000000 --- a/.github/scripts/unit_test_config.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import annotations - -from dataclasses import asdict, dataclass - -from common import normalize_test_file, test_requires_gpu, to_safe_name - - -@dataclass(frozen=True) -class UnitTestConfig: - test_file: str - safe_name: str - requires_gpu: bool - python_version: str - uv_python: str - - -def resolve_unit_test_config(test_file: str) -> UnitTestConfig: - normalized = normalize_test_file(test_file) - python_version = "3.14t" - uv_python = "3.14t" - - if normalized == "tests/test_tensorrt_llm_engine.py": - python_version = "3.12" - uv_python = "3.12" - - return UnitTestConfig( - test_file=normalized, - safe_name=to_safe_name(normalized), - requires_gpu=test_requires_gpu(normalized), - python_version=python_version, - uv_python=uv_python, - ) - - -def resolve_unit_test_config_dict(test_file: str) -> dict[str, str | bool]: - return asdict(resolve_unit_test_config(test_file)) diff --git a/.github/workflows/compatibility.yml b/.github/workflows/compatibility.yml deleted file mode 100644 index 882ea64..0000000 --- a/.github/workflows/compatibility.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: Test Compatibility - -on: - push: - paths: - - pyproject.toml - workflow_dispatch: - -permissions: - contents: read - -jobs: - - prepare-setuptools: - runs-on: ubuntu-latest - outputs: - versions: ${{ steps.parser.outputs.versions || '[]' }} - steps: - - uses: actions/checkout@v6 - - uses: actions/setup-python@v6 - with: - python-version: "3.14t" - - - name: Generate version matrix - id: parser - run: | - python -m pip install --upgrade requests packaging - versions=$(python .github/scripts/ci_loop_versions.py setuptools ">=77.0.1,<83") - echo "versions=$versions" >> "$GITHUB_OUTPUT" - - check-setuptools: - needs: prepare-setuptools - if: needs.prepare-setuptools.outputs.versions != '[]' - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - version: ${{ fromJSON(needs.prepare-setuptools.outputs.versions) }} - - steps: - - uses: actions/checkout@v6 - - uses: actions/setup-python@v6 - with: - python-version: "3.14t" - cache: pip - - - name: Install package with selected setuptools - run: | - python -m pip install --upgrade pip - python -m pip install . "setuptools==${{ matrix.version }}" - - - name: Show versions - run: | - python --version - python -m pip show setuptools \ No newline at end of file diff --git a/.github/workflows/setuptools_compatibility_reusable.yml b/.github/workflows/setuptools_compatibility_reusable.yml new file mode 100644 index 0000000..db4722d --- /dev/null +++ b/.github/workflows/setuptools_compatibility_reusable.yml @@ -0,0 +1,82 @@ +name: Setuptools Compatibility Reusable + +on: + workflow_call: + inputs: + repo: + description: "GitHub repo {owner}/{repo}" + required: false + default: "" + type: string + ref: + description: "GitHub ref: Branch, Tag or Commit SHA" + required: false + default: "" + type: string + pr_number: + description: "PR Number" + required: false + default: 0 + type: number + +permissions: + contents: read + +jobs: + prepare-setuptools: + runs-on: ubuntu-latest + outputs: + versions: ${{ steps.parser.outputs.versions || '[]' }} + steps: + - uses: actions/checkout@v6 + with: + repository: ${{ inputs.repo || github.repository }} + ref: ${{ inputs.ref || github.ref }} + + - name: Prepare checkout + run: | + bash .github/scripts/ci_prepare_checkout.sh "${{ inputs.pr_number }}" + + - uses: actions/setup-python@v6 + with: + python-version: "3.14" + + - name: Generate version matrix + id: parser + run: | + python -m pip install --upgrade requests packaging + versions=$(python .github/scripts/ci_workflow.py loop-versions setuptools ">=77.0.1,<83") + echo "versions=$versions" >> "$GITHUB_OUTPUT" + + check-setuptools: + needs: prepare-setuptools + if: needs.prepare-setuptools.outputs.versions != '[]' + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + version: ${{ fromJSON(needs.prepare-setuptools.outputs.versions) }} + steps: + - uses: actions/checkout@v6 + with: + repository: ${{ inputs.repo || github.repository }} + ref: ${{ inputs.ref || github.ref }} + + - name: Prepare checkout + run: | + bash .github/scripts/ci_prepare_checkout.sh "${{ inputs.pr_number }}" + + - uses: actions/setup-python@v6 + with: + python-version: "3.14" + cache: pip + + - name: Install package with selected setuptools + run: | + python -m pip install --upgrade pip + python -m pip install . "setuptools==${{ matrix.version }}" + + - name: Show versions + run: | + python --version + python -m pip show setuptools diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index eb1392f..c9ababc 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -1,5 +1,7 @@ name: Unit Tests +run-name: "${{ github.event.inputs.title || github.workflow }}" + defaults: run: shell: bash -le {0} @@ -8,481 +10,194 @@ on: repository_dispatch: workflow_dispatch: inputs: + title: + description: "Set a title for this run" + required: false + default: "" + repo: + description: "GitHub repo {owner}/{repo}" + required: false + default: "" ref: description: "GitHub ref: Branch, Tag or Commit SHA" required: false default: "" + pr_number: + description: "PR Number" + required: false + type: number test_regex: description: "Regex to filter test files" required: false default: "" + artifact_id: + description: "Run id for artifact lookup" + required: false + default: "" max-parallel: description: "Parallel jobs" required: false default: "4" env: + repo: ${{ github.event.inputs.repo || github.repository }} ref: ${{ github.event.inputs.ref || github.ref }} CUDA_DEVICE_ORDER: PCI_BUS_ID CUDA_VERSION: 131 + # The runtime image tag tracks the current CUDA base image kept on the CI registry. + CUDA_IMAGE_VERSION: 132 UV_TORCH_BACKEND: cu130 TORCH_VERSION: 2.11.0 PYTHON_VERSION: 3.14t UV_PYTHON: 3.14t PYTHON_GIL: 0 RUNNER: 10.0.13.31 - HF_TOKEN: ${{ secrets.HF_TOKEN }} BASE_URL: http://10.0.13.31/gpu + LOGBAR_ANIMATION: "0" + HF_TOKEN: ${{ secrets.HF_TOKEN }} concurrency: group: ${{ github.event.inputs.ref || github.ref }}-workflow-unit-tests cancel-in-progress: true +permissions: + contents: read + jobs: - list-test-files: + check-vm: runs-on: ubuntu-latest outputs: - files-group1: ${{ steps.files.outputs.files-group1 }} - files-group2: ${{ steps.files.outputs.files-group2 }} - max-parallel: ${{ steps.config.outputs.max-parallel }} + ip: ${{ steps.get_ip.outputs.ip }} + run_id: ${{ steps.get_ip.outputs.run_id }} + max-parallel: ${{ steps.get_ip.outputs['max-parallel'] }} + cuda_version: ${{ env.CUDA_IMAGE_VERSION }} steps: - name: Checkout Code uses: actions/checkout@v6 with: + repository: ${{ env.repo }} ref: ${{ env.ref }} - - uses: actions/setup-python@v6 - with: - python-version: "3.14t" - cache: pip - - - name: Set matrix config - id: config - run: | - echo "max-parallel=${{ github.event.inputs['max-parallel'] || '8' }}" >> "$GITHUB_OUTPUT" - - - name: List test files - id: files - run: | - python -m pip install --upgrade pip - python -m pip install device_smi - - files=$(python3 .github/scripts/list_test_files.py \ - --test-regex "${{ github.event.inputs.test_regex || '' }}" \ - --group-count=2) - - mapfile -t file_groups <<< "$files" - files_group1="${file_groups[0]}" - files_group2="${file_groups[1]}" - - echo "files-group1=$files_group1" >> "$GITHUB_OUTPUT" - echo "files-group2=$files_group2" >> "$GITHUB_OUTPUT" - - echo "Group1: $files_group1" - echo "Group2: $files_group2" - - test: - needs: list-test-files - if: needs.list-test-files.outputs.files-group1 != '' && needs.list-test-files.outputs.files-group1 != '[]' - strategy: - fail-fast: false - max-parallel: ${{ fromJSON(needs.list-test-files.outputs.max-parallel) }} - matrix: - test-file: ${{ fromJSON(needs.list-test-files.outputs.files-group1) }} - runs-on: [ self-hosted, xeon5 ] - container: - image: 10.0.13.31:5000/nvidia/cuda:132-ubuntu24.04_0325 - volumes: - - /monster/ci/env/entrypoint.sh:/entrypoint.sh - - /monster/ci/env/entrypoint.sh:/etc/profile.d/01-entrypoint.sh - - /dev/dri/by-path:/dev/dri/by-path - - /monster/ci/models:/monster/data/model - - /monster/ci/dataset:/monster/data/model/dataset - - /monster/ci/huggingface:/github/home/.cache/huggingface - - /github/workspace/uv:/opt/uv - - /github/workspace/tmp:/opt/uv/tmp - - /monster/ci/uv/python:/opt/uv/python - - /monster/ci/uv/cache/python:/opt/uv/cache/python - - /monster/ci/uv/setup_uv_venv.sh:/opt/uv/setup_uv_venv.sh - - /monster/ci/uv/uv:/opt/uv/uv - - /monster/ci/uv/uvx:/opt/uv/uvx - - /monster/ci/uv/env:/opt/uv/env - - /monster/ci/uv/uv.toml:/opt/uv/uv.toml - - /monster/ci/env:/opt/env - - /monster/ci/dist:/opt/dist - steps: - - name: Checkout Code - uses: actions/checkout@v6 - with: - ref: ${{ env.ref }} - - - name: Set test metadata - id: meta - run: | - python3 .github/scripts/prepare_unit_test.py \ - --test-file "${{ matrix.test-file }}" - - - name: decompress uv cache - continue-on-error: true - run: | - if [ -f /opt/dist/uv.tar.xz ]; then - TAR_FILE="/opt/dist/uv.tar.xz" - LAST_FILE="/opt/uv/cache/lastmodified" - - # Get modification time of tar.xz file (epoch seconds) - TAR_MTIME=$(stat -c %Y "$TAR_FILE") - - # Read last recorded modification time if file exists - if [ -f "$LAST_FILE" ]; then - LAST_MTIME=$(cat "$LAST_FILE") - else - LAST_MTIME=0 - fi - - # Compare timestamps to decide whether to decompress - if [ "$TAR_MTIME" = "$LAST_MTIME" ]; then - echo "uv.tar.xz unchanged, skip decompress" - else - echo "decompressing uv.tar.xz..." - - # Prepare temporary directory - mkdir -p /opt/uv/cache/tmp - rm -rf /opt/uv/cache/tmp/* - - # Extract archive - tar -xJf "$TAR_FILE" -C /opt/uv/cache/tmp - - # Replace existing uv directory - rm -rf /opt/uv/cache/uv - mv /opt/uv/cache/tmp/uv /opt/uv/cache/uv - - # Record latest modification time - echo "$TAR_MTIME" > "$LAST_FILE" - - echo "done!" - ls -ahl /opt/uv/cache - echo "==========" - ls -ahl /opt/uv/cache/uv - fi - fi - - - name: Activate uv env - run: | - env_name="evalution_${SAFE_NAME}_cu${{ env.CUDA_VERSION }}_torch${{ env.TORCH_VERSION }}_py${PYTHON_VERSION}_release" - mv /opt/uv/venvs/$env_name /opt/uv/tmp || true - echo "source /opt/uv/setup_uv_venv.sh $env_name ${PYTHON_VERSION}" - source /opt/uv/setup_uv_venv.sh "$env_name" "${PYTHON_VERSION}" - python -VV - - - name: Setup uv env - run: | - /opt/env/init_compiler_torch_only.sh ${{ env.CUDA_VERSION }} ${{ env.TORCH_VERSION }} ${UV_PYTHON} - uv pip install device_smi -U - python3 .github/scripts/install_unit_test_deps.py \ - --test-file "${{ matrix.test-file }}" \ - --uv-python "${UV_PYTHON}" \ - --runner "${{ env.RUNNER }}" \ - --install-project - - - name: Print uv env - run: | - echo "::group::uv python list" - uv python list - ls -ahl /opt/uv/venvs - echo "::endgroup::" - - echo "== python ==" - python -VV - which python - which pip || true - - echo "== nvcc ==" - nvcc --version - - echo "::group::pip list" - uv pip list - echo "::endgroup::" - - echo "== torch ==" - uv pip show torch || true - - echo "::group::project files" - ls -ahl - echo "::endgroup::" - - echo "::group::git status" - git config --global --add safe.directory $(pwd) - git status - echo "::endgroup::" - - - name: Find suitable GPU - if: ${{ steps.meta.outputs.requires-gpu == 'true' }} - run: | - python3 .github/scripts/allocate_gpu.py \ - --base-url "${{ env.BASE_URL }}" \ - --run-id "${{ github.run_id }}" \ - --test "${{ matrix.test-file }}" \ - --runner "${{ runner.name }}" \ - --count "1" \ - --require-single - - - name: Run test - run: | - echo "::group::pip list" - uv pip list - echo "::endgroup::" - - python .github/scripts/run_tests.py \ - --base-url "${{ env.BASE_URL }}" \ - --run-id "${{ github.run_id }}" \ - --test-file "${{ matrix.test-file }}" \ - --runner "${{ runner.name }}" \ - --gpu-id "${CUDA_VISIBLE_DEVICES:-}" \ - --artifacts-dir artifacts - - - name: Release GPU - if: ${{ always() && steps.meta.outputs.requires-gpu == 'true' }} + - name: Print env run: | - if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then - python3 .github/scripts/release_gpu.py \ - --base-url "${{ env.BASE_URL }}" \ - --run-id "${{ github.run_id }}" \ - --gpu-id "${CUDA_VISIBLE_DEVICES}" \ - --test "${{ matrix.test-file }}" \ - --runner "${{ runner.name }}" - else - echo "Skip GPU release because allocation metadata is missing." - fi + echo "repo: ${{ env.repo }}" + echo "ref: ${{ env.ref }}" + echo "artifact_id: ${{ github.event.inputs.artifact_id }}" + echo "max-parallel: ${{ github.event.inputs['max-parallel'] }}" - - name: Clean cache - if: always() + - name: Set runner outputs + id: get_ip run: | - echo "Cleaning workspace: $PWD" - rm -rf ./* .[^.] .??* || true - echo "cleaning venv: ${{ env.VIRTUAL_ENV }}" - rm -rf "${{ env.VIRTUAL_ENV }}" - + bash .github/scripts/ci_write_runner_outputs.sh \ + "$RUNNER" \ + "${{ github.run_id }}" \ + "${{ github.event.inputs.artifact_id }}" \ + "${{ github.event.inputs['max-parallel'] }}" - test2: - needs: list-test-files - if: needs.list-test-files.outputs.files-group2 != '' && needs.list-test-files.outputs.files-group2 != '[]' - strategy: - fail-fast: false - max-parallel: ${{ fromJSON(needs.list-test-files.outputs.max-parallel) }} - matrix: - test-file: ${{ fromJSON(needs.list-test-files.outputs.files-group2) }} - runs-on: [ self-hosted, xeon5 ] - container: - image: 10.0.13.31:5000/nvidia/cuda:132-ubuntu24.04_0325 - volumes: - - /monster/ci/env/entrypoint.sh:/entrypoint.sh - - /monster/ci/env/entrypoint.sh:/etc/profile.d/01-entrypoint.sh - - /dev/dri/by-path:/dev/dri/by-path - - /monster/ci/models:/monster/data/model - - /monster/ci/dataset:/monster/data/model/dataset - - /monster/ci/huggingface:/github/home/.cache/huggingface - - /github/workspace/uv:/opt/uv - - /github/workspace/tmp:/opt/uv/tmp - - /monster/ci/uv/python:/opt/uv/python - - /monster/ci/uv/cache/python:/opt/uv/cache/python - - /monster/ci/uv/setup_uv_venv.sh:/opt/uv/setup_uv_venv.sh - - /monster/ci/uv/uv:/opt/uv/uv - - /monster/ci/uv/uvx:/opt/uv/uvx - - /monster/ci/uv/env:/opt/uv/env - - /monster/ci/uv/uv.toml:/opt/uv/uv.toml - - /monster/ci/env:/opt/env - - /monster/ci/dist:/opt/dist + list-test-files: + runs-on: ubuntu-latest + outputs: + cpu-files: ${{ steps.files.outputs.cpu-files }} + torch-files: ${{ steps.files.outputs.torch-files }} + model-files: ${{ steps.files.outputs.model-files }} + cpu-matrix: ${{ steps.files.outputs.cpu-matrix }} + torch-matrix: ${{ steps.files.outputs.torch-matrix }} + model-matrix: ${{ steps.files.outputs.model-matrix }} steps: - name: Checkout Code uses: actions/checkout@v6 with: + repository: ${{ env.repo }} ref: ${{ env.ref }} - - name: Set test metadata - id: meta - run: | - python3 .github/scripts/prepare_unit_test.py \ - --test-file "${{ matrix.test-file }}" - - - name: decompress uv cache - continue-on-error: true - run: | - if [ -f /opt/dist/uv.tar.xz ]; then - TAR_FILE="/opt/dist/uv.tar.xz" - LAST_FILE="/opt/uv/cache/lastmodified" - - # Get modification time of tar.xz file (epoch seconds) - TAR_MTIME=$(stat -c %Y "$TAR_FILE") - - # Read last recorded modification time if file exists - if [ -f "$LAST_FILE" ]; then - LAST_MTIME=$(cat "$LAST_FILE") - else - LAST_MTIME=0 - fi - - # Compare timestamps to decide whether to decompress - if [ "$TAR_MTIME" = "$LAST_MTIME" ]; then - echo "uv.tar.xz unchanged, skip decompress" - else - echo "decompressing uv.tar.xz..." - - # Prepare temporary directory - mkdir -p /opt/uv/cache/tmp - rm -rf /opt/uv/cache/tmp/* - - # Extract archive - tar -xJf "$TAR_FILE" -C /opt/uv/cache/tmp - - # Replace existing uv directory - rm -rf /opt/uv/cache/uv - mv /opt/uv/cache/tmp/uv /opt/uv/cache/uv - - # Record latest modification time - echo "$TAR_MTIME" > "$LAST_FILE" - - echo "done!" - ls -ahl /opt/uv/cache - echo "==========" - ls -ahl /opt/uv/cache/uv - fi - fi - - - name: Activate uv env - run: | - env_name="evalution_${SAFE_NAME}_cu${{ env.CUDA_VERSION }}_torch${{ env.TORCH_VERSION }}_py${PYTHON_VERSION}_release" - mv /opt/uv/venvs/$env_name /opt/uv/tmp || true - echo "source /opt/uv/setup_uv_venv.sh $env_name ${PYTHON_VERSION}" - source /opt/uv/setup_uv_venv.sh "$env_name" "${PYTHON_VERSION}" - python -VV - - - name: Setup uv env - run: | - /opt/env/init_compiler_torch_only.sh ${{ env.CUDA_VERSION }} ${{ env.TORCH_VERSION }} ${UV_PYTHON} - uv pip install device_smi -U - python3 .github/scripts/install_unit_test_deps.py \ - --test-file "${{ matrix.test-file }}" \ - --uv-python "${UV_PYTHON}" \ - --runner "${{ env.RUNNER }}" \ - --install-project - - - name: Print uv env - run: | - echo "::group::uv python list" - uv python list - ls -ahl /opt/uv/venvs - echo "::endgroup::" - - echo "== python ==" - python -VV - which python - which pip || true - - echo "== nvcc ==" - nvcc --version - - echo "::group::pip list" - uv pip list - echo "::endgroup::" - - echo "== torch ==" - uv pip show torch || true - - echo "::group::project files" - ls -ahl - echo "::endgroup::" - - echo "::group::git status" - git config --global --add safe.directory $(pwd) - git status - echo "::endgroup::" - - - name: Find suitable GPU - if: ${{ steps.meta.outputs.requires-gpu == 'true' }} + - name: Prepare checkout run: | - python3 .github/scripts/allocate_gpu.py \ - --base-url "${{ env.BASE_URL }}" \ - --run-id "${{ github.run_id }}" \ - --test "${{ matrix.test-file }}" \ - --runner "${{ runner.name }}" \ - --count "1" \ - --require-single + bash .github/scripts/ci_prepare_checkout.sh "${{ github.event.inputs.pr_number }}" - - name: Run test - run: | - echo "::group::pip list" - uv pip list - echo "::endgroup::" - - python .github/scripts/run_tests.py \ - --base-url "${{ env.BASE_URL }}" \ - --run-id "${{ github.run_id }}" \ - --test-file "${{ matrix.test-file }}" \ - --runner "${{ runner.name }}" \ - --gpu-id "${CUDA_VISIBLE_DEVICES:-}" \ - --artifacts-dir artifacts - - - name: Release GPU - if: ${{ always() && steps.meta.outputs.requires-gpu == 'true' }} - run: | - if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then - python3 .github/scripts/release_gpu.py \ - --base-url "${{ env.BASE_URL }}" \ - --run-id "${{ github.run_id }}" \ - --gpu-id "${CUDA_VISIBLE_DEVICES}" \ - --test "${{ matrix.test-file }}" \ - --runner "${{ runner.name }}" - else - echo "Skip GPU release because allocation metadata is missing." - fi - - - name: Clean cache - if: always() - run: | - echo "Cleaning workspace: $PWD" - rm -rf ./* .[^.] .??* || true - echo "cleaning venv: ${{ env.VIRTUAL_ENV }}" - rm -rf "${{ env.VIRTUAL_ENV }}" - - prepare-setuptools: - runs-on: ubuntu-latest - outputs: - versions: ${{ steps.parser.outputs.versions || '[]' }} - steps: - - uses: actions/checkout@v6 - - uses: actions/setup-python@v6 - with: - python-version: "3.14t" - - - name: Generate version matrix - id: parser - run: | - python -m pip install --upgrade requests packaging - versions=$(python .github/scripts/ci_loop_versions.py setuptools ">=77.0.1,<83") - echo "versions=$versions" >> "$GITHUB_OUTPUT" - - check-setuptools: - needs: prepare-setuptools - if: needs.prepare-setuptools.outputs.versions != '[]' - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - version: ${{ fromJSON(needs.prepare-setuptools.outputs.versions) }} - - steps: - - uses: actions/checkout@v6 - - uses: actions/setup-python@v6 - with: - python-version: "3.14t" - cache: pip - - - name: Install package with selected setuptools - run: | - python -m pip install --upgrade pip - python -m pip install . "setuptools==${{ matrix.version }}" - - - name: Show versions + - name: List files + id: files run: | - python --version - python -m pip show setuptools + test_plan=$(python3 .github/scripts/ci_workflow.py list-tests \ + --test-regex "${{ github.event.inputs.test_regex }}") + echo "Test plan: $test_plan" + + TEST_PLAN="$test_plan" python3 - <<'PY' >> "$GITHUB_OUTPUT" + import json + import os + + plan = json.loads(os.environ["TEST_PLAN"]) + print(f"cpu-files={json.dumps(plan['cpu_files'])}") + print(f"torch-files={json.dumps(plan['torch_files'])}") + print(f"model-files={json.dumps(plan['model_files'])}") + print(f"cpu-matrix={json.dumps(plan['cpu_matrix'])}") + print(f"torch-matrix={json.dumps(plan['torch_matrix'])}") + print(f"model-matrix={json.dumps(plan['model_matrix'])}") + PY + + TEST_PLAN="$test_plan" python3 - <<'PY' + import json + import os + + plan = json.loads(os.environ["TEST_PLAN"]) + print(f"CPU test files: {json.dumps(plan['cpu_files'])}") + print(f"Torch test files: {json.dumps(plan['torch_files'])}") + print(f"Model test files: {json.dumps(plan['model_files'])}") + print(f"CPU matrix: {json.dumps(plan['cpu_matrix'])}") + print(f"Torch matrix: {json.dumps(plan['torch_matrix'])}") + print(f"Model matrix: {json.dumps(plan['model_matrix'])}") + PY + + cpu: + needs: + - list-test-files + - check-vm + if: always() && !cancelled() && needs.list-test-files.outputs.cpu-matrix != '[]' + uses: ./.github/workflows/unit_tests_reusable.yml + secrets: inherit + with: + repo: ${{ github.event.inputs.repo || github.repository }} + ref: ${{ github.event.inputs.ref || github.ref }} + pr_number: ${{ fromJSON(github.event.inputs.pr_number || '0') }} + check_vm_ip: ${{ needs.check-vm.outputs.ip }} + check_vm_max_parallel: ${{ needs.check-vm.outputs['max-parallel'] }} + check_vm_cuda_version: ${{ needs.check-vm.outputs.cuda_version }} + matrix_json: ${{ needs.list-test-files.outputs.cpu-matrix || '[]' }} + + torch: + needs: + - list-test-files + - check-vm + if: always() && !cancelled() && needs.list-test-files.outputs.torch-matrix != '[]' + uses: ./.github/workflows/unit_tests_reusable.yml + secrets: inherit + with: + repo: ${{ github.event.inputs.repo || github.repository }} + ref: ${{ github.event.inputs.ref || github.ref }} + pr_number: ${{ fromJSON(github.event.inputs.pr_number || '0') }} + check_vm_ip: ${{ needs.check-vm.outputs.ip }} + check_vm_max_parallel: ${{ needs.check-vm.outputs['max-parallel'] }} + check_vm_cuda_version: ${{ needs.check-vm.outputs.cuda_version }} + matrix_json: ${{ needs.list-test-files.outputs.torch-matrix || '[]' }} + + models: + needs: + - list-test-files + - check-vm + if: always() && !cancelled() && needs.list-test-files.outputs.model-matrix != '[]' + uses: ./.github/workflows/unit_tests_reusable.yml + secrets: inherit + with: + repo: ${{ github.event.inputs.repo || github.repository }} + ref: ${{ github.event.inputs.ref || github.ref }} + pr_number: ${{ fromJSON(github.event.inputs.pr_number || '0') }} + check_vm_ip: ${{ needs.check-vm.outputs.ip }} + check_vm_max_parallel: ${{ needs.check-vm.outputs['max-parallel'] }} + check_vm_cuda_version: ${{ needs.check-vm.outputs.cuda_version }} + matrix_json: ${{ needs.list-test-files.outputs.model-matrix || '[]' }} + + setuptools-compatibility: + uses: ./.github/workflows/setuptools_compatibility_reusable.yml + with: + repo: ${{ github.event.inputs.repo || github.repository }} + ref: ${{ github.event.inputs.ref || github.ref }} + pr_number: ${{ fromJSON(github.event.inputs.pr_number || '0') }} diff --git a/.github/workflows/unit_tests_reusable.yml b/.github/workflows/unit_tests_reusable.yml new file mode 100644 index 0000000..f9e0a6b --- /dev/null +++ b/.github/workflows/unit_tests_reusable.yml @@ -0,0 +1,215 @@ +name: Unit Tests Reusable + +defaults: + run: + shell: bash -le {0} + +on: + workflow_call: + inputs: + repo: + description: "GitHub repo {owner}/{repo}" + required: false + default: "" + type: string + ref: + description: "GitHub ref: Branch, Tag or Commit SHA" + required: false + default: "" + type: string + pr_number: + description: "PR Number" + required: false + default: 0 + type: number + check_vm_ip: + description: "Selected CI runner IP" + required: true + type: string + check_vm_max_parallel: + description: "Serialized max parallel payload" + required: true + type: string + check_vm_cuda_version: + description: "Selected CI container CUDA image version" + required: true + type: string + matrix_json: + description: "Serialized unit test matrix for this reusable job" + required: true + type: string + +permissions: + contents: read + +env: + repo: ${{ inputs.repo || github.repository }} + ref: ${{ inputs.ref || github.ref }} + CUDA_DEVICE_ORDER: PCI_BUS_ID + CUDA_VERSION: 131 + UV_TORCH_BACKEND: cu130 + TORCH_VERSION: 2.11.0 + PYTHON_VERSION: 3.14t + UV_PYTHON: 3.14t + PYTHON_GIL: 0 + RUNNER: 10.0.13.31 + BASE_URL: http://10.0.13.31/gpu + LOGBAR_ANIMATION: "0" + HF_TOKEN: ${{ secrets.HF_TOKEN }} + +jobs: + test: + runs-on: [self-hosted, xeon5] + if: ${{ !cancelled() && inputs.matrix_json != '[]' }} + container: + image: ${{ inputs.check_vm_ip }}:5000/nvidia/cuda:${{ inputs.check_vm_cuda_version }}-ubuntu24.04_0325 + options: --device /dev/dri --ipc=host --runtime=nvidia --gpus all + volumes: + - /monster/ci/env/entrypoint.sh:/entrypoint.sh + - /monster/ci/env/entrypoint.sh:/etc/profile.d/01-entrypoint.sh + - /dev/dri/by-path:/dev/dri/by-path + - /monster/ci/models:/monster/data/model + - /monster/ci/dataset:/monster/data/model/dataset + - /monster/ci/huggingface:/github/home/.cache/huggingface + - /github/workspace/uv:/opt/uv + - /github/workspace/tmp:/opt/uv/tmp + - /monster/ci/uv/python:/opt/uv/python + - /monster/ci/uv/cache/python:/opt/uv/cache/python + - /monster/ci/uv/setup_uv_venv.sh:/opt/uv/setup_uv_venv.sh + - /monster/ci/uv/uv:/opt/uv/uv + - /monster/ci/uv/uvx:/opt/uv/uvx + - /monster/ci/uv/env:/opt/uv/env + - /monster/ci/uv/uv.toml:/opt/uv/uv.toml + - /monster/ci/env:/opt/env + - /monster/ci/dist:/opt/dist + strategy: + fail-fast: false + max-parallel: ${{ fromJSON(inputs.check_vm_max_parallel).size || 20 }} + matrix: + include: ${{ fromJSON(inputs.matrix_json || '[]') }} + steps: + - name: Checkout Code + uses: actions/checkout@v6 + with: + repository: ${{ env.repo }} + ref: ${{ env.ref }} + + - name: Prepare checkout + run: | + bash .github/scripts/ci_prepare_checkout.sh "${{ inputs.pr_number }}" + + - name: Restore uv cache + continue-on-error: true + run: | + bash .github/scripts/ci_restore_uv_cache.sh + + - name: Set test metadata + id: meta + run: | + python3 .github/scripts/ci_tests.py set-metadata \ + --test-file "${{ matrix.test_file }}" + + - name: Activate uv env + run: | + export EVALUTION_TORCH_EXTENSIONS_DIR="/tmp/evalution/torch_extensions/${{ github.run_id }}/${{ github.run_attempt }}/${SAFE_NAME}" + export TORCH_EXTENSIONS_DIR="$EVALUTION_TORCH_EXTENSIONS_DIR" + mkdir -p "$EVALUTION_TORCH_EXTENSIONS_DIR" + echo "EVALUTION_TORCH_EXTENSIONS_DIR=$EVALUTION_TORCH_EXTENSIONS_DIR" >> "$GITHUB_ENV" + echo "TORCH_EXTENSIONS_DIR=$TORCH_EXTENSIONS_DIR" >> "$GITHUB_ENV" + + uv cache prune --ci + + env_name="evalution_${SAFE_NAME}_cu${{ env.CUDA_VERSION }}_torch${{ env.TORCH_VERSION }}_py${PYTHON_VERSION}_release" + mv "/opt/uv/venvs/$env_name" /opt/uv/tmp || true + source /opt/uv/setup_uv_venv.sh "$env_name" "${PYTHON_VERSION}" + python -VV + + - name: Setup uv env + run: | + /opt/env/init_compiler_torch_only.sh ${{ env.CUDA_VERSION }} ${{ env.TORCH_VERSION }} "${UV_PYTHON}" + uv pip install device_smi -U + python3 .github/scripts/ci_tests.py install-deps \ + --test-file "${{ matrix.test_file }}" \ + --uv-python "${UV_PYTHON}" \ + --runner "${{ env.RUNNER }}" \ + --install-project + + - name: Print uv env + run: | + echo "::group::uv python list" + uv python list + ls -ahl /opt/uv/venvs + echo "::endgroup::" + + echo "== python ==" + python -VV + which python + which pip || true + + echo "== nvcc ==" + nvcc --version + + echo "::group::pip list" + uv pip list + echo "::endgroup::" + + echo "== torch ==" + uv pip show torch || true + + echo "::group::project files" + ls -ahl + echo "::endgroup::" + + echo "::group::git status" + git config --global --add safe.directory "$(pwd)" + git status + echo "::endgroup::" + + - name: Find suitable GPU + if: ${{ steps.meta.outputs.requires-gpu == 'true' }} + run: | + python3 .github/scripts/ci_gpu.py allocate \ + --base-url "${{ env.BASE_URL }}" \ + --run-id "${{ github.run_id }}" \ + --test "${{ matrix.test_file }}" \ + --runner "${{ runner.name }}" \ + --count "1" \ + --require-single + + - name: Run test + run: | + echo "::group::pip list" + uv pip list + echo "::endgroup::" + + python .github/scripts/ci_tests.py run \ + --base-url "${{ env.BASE_URL }}" \ + --run-id "${{ github.run_id }}" \ + --test-file "${{ matrix.test_file }}" \ + --runner "${{ runner.name }}" \ + --gpu-id "${CUDA_VISIBLE_DEVICES:-}" \ + --artifacts-dir artifacts + + - name: Release GPU + if: ${{ always() && steps.meta.outputs.requires-gpu == 'true' }} + run: | + if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then + python3 .github/scripts/ci_gpu.py release \ + --base-url "${{ env.BASE_URL }}" \ + --run-id "${{ github.run_id }}" \ + --gpu-id "${CUDA_VISIBLE_DEVICES}" \ + --test "${{ matrix.test_file }}" \ + --runner "${{ runner.name }}" + else + echo "Skip GPU release because allocation metadata is missing." + fi + + - name: Clean cache + if: always() + run: | + echo "Cleaning workspace: $PWD" + rm -rf ./* .[^.] .??* || true + echo "cleaning venv: ${{ env.VIRTUAL_ENV }}" + rm -rf "${{ env.VIRTUAL_ENV }}" + echo "Cleaning uv cache" + uv cache prune --ci diff --git a/scripts/arch.md b/scripts/arch.md index 8a9cb2d..a9c3681 100644 --- a/scripts/arch.md +++ b/scripts/arch.md @@ -3,29 +3,22 @@ ## Naming - Workflow entrypoints follow the same `ci_*.py` convention as GPTQModel. -- `ci_common.py` and `ci_gpu.py` are kept structurally aligned with GPTQModel so allocator and GitHub env handling stay consistent across repos. +- `ci_common.py`, `ci_gpu.py`, `ci_tests.py`, and `ci_workflow.py` are the only Python CI entrypoints that workflows should call directly. ## Evalution unit test flow 1. `list-test-files` -- `ci_workflow.py set-matrix-config` writes the matrix parallelism output. -- `ci_workflow.py list-tests` enumerates `tests/test_*.py`, applies the optional regex filter, emits the test matrix, and reports whether any `unit_test_common` cases are present. +- `ci_workflow.py list-tests` enumerates `tests/test_*.py`, applies the optional regex filter, and emits separate CPU, torch, and model matrices. -2. `prepare-common-env` -- When any common tests are scheduled, the workflow activates the branch-scoped `unit_test_common` uv env, runs `ci_workflow.py setup-uv-env`, and runs `ci_workflow.py prepare-test-run` once before the test matrix starts. +2. `test` job setup +- `ci_tests.py set-metadata` derives `SAFE_NAME` and whether the test requires a GPU by reading the file marker and test-specific Python runtime rules. +- `ci_restore_uv_cache.sh` restores the shared uv cache before each reusable job matrix starts. +- The reusable workflow activates the uv env and calls `ci_tests.py install-deps` to install test-specific Python dependencies. -3. `test` job setup -- `ci_workflow.py set-test-metadata` derives `SAFE_NAME`, `ENV_FAMILY`, and whether the test requires a GPU by reading the file marker. -- `ci_workflow.py activate-uv-env` resolves the test runtime signature, writes `PYTHON_VERSION`, `UV_PYTHON`, `ENV_NAME`, and `UV_CACHE_DIR`, then activates the shared uv env for that signature. -- `ci_workflow.py setup-uv-env` initializes compiler and torch state, then installs the Python-version-specific runtime dependencies once per shared env under a filesystem lock. -- `ci_workflow.py print-uv-env` prints the same diagnostic state the old shell step emitted. - -4. execution +3. execution - `ci_gpu.py allocate` reserves a GPU only when the test requires one. -- Common-env matrix jobs skip `ci_workflow.py setup-uv-env` and `ci_workflow.py prepare-test-run` because `prepare-common-env` has already installed the shared runtime and project package. -- `unit_test_tensorrt_llm` keeps the original flow and prepares its env lazily when that test runs. - `ci_tests.py run` executes pytest, writes artifacts, and optionally keeps the GPU lease alive while the test is running. -- `ci_workflow.py release-gpu-if-present` releases the GPU only when allocation metadata exists. +- `ci_gpu.py release` releases the GPU only when allocation metadata exists. ## Maintenance rule