forked from NVIDIA/Model-Optimizer
-
Notifications
You must be signed in to change notification settings - Fork 3
105 lines (102 loc) · 4.02 KB
/
gpu_tests.yml
File metadata and controls
105 lines (102 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# GPU test workflow: runs for copied PR branches, nightly, and on demand.
name: GPU tests

on:
  push:
    # NOTE: `paths` cannot be used here because the push happens to a copied PR
    # branch and only the latest commit of the PR is used.
    branches:
      - "pull-request/[0-9]+"
  schedule:
    # Nightly
    - cron: "0 0 * * *"
  # On-demand
  workflow_dispatch:
# Cancel previous runs when a new commit is pushed to the same PR.
# The group key uses the ref for `refs/heads/pull-request/*` and the commit SHA otherwise.
concurrency:
  cancel-in-progress: true
  group: "${{ github.workflow }}-${{ startsWith(github.ref, 'refs/heads/pull-request/') && github.ref || github.sha }}"
jobs:
  # Decides whether the PR touches files the GPU tests depend on.
  # Only runs for refs under refs/heads/pull-request/.
  check-file-changes:
    if: startsWith(github.ref, 'refs/heads/pull-request/')
    runs-on: ubuntu-latest
    outputs:
      any_changed: ${{ steps.changed-tests.outputs.any_changed }}
    steps:
      - uses: actions/checkout@v6
        with:
          # Full history so `git merge-base` below can resolve the common ancestor
          fetch-depth: 0
      - id: get-pr-info
        uses: nv-gha-runners/get-pr-info@main
      # Get the commit shared by the PR head and its base to use as base for changed files
      - id: calculate-merge-base
        env:
          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
        run: |
          # Resolve the merge base in a standalone assignment so that a failing
          # `git merge-base` aborts the step; piping it straight into `tee`
          # would report only the exit status of `tee`.
          merge_base="$(git merge-base "$BASE_SHA" "$PR_SHA")"
          echo "merge-base=${merge_base}" | tee --append "${GITHUB_OUTPUT}"
      - name: Check for changes in test-relevant directories
        id: changed-tests
        uses: step-security/changed-files@v46.0.5
        with:
          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
          files: |
            .github/workflows/gpu_tests.yml
            modelopt/**
            tests/gpu/**
            pyproject.toml
            tox.ini
          fail_on_initial_diff_error: true

  # Calls the reusable _wait_for_checks workflow; skipped unless relevant files changed.
  wait-checks:
    needs: [check-file-changes]
    if: needs.check-file-changes.outputs.any_changed == 'true'
    uses: ./.github/workflows/_wait_for_checks.yml
    permissions:
      checks: read
    secrets: inherit
    with:
      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
      delay: 300s

  # GPU test matrix for PR branches. The anchors defined here (&gpu_strategy,
  # &gpu_container, &gpu_steps) are reused by gpu-tests-non-pr below.
  gpu-tests-pr:
    needs: [check-file-changes, wait-checks]
    if: needs.check-file-changes.outputs.any_changed == 'true'
    strategy: &gpu_strategy
      fail-fast: false
      matrix:
        include:
          # `example` selects the tox environment, `timeout` is the job timeout
          # in minutes, `container_image` is appended to nvcr.io/nvidia/.
          - example: gpu
            timeout: 45
            container_image: "pytorch:26.01-py3"
          - example: gpu-megatron
            timeout: 45
            container_image: "pytorch:26.01-py3"
          - example: gpu-trtllm
            timeout: 30
            container_image: "tensorrt-llm/release:1.3.0rc5"
    runs-on: linux-amd64-gpu-rtxpro6000-latest-1
    timeout-minutes: ${{ matrix.timeout }}
    container: &gpu_container
      image: nvcr.io/nvidia/${{ matrix.container_image }}
      env:
        # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
        # NOTE(review): actions/checkout takes `with: fetch-depth` and does not read
        # GIT_DEPTH - confirm what consumes this variable.
        GIT_DEPTH: 1000
        PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps: &gpu_steps
      - uses: actions/checkout@v6
      - uses: nv-gha-runners/setup-proxy-cache@main
      - name: Setup environment variables
        run: |
          # Prepend the existing LD_LIBRARY_PATH only when it is non-empty, so the
          # exported value never starts with an empty entry.
          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/include:/usr/lib/x86_64-linux-gnu" >> "$GITHUB_ENV"
      - name: Run gpu tests
        run: pip install tox-current-env && tox -e cuda13-${{ matrix.example }} --current-env

  # Same matrix, container and steps as gpu-tests-pr, for every ref that is not
  # under refs/heads/pull-request/; uses a different runner label.
  gpu-tests-non-pr:
    if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
    strategy: *gpu_strategy
    runs-on: linux-amd64-gpu-rtxpro6000-latest-2
    timeout-minutes: ${{ matrix.timeout }}
    container: *gpu_container
    steps: *gpu_steps

  # Single status for PR branches: fails when check-file-changes did not succeed,
  # or when relevant files changed and gpu-tests-pr did not succeed.
  gpu-pr-required-check:
    # Run even if gpu-tests-pr is skipped
    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
    needs: [check-file-changes, gpu-tests-pr]
    runs-on: ubuntu-latest
    steps:
      - name: Required GPU tests did not succeed
        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
        run: exit 1