Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
427cac1
Infrastructure to test DaCe's codegen (in)deterministic behavior
kotsaloscv Apr 30, 2026
3c452d5
Merge branch 'main' into dace_deterministic_codegen_test
kotsaloscv May 3, 2026
655fa5f
Infrastructure to test DaCe's codegen (in)deterministic behavior: CI/…
kotsaloscv May 4, 2026
a813bb8
Infrastructure to test DaCe's codegen (in)deterministic behavior: CI/…
kotsaloscv May 4, 2026
75f0dcf
Merge branch 'main' into dace_deterministic_codegen_test
kotsaloscv May 8, 2026
60bd7cd
Merge branch 'main' into dace_deterministic_codegen_test
kotsaloscv May 18, 2026
0984297
WIP: wording
kotsaloscv May 18, 2026
0e0670e
WIP: wording
kotsaloscv May 19, 2026
02cacd1
Testing infrastructure in Santis/Beverin: WIP
kotsaloscv May 19, 2026
6dbe044
Merge branch 'main' into dace_deterministic_codegen_test
kotsaloscv May 20, 2026
801fc34
Infrastructure to test DaCe's codegen (in)deterministic behavior: Tes…
kotsaloscv May 20, 2026
0af9045
Infrastructure to test DaCe's codegen (in)deterministic behavior: Tes…
kotsaloscv May 21, 2026
a6ba257
Infrastructure to test DaCe's codegen (in)deterministic behavior: Tes…
kotsaloscv May 21, 2026
247768f
Infrastructure to test DaCe's codegen (in)deterministic behavior: Tes…
kotsaloscv May 21, 2026
2a266d7
Infrastructure to test DaCe's codegen (in)deterministic behavior: CI/…
kotsaloscv May 21, 2026
4c633ec
Merge branch 'main' into dace_deterministic_codegen_test
kotsaloscv May 21, 2026
9c2b44d
WIP
kotsaloscv May 22, 2026
c6632a1
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 22, 2026
5b004e3
Merge branch 'main' into dace_deterministic_codegen_test
kotsaloscv May 22, 2026
5f818a3
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 22, 2026
f7a4c44
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 26, 2026
5e8dedb
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 26, 2026
9cb5b5f
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 26, 2026
e119731
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 26, 2026
aa21bb3
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 26, 2026
714d383
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 26, 2026
83aa993
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 26, 2026
61e9d1f
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 27, 2026
d7ba128
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 27, 2026
2154dd0
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 27, 2026
4eb2152
Merge branch 'main' into dace_deterministic_codegen_test
kotsaloscv May 27, 2026
1f8a868
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 27, 2026
b548cc2
Merge branch 'main' into dace_deterministic_codegen_test
kotsaloscv May 27, 2026
424a0dd
Infrastructure to test DaCe's codegen (in)deterministic behavior: Fix…
kotsaloscv May 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions ci/cscs-ci-dace-determinism.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#
# GT4Py - GridTools Framework
#
# Copyright (c) 2014-2024, ETH Zurich
# All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#

# DaCe codegen determinism check — STANDALONE CSCS-CI PIPELINE
# ============================================================
#
# How to trigger
# --------------
# Whitelisted users trigger it on any PR by posting the comment:
#
#     cscs-ci run dace-determinism
#
# What it does
# ------------
# Drives gt4py's `test_*_dace_determinism` nox sessions, each of which
# runs pytest twice with isolated build caches and asserts the
# DaCe-generated source files are byte-identical between the two runs.
# A diff means the gt4py + dace toolchain is non-deterministic for that
# test selection.
#
# Logic lives in:
#   noxfile.py                             (test_*_dace_determinism sessions)
#   scripts/dace_deterministic_codegen.py  (cache comparison lib + CLI)

# External pipeline pieces: the remote CSCS CI recipe templates and this
# repository's own CI extension configuration.
include:
  - remote: "https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml"
  - local: "ci/cscs-ci-ext-config.yml"

# Version pins interpolated further down (base-image tags, Docker build args).
# All values are quoted so they stay strings ('24.04' must not become a float).
variables:
  CUDA_VERSION: "12.6.2"
  ROCM_VERSION: "7.1.1"
  UBUNTU_VERSION: "24.04"
  UV_VERSION: "0.11.2"

# Python versions fed into the `parallel: matrix` entries via the
# `*test_python_versions` alias. Quoted: a bare 3.10 would parse as 3.1.
.test_python_versions: &test_python_versions
  - "3.10"

# Pipeline stages, in execution order.
stages: [build, test]

# -- Build stage --
# Hidden template holding the settings shared by every image-build job.
# The matrix expands to one job per Python version.
.build_common:
  stage: build
  extends:
    - .dynamic-image-name
  variables:
    # Default base image; the CUDA/ROCm templates replace it.
    BASE_IMAGE: 'jfrog.svc.cscs.ch/dockerhub/ubuntu:${UBUNTU_VERSION}'
    CSCS_REBUILD_POLICY: if-not-exists
    DOCKERFILE: ci/Dockerfile
    # JSON-encoded list of variable names passed to the Docker build.
    DOCKER_BUILD_ARGS: '["BASE_IMAGE", "CACHE_DIR", "EXTRA_APTGET", "EXTRA_UV_ENV_VARS", "EXTRA_UV_PIP_ARGS", "EXTRA_UV_SYNC_ARGS", "PY_VERSION", "UV_VERSION", "WORKDIR_PATH" ]'
    PERSIST_IMAGE_NAME: '${CSCS_REGISTRY_PATH}/public/${ARCH}/base/gt4py-ci-${PY_VERSION}'
    # NOTE(review): this list names ci/cscs-ci.yml rather than this file --
    # presumably intentional (shared image with the main pipeline); confirm.
    WATCH_FILECHANGES: "ci/Dockerfile ci/cscs-ci.yml ci/cscs-ci-ext-config.yml uv.lock"
  parallel:
    matrix:
      - PY_VERSION: *test_python_versions

# CUDA-specific build overrides: NVIDIA CUDA devel base image plus the
# `cuda12` extra for `uv sync`.
.build_extra_cuda:
  variables:
    BASE_IMAGE: 'jfrog.svc.cscs.ch/dockerhub/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}'
    EXTRA_UV_SYNC_ARGS: '--extra cuda12'

# ROCm-specific build overrides: ROCm base image, the `rocm7` extra for
# `uv sync`, HIP-related environment for the build, and a 64Gi memory
# request/limit for the builder.
.build_extra_rocm:
  variables:
    BASE_IMAGE: 'jfrog.svc.cscs.ch/dockerhub/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete'
    EXTRA_UV_SYNC_ARGS: '--extra rocm7'
    EXTRA_UV_ENV_VARS: 'CUPY_INSTALL_USE_HIP=1 HCC_AMDGPU_TARGET=gfx942 ROCM_HOME=/opt/rocm'
    KUBERNETES_MEMORY_REQUEST: '64Gi'
    KUBERNETES_MEMORY_LIMIT: '64Gi'

# Builds the CUDA image on the GH200 container builder.
# Later `extends` entries override earlier ones, so the order matters.
build_cscs_gh200:
  extends:
    - .container-builder-cscs-gh200
    - .build_common
    - .build_extra_cuda
  # Empty `needs`: the job does not wait on any other job.
  needs: []

# Builds the ROCm image on the zen2 container builder.
# Later `extends` entries override earlier ones, so the order matters.
build_cscs_amd_rocm:
  extends:
    - .container-builder-cscs-zen2
    - .build_common
    - .build_extra_rocm
  # Empty `needs`: the job does not wait on any other job.
  needs: []

# -- Test stage --
# Hidden template shared by the determinism test jobs.
# It expands a (SUBPACKAGE, SUBVARIANT[, DETAIL], PY_VERSION) matrix, runs the
# matching `test_<subpackage>_dace_determinism` nox session, and always
# uploads the `_dace_deterministic_codegen/` directory as an artifact.
.dace_determinism_common:
  stage: test
  image: '${CSCS_REGISTRY_PATH}/public/${ARCH}/base/gt4py-ci-${PY_VERSION}:${DOCKER_TAG}'
  variables:
    # Variants a runner is allowed to execute; concrete jobs override this.
    TEST_VARIANTS: "cpu"
    # TODO(havogt): to workaround the libfabric hook injecting incompatible libraries
    USE_MPI: 0
    SLURM_JOB_NUM_NODES: 1
    SLURM_TIMELIMIT: 60
  # A failing session is reported but does not fail the pipeline.
  allow_failure: true
  artifacts:
    when: always
    paths:
      - _dace_deterministic_codegen/
    expire_in: 1 month
  parallel:
    matrix:
      - SUBPACKAGE:
          - cartesian
        SUBVARIANT:
          - cuda12
          - rocm7
          - cpu
        PY_VERSION: *test_python_versions
      - SUBPACKAGE:
          - next
        SUBVARIANT:
          - cuda12
          - rocm7
          - cpu
        DETAIL:
          - nomesh
        PY_VERSION: *test_python_versions
  # Drop every matrix entry whose SUBVARIANT is not one of the
  # whitespace-separated words in TEST_VARIANTS. Anchored so that concrete
  # jobs can reuse the list through `*exclude_variants_rules`.
  rules: &exclude_variants_rules
    - if: '$SUBVARIANT == "cpu" && ($TEST_VARIANTS !~ /(^|\s)cpu(\s|$)/)'
      when: never
    - if: '$SUBVARIANT == "cuda12" && ($TEST_VARIANTS !~ /(^|\s)cuda12(\s|$)/)'
      when: never
    - if: '$SUBVARIANT == "rocm7" && ($TEST_VARIANTS !~ /(^|\s)rocm7(\s|$)/)'
      when: never

  script:
    # Fresh shallow clone, then pin the checkout to the commit under test.
    - mkdir -p "${WORKDIR}/gt4py" && git clone --depth 1 "${CSCS_CI_ORIG_CLONE_URL}" "${WORKDIR}/gt4py"
    - cd "${WORKDIR}/gt4py" && git fetch --depth 1 origin "${CI_COMMIT_SHA}" && git checkout "${CI_COMMIT_SHA}"
    # Parametrization suffix of the nox session: "(variant)" or "(variant, detail)".
    - export NOX_SESSION_ARGS="(${SUBVARIANT}${DETAIL:+, $DETAIL})"
    # Capture the nox exit code instead of aborting, so the output directory
    # is still copied into the project dir for upload; then exit with it.
    - |
      cd "${WORKDIR}/gt4py"
      nox_rc=0
      ./noxfile.py -s "test_${SUBPACKAGE}_dace_determinism-${PY_VERSION}${NOX_SESSION_ARGS}" || nox_rc=$?
      if [ -d _dace_deterministic_codegen ]; then
        cp -r _dace_deterministic_codegen "${CI_PROJECT_DIR}/"
      fi
      exit $nox_rc

# Determinism checks on the Santis GH200 runner; covers the `cpu` and
# `cuda12` variants and uses the image from `build_cscs_gh200`.
dace_determinism_cscs_gh200:
  extends:
    - .container-runner-santis-gh200
    - .dace_determinism_common
  needs:
    - build_cscs_gh200
  variables:
    TEST_VARIANTS: "cpu cuda12"
    SLURM_GPUS_PER_NODE: 1
    SLURM_PARTITION: "shared"
    GT4PY_BUILD_JOBS: 8
    PYTEST_XDIST_AUTO_NUM_WORKERS: 32
  # Rules are evaluated top to bottom and the first match wins, so the more
  # specific conditions must stay ahead of the generic ones.
  rules:
    # Variant filter shared with .dace_determinism_common (nested alias).
    - *exclude_variants_rules
    - if: "$SUBPACKAGE == 'next' && $DETAIL == 'nomesh'"
      variables:
        # TODO: investigate why the dace tests seem to hang with multiple jobs
        GT4PY_BUILD_JOBS: 1
    - if: "$SUBPACKAGE == 'cartesian' && $SUBVARIANT == 'cpu'"
      variables:
        GT4PY_CARTESIAN_DETERMINISM_XDIST: 3
    - if: "$SUBPACKAGE == 'cartesian'"
      variables:
        GT4PY_CARTESIAN_DETERMINISM_XDIST: 4
    # Fallback: run every remaining matrix entry with the default variables.
    - when: on_success

# Determinism checks on the Beverin AMD runner; covers only the `rocm7`
# variant and uses the image from `build_cscs_amd_rocm`.
dace_determinism_cscs_amd_rocm:
  extends:
    - .tds-container-runner-beverin-mi200
    - .dace_determinism_common
  needs:
    - build_cscs_amd_rocm
  variables:
    TEST_VARIANTS: "rocm7"
    SLURM_GPUS_PER_NODE: 4
    GT4PY_BUILD_JOBS: 8
    PYTEST_XDIST_AUTO_NUM_WORKERS: 32
    SLURM_PARTITION: mi300
    # Point the build tooling at the ROCm installation; hipcc is the C++ compiler.
    CMAKE_PREFIX_PATH: /opt/rocm
    CUDA_HOME: /opt/rocm
    CXX: /opt/rocm/bin/hipcc
  # Rules are evaluated top to bottom and the first match wins.
  rules:
    # Variant filter shared with .dace_determinism_common (nested alias).
    - *exclude_variants_rules
    - if: "$SUBPACKAGE == 'cartesian'"
      variables:
        GT4PY_CARTESIAN_DETERMINISM_XDIST: 4
    # Fallback: run every remaining matrix entry with the default variables.
    - when: on_success
Loading