diff --git a/ci/cscs-ci-dace-determinism.yml b/ci/cscs-ci-dace-determinism.yml new file mode 100644 index 0000000000..55f1297879 --- /dev/null +++ b/ci/cscs-ci-dace-determinism.yml @@ -0,0 +1,179 @@ +# +# GT4Py - GridTools Framework +# +# Copyright (c) 2014-2024, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +# DaCe codegen determinism check — STANDALONE CSCS-CI PIPELINE +# ============================================================ +# +# How to trigger +# -------------- +# Whitelisted users trigger it on any PR by posting the comment: +# +# cscs-ci run dace-determinism +# +# What it does +# ------------ +# Drives gt4py's `test_*_dace_determinism` nox sessions, each of which +# runs pytest twice with isolated build caches and asserts the +# DaCe-generated source files are byte-identical between the two runs. +# A diff means the gt4py + dace toolchain is non-deterministic for that +# test selection. 
+# +# Logic lives in: +# noxfile.py (test_*_dace_determinism sessions) +# scripts/dace_deterministic_codegen.py (cache comparison lib + CLI) + +include: + - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' + - local: 'ci/cscs-ci-ext-config.yml' + +variables: + CUDA_VERSION: '12.6.2' + ROCM_VERSION: '7.1.1' + UBUNTU_VERSION: '24.04' + UV_VERSION: '0.11.2' + +.test_python_versions: &test_python_versions ['3.10'] + +stages: + - build + - test + +# -- Build stage -- +.build_common: + stage: build + extends: + - .dynamic-image-name + variables: + BASE_IMAGE: jfrog.svc.cscs.ch/dockerhub/ubuntu:${UBUNTU_VERSION} + CSCS_REBUILD_POLICY: if-not-exists + DOCKERFILE: ci/Dockerfile + DOCKER_BUILD_ARGS: '["BASE_IMAGE", "CACHE_DIR", "EXTRA_APTGET", "EXTRA_UV_ENV_VARS", "EXTRA_UV_PIP_ARGS", "EXTRA_UV_SYNC_ARGS", "PY_VERSION", "UV_VERSION", "WORKDIR_PATH" ]' + PERSIST_IMAGE_NAME: ${CSCS_REGISTRY_PATH}/public/${ARCH}/base/gt4py-ci-${PY_VERSION} + WATCH_FILECHANGES: 'ci/Dockerfile ci/cscs-ci.yml ci/cscs-ci-ext-config.yml uv.lock' + parallel: + matrix: + - PY_VERSION: *test_python_versions + +.build_extra_cuda: + variables: + BASE_IMAGE: jfrog.svc.cscs.ch/dockerhub/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + EXTRA_UV_SYNC_ARGS: "--extra cuda12" + +.build_extra_rocm: + variables: + BASE_IMAGE: jfrog.svc.cscs.ch/dockerhub/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + EXTRA_UV_SYNC_ARGS: "--extra rocm7" + EXTRA_UV_ENV_VARS: "CUPY_INSTALL_USE_HIP=1 HCC_AMDGPU_TARGET=gfx942 ROCM_HOME=/opt/rocm" + KUBERNETES_MEMORY_REQUEST: "64Gi" + KUBERNETES_MEMORY_LIMIT: "64Gi" + +build_cscs_gh200: + extends: + - .container-builder-cscs-gh200 + - .build_common + - .build_extra_cuda + needs: [] + +build_cscs_amd_rocm: + extends: + - .container-builder-cscs-zen2 + - .build_common + - .build_extra_rocm + needs: [] + +# -- Test stage -- +.dace_determinism_common: + stage: test + image: 
${CSCS_REGISTRY_PATH}/public/${ARCH}/base/gt4py-ci-${PY_VERSION}:${DOCKER_TAG} + variables: + TEST_VARIANTS: 'cpu' + USE_MPI: 0 # TODO(havogt): to workaround the libfabric hook injecting incompatible libraries + SLURM_JOB_NUM_NODES: 1 + SLURM_TIMELIMIT: 60 + allow_failure: true + artifacts: + when: always + paths: + - _dace_deterministic_codegen/ + expire_in: 1 month + parallel: + matrix: + - SUBPACKAGE: [cartesian] + SUBVARIANT: ['cuda12', 'rocm7', 'cpu'] + PY_VERSION: *test_python_versions + - SUBPACKAGE: [next] + SUBVARIANT: ['cuda12', 'rocm7', 'cpu'] + DETAIL: ["nomesh"] + PY_VERSION: *test_python_versions + rules: &exclude_variants_rules + - if: '$SUBVARIANT == "cpu" && ($TEST_VARIANTS !~ /(^|\s)cpu(\s|$)/)' + when: never + - if: '$SUBVARIANT == "cuda12" && ($TEST_VARIANTS !~ /(^|\s)cuda12(\s|$)/)' + when: never + - if: '$SUBVARIANT == "rocm7" && ($TEST_VARIANTS !~ /(^|\s)rocm7(\s|$)/)' + when: never + + script: + - mkdir -p "${WORKDIR}/gt4py" && git clone --depth 1 "${CSCS_CI_ORIG_CLONE_URL}" "${WORKDIR}/gt4py" + - cd "${WORKDIR}/gt4py" && git fetch --depth 1 origin "${CI_COMMIT_SHA}" && git checkout "${CI_COMMIT_SHA}" + - export NOX_SESSION_ARGS="(${SUBVARIANT}${DETAIL:+, $DETAIL})" + - | + cd "${WORKDIR}/gt4py" + nox_rc=0 + ./noxfile.py -s "test_${SUBPACKAGE}_dace_determinism-${PY_VERSION}${NOX_SESSION_ARGS}" || nox_rc=$? 
+ if [ -d _dace_deterministic_codegen ]; then + cp -r _dace_deterministic_codegen "${CI_PROJECT_DIR}/" + fi + exit $nox_rc + +dace_determinism_cscs_gh200: + extends: + - .container-runner-santis-gh200 + - .dace_determinism_common + needs: + - build_cscs_gh200 + variables: + TEST_VARIANTS: 'cuda12' + SLURM_GPUS_PER_NODE: 1 + SLURM_PARTITION: 'shared' + GT4PY_BUILD_JOBS: 8 + PYTEST_XDIST_AUTO_NUM_WORKERS: 32 + rules: + - *exclude_variants_rules + - if: $SUBPACKAGE == 'next' && $DETAIL == 'nomesh' + variables: + # TODO: investigate why the dace tests seem to hang with multiple jobs + GT4PY_BUILD_JOBS: 1 + - if: $SUBPACKAGE == 'cartesian' + variables: + GT4PY_CARTESIAN_DETERMINISM_XDIST: 4 + - when: on_success + +dace_determinism_cscs_amd_rocm: + extends: + - .tds-container-runner-beverin-mi200 + - .dace_determinism_common + needs: + - build_cscs_amd_rocm + variables: + TEST_VARIANTS: 'cpu rocm7' + SLURM_GPUS_PER_NODE: 4 + GT4PY_BUILD_JOBS: 8 + PYTEST_XDIST_AUTO_NUM_WORKERS: 32 + SLURM_PARTITION: mi300 + CMAKE_PREFIX_PATH: /opt/rocm + CUDA_HOME: /opt/rocm + CXX: /opt/rocm/bin/hipcc + rules: + - *exclude_variants_rules + - if: $SUBPACKAGE == 'cartesian' + variables: + GT4PY_CARTESIAN_DETERMINISM_XDIST: 4 + - when: on_success diff --git a/noxfile.py b/noxfile.py index 8abee16251..7f79613b21 100755 --- a/noxfile.py +++ b/noxfile.py @@ -21,6 +21,8 @@ import os import pathlib +import shutil +import sys from collections.abc import Sequence from typing import Final, Literal, TypeAlias @@ -345,5 +347,284 @@ def test_typing_exports(session: nox.Session) -> None: ) +# -- DaCe codegen determinism check -- +# +# The two `test_*_dace_determinism` sessions below each run gt4py's pytest +# selection twice with isolated GT4PY_BUILD_CACHE_DIR per run, then +# verify the DaCe-generated source files under /src/ are +# byte-identical between the two runs. A diff is a determinism bug +# somewhere in the gt4py + dace toolchain for that test selection. 
+# +# Comparison logic (snapshot, hash, diff, report) lives in +# `scripts/dace_deterministic_codegen.py`; the helper below just +# wires gt4py's existing pytest invocation pattern into a "run +# twice + compare" loop. +# +# Workdir at REPO_ROOT/_dace_deterministic_codegen/ (wiped before +# each session invocation): +# run1/.gt4py_cache/... (first run's cached programs) +# run2/.gt4py_cache/... (second run's cached programs) +# diffs//.diff (only on mismatch) +# report.txt (human-readable summary) +# +# Only `dace` codegen is checked (`internal` doesn't go through dace), +# so the codegen parameter is dropped from these sessions' signatures. + +DACE_DETERMINISM_WORKDIR_NAME: Final = "_dace_deterministic_codegen" + + +def _run_dace_determinism_check( + session: nox.Session, + pytest_args: Sequence[str], + *, + layout: Literal["next", "cartesian"], +) -> None: + """Run pytest twice with an isolated cache per run, then verify the + DaCe-generated source files are byte-identical between the two runs. + + The ``layout`` parameter selects which cache mechanism gt4py is using: + + * ``"next"`` — sets ``GT4PY_BUILD_CACHE_DIR=`` so the cache + lands at ``/.gt4py_cache/``, where the comparator walks + ``_/src/{cpu,cuda}/...``. + * ``"cartesian"`` — sets ``GT_CACHE_ROOT=`` plus + ``GT_CACHE_PYTEST_DIR=/gt_cache`` AND passes + ``--keep-gtcache`` to pytest. The conftest in + ``tests/cartesian_tests/conftest.py`` unconditionally + ``shutil.rmtree``\\ s its cache directory at ``pytest_sessionfinish`` + unless that CLI flag is present — that gating is independent of the + env vars, so we need both knobs. The comparator then walks + ``/gt_cache/py_// + /__/`` and compares + ``m_*.py`` + ``bindings.{cpp,cu}`` + ``computation.hpp``. + + On mismatch, calls ``session.error(...)`` with a pointer to the + diffs/ directory and report.txt so the failure is actionable. 
+ """ + workdir = REPO_ROOT / DACE_DETERMINISM_WORKDIR_NAME + if workdir.exists(): + shutil.rmtree(workdir) + + run1_dir = workdir / "run1" + run2_dir = workdir / "run2" + run1_dir.mkdir(parents=True) + run2_dir.mkdir(parents=True) + + # Per-layout knobs: + # - cache_subdir: the subdirectory of run_dir where the cache lands + # - extra_pytest: additional pytest CLI args (cartesian needs + # --keep-gtcache; see conftest in + # tests/cartesian_tests/conftest.py:pytest_sessionfinish) + # - env_for_run: env-var overrides for the pytest subprocess + # + # Setting DACE_compiler_build_folder_mode to `development` is REQUIRED for + # both layouts. gt4py configures dace to `production` mode by default, + # which cleans up the dace build folder after compilation — leaving only + # the compiled .so and stripping the codegen sources we need to diff. + # Forcing `development` keeps `src/...` (next) and `bindings.{cpp,cu}` + + # `computation.hpp` (cartesian) around so the checker has codegen to + # compare. (See src/gt4py/next/program_processors/runners/dace/workflow/ + # common.py for the upstream next-side config this overrides; the + # comment there explicitly documents this env var as the escape hatch.) + if layout == "cartesian": + cache_subdir = "gt_cache" + extra_pytest_args: list[str] = ["--keep-gtcache"] + + def env_for_run(run_dir: pathlib.Path) -> dict[str, str]: + # gt4py.cartesian and gt4py.next have entirely separate caching + # subsystems with separate env vars. cartesian uses + # GT_CACHE_ROOT (the `root_path` for cache_settings) and + # GT_CACHE_PYTEST_DIR (which the conftest writes into + # cache_settings["dir_name"]). Both required to isolate the + # cache per run; --keep-gtcache is required for it to survive + # pytest_sessionfinish. 
+ return { + "GT_CACHE_ROOT": str(run_dir), + "GT_CACHE_PYTEST_DIR": str(run_dir / cache_subdir), + "DACE_compiler_build_folder_mode": "development", + } + else: + cache_subdir = ".gt4py_cache" + extra_pytest_args = [] + + def env_for_run(run_dir: pathlib.Path) -> dict[str, str]: + # gt4py.next appends `.gt4py_cache` to GT4PY_BUILD_CACHE_DIR, so + # we pass the parent directory and the cache lands at + # .gt4py_cache/ underneath. Setting GT4PY_BUILD_CACHE_LIFETIME + # to `persistent` keeps the cache around long enough for the + # snapshot pass to read it. + return { + "GT4PY_BUILD_CACHE_DIR": str(run_dir), + "GT4PY_BUILD_CACHE_LIFETIME": "persistent", + "DACE_compiler_build_folder_mode": "development", + } + + for run_dir in (run1_dir, run2_dir): + session.run( + *pytest_args, + *extra_pytest_args, + *session.posargs, + env=session.env | env_for_run(run_dir), + # The determinism check cares only about whether the DaCe + # codegen lands deterministically in the cache; individual + # test outcomes are irrelevant. Failed tests (exit code 1) + # often reflect runtime issues that have nothing to do with + # codegen — e.g., GPU contention from pytest-xdist workers + # racing for a single CUDA context on Santis, producing + # spurious cupy OutOfMemoryErrors. As long as SOME programs + # got cached, the comparator (called below with + # tolerate_missing=True) extracts the determinism signal from + # whatever overlap is present. + success_codes=[0, 1, NO_TESTS_COLLECTED_EXIT_CODE], + ) + + # Import the comparison library from scripts/. It uses only stdlib, + # so it runs fine in nox's runtime python (no session venv needed). 
+ if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + from scripts.dace_deterministic_codegen import ( + DeterminismError, + NoProgramsObservedError, + NoSourceFilesObservedError, + UnsupportedBackendError, + check_determinism, + ) + + try: + check_determinism( + run1_dir / cache_subdir, + run2_dir / cache_subdir, + diffs_dir=workdir / "diffs", + report_path=workdir / "report.txt", + # Programs cached in only one run are reported but not + # counted as determinism failures — see the success_codes + # note above for why this is the right policy here. + tolerate_missing=True, + layout=layout, + ) + except DeterminismError as e: + session.error(f"{e}\nSee {workdir / 'report.txt'} and {workdir / 'diffs'}/") + except NoProgramsObservedError as e: + session.error(f"{e}\nLikely the pytest selection collected no tests.") + except NoSourceFilesObservedError as e: + session.error(str(e)) + except UnsupportedBackendError as e: + session.error(str(e)) + finally: + # Reclaim disk after the comparison. The two per-run caches are + # ~hundreds of MB each in development mode, and dace's own + # `.dacecache/` at the repo root (used for SDFGs not routed + # through gt4py's build_folder override) is comparably bulky. + # We always keep `workdir/diffs/` and `workdir/report.txt` — + # those are the artifacts a maintainer actually needs to debug + # a determinism failure; the raw caches are reproducible by + # rerunning the session. 
+ for tbd in ( + run1_dir, + run2_dir, + REPO_ROOT / ".dacecache", + ): + if tbd.exists(): + session.log(f"cleanup: removing {tbd}") + shutil.rmtree(tbd, ignore_errors=True) + + +@nox.session(python=PYTHON_VERSIONS, tags=["cartesian", "dace", "determinism"]) +@nox.parametrize("device", [*DeviceNoxParam.values()]) +def test_cartesian_dace_determinism( + session: nox.Session, + device: DeviceOption, +) -> None: + """Run selected 'gt4py.cartesian' DaCe tests twice and verify codegen + is byte-identical between the two runs.""" + + codegen_settings = CodeGenDaceTestSettings["dace"] + device_settings = DeviceTestSettings[device] + extras = [ + "standard", + "testing", + *codegen_settings.get("extras", []), + *device_settings.get("extras", []), + ] + groups = ["test", *codegen_settings.get("groups", []), *device_settings.get("groups", [])] + + install_session_venv(session, extras=extras, groups=groups) + + markers = " and ".join(codegen_settings["markers"] + device_settings["markers"]) + + xdist_workers = os.environ.get( + "GT4PY_CARTESIAN_DETERMINISM_XDIST", + "2" if device == "cpu" else "4", + ) + + _run_dace_determinism_check( + session, + pytest_args=[ + "pytest", + "--cache-clear", + "-sv", + "-n", + xdist_workers, + "--dist", + "loadgroup", + "-m", + f"{markers}", + str(pathlib.Path("tests") / "cartesian_tests"), + ], + layout="cartesian", + ) + + +@nox.session(python=PYTHON_VERSIONS, tags=["next", "dace", "determinism"]) +@nox.parametrize( + "meshlib", + [ + nox.param("nomesh", id="nomesh", tags=["nomesh"]), + nox.param("atlas", id="atlas", tags=["atlas"]), + ], +) +@nox.parametrize("device", [*DeviceNoxParam.values()]) +def test_next_dace_determinism( + session: nox.Session, + device: DeviceOption, + meshlib: Literal["nomesh", "atlas"], +) -> None: + """Run selected 'gt4py.next' DaCe tests twice and verify codegen + is byte-identical between the two runs.""" + + codegen_settings = CodeGenDaceTestSettings["dace"] + device_settings = DeviceTestSettings[device] + 
extras = [ + "standard", + "testing", + *codegen_settings.get("extras", []), + *device_settings.get("extras", []), + ] + groups = ["test", *codegen_settings.get("groups", []), *device_settings.get("groups", [])] + mesh_markers: list[str] = [] + + match meshlib: + case "nomesh": + mesh_markers.append("not requires_atlas") + case "atlas": + mesh_markers.append("requires_atlas") + groups.append("frameworks") + + install_session_venv(session, extras=extras, groups=groups) + + markers = " and ".join(codegen_settings["markers"] + device_settings["markers"] + mesh_markers) + + _run_dace_determinism_check( + session, + pytest_args=[ + *"pytest --cache-clear -sv -n auto --dist loadgroup".split(), + "-m", + f"{markers}", + str(pathlib.Path("tests") / "next_tests"), + ], + layout="next", + ) + + if __name__ == "__main__": nox.main() diff --git a/scripts/dace_deterministic_codegen.py b/scripts/dace_deterministic_codegen.py new file mode 100644 index 0000000000..1f547959d0 --- /dev/null +++ b/scripts/dace_deterministic_codegen.py @@ -0,0 +1,833 @@ +#!/usr/bin/env python3 +# GT4Py - GridTools Framework +# +# Copyright (c) 2014-2024, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause + +"""GT4Py / DaCe codegen determinism check. + +Library + CLI for verifying that gt4py's DaCe backend produces +byte-identical generated source files across two runs of the same +test selection. Used by the ``test_*_determinism`` nox sessions in +``noxfile.py``; also runnable standalone for ad-hoc comparison of +two existing caches. + +Supports both gt4py cache layouts: + +* ``layout="next"`` (default) — the ``gt4py.next`` cache, a flat + ``/_/src/{cpu,cuda}/...`` structure + written via ``GT4PY_BUILD_CACHE_DIR``. Compares everything dace + writes as generated source under each program's ``src/``. 
+ Unknown top-level backends (anything other than cpu/cuda, with + HIP nesting under cuda/hip) raise :class:`UnsupportedBackendError`. + +* ``layout="cartesian"`` — the ``gt4py.cartesian`` cache, a deeply + nested ``/py_///__/...`` structure written via + ``GT_CACHE_ROOT`` + ``GT_CACHE_PYTEST_DIR`` (with the conftest's + ``--keep-gtcache`` flag needed to survive ``pytest_sessionfinish``). + Compares the top-level ``m_*.py`` loader plus ``bindings.{cpp,cu}`` + and ``computation.hpp`` under ``m_*_pyext_BUILD/``. Skips compiled + artifacts (``*.so``, ``*.o``, ``__pycache__/``), gzipped SDFG + archives (``*.sdfgz`` — gzip headers carry timestamps), the + metadata file (``*.cacheinfo``), and the recursive build mirror + directories (``_GT_/``, ``tmp/``) inside ``_pyext_BUILD/``. + +As a library +------------ + +:: + + from scripts.dace_deterministic_codegen import check_determinism + + check_determinism( + cache1=Path(".../run1/.gt4py_cache"), + cache2=Path(".../run2/.gt4py_cache"), + layout="next", # or "cartesian" + diffs_dir=Path(".../diffs"), # optional + report_path=Path(".../report.txt"), # optional + ) + +Raises ``DeterminismError`` on mismatch, ``NoProgramsObservedError`` +if both caches are empty, ``NoSourceFilesObservedError`` if programs +were cached but contain no source files (typically a missing +``DACE_compiler_build_folder_mode=development``), or +``UnsupportedBackendError`` if the next-layout codegen produced an +unfamiliar top-level backend. 
+ +As a CLI +-------- + +:: + + python scripts/dace_deterministic_codegen.py \\ + --run1 path/to/cache1 \\ + --run2 path/to/cache2 \\ + --layout {next,cartesian} \\ + [--diffs-dir DIR] [--report FILE] + +Exit codes: + + 0 codegen is deterministic + 1 codegen differs (see diffs/ and report.txt) + 2 bad arguments / unsupported backend / no source files captured + 3 no programs observed in either cache +""" + +from __future__ import annotations + +import argparse +import dataclasses +import difflib +import hashlib +import re +import sys +from pathlib import Path +from typing import Literal + + +#: Cache layout dispatch tag. ``"next"`` is the gt4py.next cache +#: (flat ``/_/src/...`` structure); ``"cartesian"`` +#: is the gt4py.cartesian cache (deeply nested ``/py_*/ +#: //__/...``). +Layout = Literal["next", "cartesian"] + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +#: GT4Py names each cached program folder ``_``. +PROGRAM_FOLDER_RE = re.compile(r"^(?P.+)_(?P[0-9a-f]{64})$") + +#: The single directory under each program folder we compare. Only ``src/``, +#: nothing else — by design. dace also writes ``include/``, ``sample/``, +#: ``program.sdfg``, source maps under ``map/``, runtime metadata +#: (``dace.conf``, ``*.csv``), and build artifacts under ``build/``. None of +#: those are the codegen surface we care about for this check. +CODEGEN_ROOT = "src" + +#: Backends recognized as direct children of ``src/``. dace lays out +#: codegen as ``src//[/]``: +#: +#: - CPU codegen -> src/cpu/.cpp +#: - CUDA codegen -> src/cuda/.cu +#: - HIP codegen -> src/cuda/hip/.cpp (NOTE: under cuda/) +#: +#: HIP is dispatched by dace's CUDA target with ``target_type="hip"``, so +#: it lands as a *subdirectory* of ``src/cuda/``, not its own top-level +#: backend folder. 
That means {cpu, cuda} as a top-level allowlist is +#: enough to cover all three: cpu via ``cpu/``, cuda + hip both via +#: ``cuda/`` (with ``rglob`` picking up the nested hip files). +#: +#: If a snapshot ever encounters another top-level backend (mpi, sve, +#: mlir, snitch, ...), the checker fails loudly rather than silently +#: ignoring — those would need explicit support added here. +SUPPORTED_BACKENDS: frozenset[str] = frozenset({"cpu", "cuda"}) + + +# Cartesian layout constants ------------------------------------------------ + +#: Suffix that marks the per-stencil build directory inside a cartesian +#: program folder, e.g. ``m_TestCopy_dacecpu_4__dacecpu_a8441f26b4_pyext_BUILD/``. +#: Inside that directory we look at the TOP LEVEL only — its ``_GT_/`` and +#: ``tmp/`` subdirectories contain recursive copies of the build path that +#: setuptools spawns when building into an absolute prefix, and those are +#: build artifacts, not codegen output. +CARTESIAN_BUILD_DIR_SUFFIX = "_pyext_BUILD" + +#: Names of files inside ``m_*_pyext_BUILD/`` whose contents we byte-compare. +#: ``bindings.{cpp,cu}`` is gt4py.cartesian's pybind11 wrapper around the +#: dace SDFG; ``computation.hpp`` is dace's generated kernel implementation. +#: Both reflect the codegen surface directly — a non-deterministic codegen +#: pass will show up here. +CARTESIAN_BUILD_SOURCE_NAMES: frozenset[str] = frozenset( + {"bindings.cpp", "bindings.cu", "computation.hpp"} +) + +#: Directory-name prefixes inside a program folder that we MUST NOT descend +#: into when searching for ``m_*.py`` loader stubs. ``__pycache__`` is +#: Python's bytecode cache; the build dir holds compiler-generated artifacts. +CARTESIAN_SKIP_DIRS: frozenset[str] = frozenset({"__pycache__"}) + +#: The 10-hex codegen digest gt4py.cartesian embeds in filenames like +#: ``m______.py`` and the build +#: directory ``m_...__pyext_BUILD/``. 
We replace it with the +#: literal ```` in the snapshot's relpath keys so that +#: ``bindings.cpp`` from run1 (digest ``a8441f26b4``) and from run2 +#: (digest ``bbbbbbbbbb``) map to the same path, surfacing a real +#: content diff rather than two "only-in-one-run" entries that look +#: like flaky test selection. +#: +#: The pattern matches ``_`` + 10 lowercase hex + a boundary that is +#: either a file extension (``.py``, ``.so``, ``.sdfgz``, ``.cacheinfo``) +#: or the literal ``_pyext_BUILD`` suffix. Anchoring on those endings +#: avoids false-positive matches inside arbitrary identifiers. +CARTESIAN_DIGEST_RE = re.compile(r"_(?P[0-9a-f]{10})(?=(\.|_pyext_BUILD))") + + +def _normalize_cartesian_relpath(relpath: str) -> str: + """Replace the 10-hex codegen digest in a cartesian relpath with the + literal token ````. Idempotent. Leaves non-matching paths + unchanged.""" + return CARTESIAN_DIGEST_RE.sub("_", relpath) + + +# --------------------------------------------------------------------------- +# Exceptions +# --------------------------------------------------------------------------- + + +class UnsupportedBackendError(RuntimeError): + """A program's ``src/`` contained a top-level backend other than cpu/cuda.""" + + +class NoProgramsObservedError(RuntimeError): + """Neither cache contained any cached program folders.""" + + +class NoSourceFilesObservedError(RuntimeError): + """Programs were observed in the caches but none contained any source files. + + Almost always means dace's build folder mode was left at ``production``, + which strips the generated ``src/`` tree after compilation. Set + ``DACE_compiler_build_folder_mode=development`` (lowercase matters) + before running the tests so the codegen surface survives into the + cache and there's actually something to compare. + """ + + +class DeterminismError(RuntimeError): + """Two snapshots compared non-identical. 
``.results`` carries the details.""" + + def __init__(self, message: str, results: list[ProgramResult]) -> None: + super().__init__(message) + self.results = results + + +# --------------------------------------------------------------------------- +# Snapshot +# --------------------------------------------------------------------------- + + +@dataclasses.dataclass(frozen=True) +class FileEntry: + #: Logical relative path used as the comparison key. For ``layout="next"`` + #: this is the on-disk path under the program folder verbatim. For + #: ``layout="cartesian"`` the 10-hex codegen digest in filenames is + #: replaced with the literal token ```` so equivalent files + #: across two runs (which carry different digests) still pair up. + relpath: str + sha256: str + #: The actual filename on disk relative to the program folder. Equal to + #: ``relpath`` for next; un-normalized (real digest preserved) for + #: cartesian. Used by :func:`write_diffs` to read the file back. + disk_relpath: str + + +@dataclasses.dataclass +class ProgramSnapshot: + name: str + folder: Path + files: dict[str, FileEntry] + + +def snapshot_run(cache_root: Path, *, layout: Layout = "next") -> dict[str, ProgramSnapshot]: + """Walk a gt4py build cache and snapshot every program's generated source. + + Dispatches on ``layout`` to either :func:`_snapshot_run_next` (the flat + ``/_/src/...`` structure of gt4py.next) or + :func:`_snapshot_run_cartesian` (the deeply nested + ``/py_*///__/...`` + structure of gt4py.cartesian). + + Returns an empty dict (rather than raising) when the path doesn't + exist or contains no programs in the expected layout; callers can + pair the empty result with :func:`_diagnose_empty_cache` for a + human-readable explanation of why. 
+ """ + if layout == "next": + return _snapshot_run_next(cache_root) + if layout == "cartesian": + return _snapshot_run_cartesian(cache_root) + raise ValueError(f"unknown layout: {layout!r}, expected 'next' or 'cartesian'") + + +def _snapshot_run_next(cache_root: Path) -> dict[str, ProgramSnapshot]: + """Snapshot a gt4py.next-layout cache. + + The input directory's name is irrelevant — the function looks for + immediate subdirectories matching ``_<64-char-hex-digest>`` + (gt4py.next's program-folder naming) and reads ``/src/`` + recursively under each one. HIP files at ``src/cuda/hip/`` are + picked up automatically by the recursive walk. + + Raises :class:`UnsupportedBackendError` if any program's ``src/`` + contains a top-level backend not in :data:`SUPPORTED_BACKENDS`. + """ + if not cache_root.is_dir(): + return {} + + out: dict[str, ProgramSnapshot] = {} + for folder in sorted(p for p in cache_root.iterdir() if p.is_dir()): + m = PROGRAM_FOLDER_RE.match(folder.name) + if not m: + continue + name = m.group("name") + + src_root = folder / CODEGEN_ROOT + if not src_root.is_dir(): + # No src/ at all — record an empty snapshot. Pairing logic + # downstream will flag it if its counterpart has files. + out[name] = ProgramSnapshot(name=name, folder=folder, files={}) + continue + + # Backend check: every direct child of src/ must be a supported + # top-level backend. HIP lives nested under cuda/, so cuda is + # what matters here, not "hip". + for bd in sorted(d for d in src_root.iterdir() if d.is_dir()): + if bd.name not in SUPPORTED_BACKENDS: + raise UnsupportedBackendError( + f"unsupported dace backend `{bd.name}/` found under " + f"{src_root} — this checker currently supports " + f"{sorted(SUPPORTED_BACKENDS)} as top-level backends " + f"(HIP is handled under `cuda/hip/`). Add explicit " + f"support in scripts/dace_deterministic_codegen.py " + f"before running this selection." 
+ ) + + # rglob recursively descends — picks up `cuda/hip/` along + # with `cpu/` and `cuda/`, no special-casing needed. + files: dict[str, FileEntry] = {} + for fpath in sorted(src_root.rglob("*")): + if not fpath.is_file(): + continue + rel = fpath.relative_to(folder).as_posix() + files[rel] = FileEntry(relpath=rel, sha256=_sha256(fpath), disk_relpath=rel) + out[name] = ProgramSnapshot(name=name, folder=folder, files=files) + return out + + +def _snapshot_run_cartesian(cache_root: Path) -> dict[str, ProgramSnapshot]: + """Snapshot a gt4py.cartesian-layout cache. + + Program identity is the **relative path** from ``cache_root`` to the + ``__`` folder, e.g. + ``py310_1013/dacecpu/cartesian_tests/integration_tests/multi_feature_tests + /test_suites/TestCopy_dacecpu_4``. Two runs of the same parametrized + test should produce the same relative path, so this works as a stable + matching key across runs. + + Files compared per program (everything else is skipped — see module + docstring for rationale): + + * ``m_*.py`` at the top of the program folder — the gt4py loader + stub. Its filename embeds the 10-hex codegen digest, and the file + body references it; either changing is a determinism signal. + * Files exactly one level inside ``m_*_pyext_BUILD/`` whose basename + is in :data:`CARTESIAN_BUILD_SOURCE_NAMES` (``bindings.cpp``, + ``bindings.cu``, ``computation.hpp``). The ``_GT_/`` and ``tmp/`` + subdirectories of the build dir are recursive build-path mirrors + that setuptools creates when targeting an absolute prefix — they + contain object files and duplicated outputs, not codegen. + """ + if not cache_root.is_dir(): + return {} + + # Discover program folders by finding every top-level `m_*.py` loader. + # "Top-level" here means: not under __pycache__ and not under any + # *_pyext_BUILD directory (which contains its own copies of generated + # files we don't want). 
def _sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 64 KiB chunks so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter(callable, sentinel) keeps calling read() until it yields b"" (EOF).
        for chunk in iter(lambda: stream.read(1 << 16), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _sample_subdir_names(subdirs: list[Path]) -> tuple[list[str], str]:
    """Return the first three directory names plus an "(and N more)" suffix.

    The suffix is the empty string when there are three or fewer entries.
    """
    sample_names = [p.name for p in subdirs[:3]]
    suffix = f" (and {len(subdirs) - 3} more)" if len(subdirs) > 3 else ""
    return sample_names, suffix


def _diagnose_empty_cache(cache_root: Path, *, layout: Layout = "next") -> str:
    """Return a one-line explanation of why :func:`snapshot_run` found nothing.

    Parameters
    ----------
    cache_root
        The path that was handed to the snapshot walker.
    layout
        ``"cartesian"`` selects the cartesian diagnostic; any other value
        falls through to the next-layout diagnostic.

    Returns
    -------
    str
        A human-readable reason, meant to be embedded in an error message.
    """
    if not cache_root.exists():
        return "path does not exist"
    if not cache_root.is_dir():
        return "path exists but is not a directory"

    # Both layouts start from the same question: what directories are here?
    subdirs = sorted(p for p in cache_root.iterdir() if p.is_dir())
    if not subdirs:
        return "directory has no subdirectories"
    sample_names, suffix = _sample_subdir_names(subdirs)

    if layout == "cartesian":
        # A cartesian cache root is expected to hold per-Python-version
        # directories named `py<digits>_<digits>`. If none is present the
        # caller most likely passed the wrong path (e.g. the run_dir instead
        # of run_dir/gt_cache).
        if not any(re.match(r"^py\d+_\d+$", p.name) for p in subdirs):
            return (
                f"directory contains {len(subdirs)} subdirectory(ies) but none "
                f"match cartesian's per-Python-version pattern `py_/` "
                f"(saw: {sample_names}{suffix}). Did you pass the cache root, "
                f"or its parent?"
            )
        # The version directory exists, but no m_*.py loader stub was found.
        return (
            "cartesian cache structure present but contains no `m_*.py` loader "
            "stubs at any depth — pytest probably collected zero tests, or the "
            "conftest's `--keep-gtcache` flag wasn't passed and the cache was "
            "wiped at session teardown."
        )

    # Next-layout diagnostic.
    if not any(PROGRAM_FOLDER_RE.match(p.name) for p in subdirs):
        return (
            f"directory contains {len(subdirs)} subdirectory(ies) but none "
            f"match the program-folder pattern `_<64-char-hex-digest>/` "
            f"(saw: {sample_names}{suffix}). If one of those names is "
            f"`.gt4py_cache`, you've passed the parent of the cache by mistake."
        )
    return "directory has matching subdirectories but they were filtered out"


# ---------------------------------------------------------------------------
# Compare
# ---------------------------------------------------------------------------


@dataclasses.dataclass
class ProgramResult:
    """Outcome of comparing one program across the two runs."""

    # Program identifier shared by both snapshots.
    name: str
    # True iff both runs hold the program with identical file sets and hashes.
    match: bool
    # Files present on both sides whose SHA-256 differs.
    differing_files: list[str]
    # Files seen only in run 1 / only in run 2.
    only_in_run1: list[str]
    only_in_run2: list[str]
    # Whether the program itself was present in run 1 / run 2. ``None`` means
    # "not recorded" (a result built without these fields); in that case
    # ``missing_on_one_side`` falls back to inferring presence from the
    # ``only_in_runN`` lists.
    in_run1: bool | None = None
    in_run2: bool | None = None

    @property
    def missing_on_one_side(self) -> bool:
        """True iff the program was cached in only one of the two runs.

        This is distinct from "differs by content" (the program is in both
        runs but a file's bytes differ, or one side carries an extra file).
        A content difference is always a determinism failure, whereas a
        wholly missing program can be tolerated via ``tolerate_missing``.

        Presence is taken from ``in_run1`` / ``in_run2`` when recorded. The
        ``only_in_runN`` lists alone cannot separate the two situations: a
        program present in both runs with one extra file produces exactly the
        same "one list empty, one not" shape as a missing program, and a
        missing program with no files produces two empty lists.
        """
        if self.match or self.differing_files:
            return False
        if self.in_run1 is not None and self.in_run2 is not None:
            return self.in_run1 != self.in_run2
        # Presence not recorded: best-effort inference from the file lists.
        return bool(self.only_in_run1) != bool(self.only_in_run2)


def compare(
    snap1: dict[str, ProgramSnapshot],
    snap2: dict[str, ProgramSnapshot],
) -> list[ProgramResult]:
    """Pair programs by name across the two snapshots and diff their files.

    Parameters
    ----------
    snap1, snap2
        Mappings from program name to snapshot. Only each snapshot's
        ``files`` mapping and the ``sha256`` of its entries are read here.

    Returns
    -------
    list[ProgramResult]
        One result per program name found in either snapshot, sorted by name.
    """
    results: list[ProgramResult] = []
    for name in sorted(set(snap1) | set(snap2)):
        first = snap1.get(name)
        second = snap2.get(name)

        if first is None or second is None:
            # Program cached in only one run: list every file of the side
            # that has it, and record which side that is.
            results.append(
                ProgramResult(
                    name=name,
                    match=False,
                    differing_files=[],
                    only_in_run1=sorted(first.files) if first is not None else [],
                    only_in_run2=sorted(second.files) if second is not None else [],
                    in_run1=first is not None,
                    in_run2=second is not None,
                )
            )
            continue

        keys1, keys2 = set(first.files), set(second.files)
        only1 = sorted(keys1 - keys2)
        only2 = sorted(keys2 - keys1)
        differing = sorted(
            rel for rel in keys1 & keys2 if first.files[rel].sha256 != second.files[rel].sha256
        )
        results.append(
            ProgramResult(
                name=name,
                match=not (differing or only1 or only2),
                differing_files=differing,
                only_in_run1=only1,
                only_in_run2=only2,
                in_run1=True,
                in_run2=True,
            )
        )
    return results


def _differs_by_content(result: ProgramResult) -> bool:
    """True iff ``result`` is a real mismatch rather than a missing program."""
    return bool(result.differing_files) or (
        not result.match and not result.missing_on_one_side
    )


# ---------------------------------------------------------------------------
# Diff + report
# ---------------------------------------------------------------------------


def write_diffs(
    results: list[ProgramResult],
    snap1: dict[str, ProgramSnapshot],
    snap2: dict[str, ProgramSnapshot],
    diffs_dir: Path,
) -> None:
    """Emit a unified diff per differing file under ``diffs_dir/<program name>/``.

    Only entries of ``differing_files`` are diffed; files that exist on one
    side only are not written here. A file that cannot be decoded as UTF-8
    gets a ``.binary-differs`` marker instead of a diff. Directories are
    created lazily, so matching programs leave no trace.
    """
    for result in results:
        if result.match:
            continue
        s1, s2 = snap1.get(result.name), snap2.get(result.name)
        prog_dir = diffs_dir / result.name
        for rel in result.differing_files:
            # `rel` is the canonical (normalized) key; the on-disk filename
            # may differ from it, so the real path is recovered from each
            # side's FileEntry.
            e1 = s1.files.get(rel) if s1 else None
            e2 = s2.files.get(rel) if s2 else None
            f1 = (s1.folder / e1.disk_relpath) if (s1 and e1) else None
            f2 = (s2.folder / e2.disk_relpath) if (s2 and e2) else None
            if not (f1 and f2 and f1.exists() and f2.exists()):
                continue

            stem = rel.replace("/", "__")
            try:
                # Explicit encoding: the locale default would make the
                # text/binary decision depend on the machine running the check.
                t1 = f1.read_text(encoding="utf-8").splitlines(keepends=True)
                t2 = f2.read_text(encoding="utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                prog_dir.mkdir(parents=True, exist_ok=True)
                (prog_dir / f"{stem}.binary-differs").write_text(
                    f"binary content differs:\n run1: {f1}\n run2: {f2}\n",
                    encoding="utf-8",
                )
                continue

            udiff = "".join(
                difflib.unified_diff(
                    t1,
                    t2,
                    fromfile=f"run1/{rel}",
                    tofile=f"run2/{rel}",
                    n=3,
                )
            )
            prog_dir.mkdir(parents=True, exist_ok=True)
            (prog_dir / f"{stem}.diff").write_text(udiff, encoding="utf-8")


def render_report(results: list[ProgramResult], *, tolerate_missing: bool = False) -> str:
    """Render a human-readable summary of ``results``.

    The report has a header line with the totals, one tagged line per program
    (``MATCH``, ``ONE-OF`` for a program cached in only one run, ``DIFFER``
    otherwise) followed by its file-level details, and a final ``RESULT:``
    verdict.

    Parameters
    ----------
    results
        Comparison results, typically from :func:`compare`.
    tolerate_missing
        When true, programs cached in only one run do not turn the verdict
        into a failure as long as nothing differs by content.

    Returns
    -------
    str
        The report text, terminated by a newline.
    """
    n_total = len(results)
    n_missing = sum(1 for r in results if r.missing_on_one_side)
    n_diff_content = sum(1 for r in results if _differs_by_content(r))
    n_match = n_total - n_missing - n_diff_content

    header = (
        f"Programs: {n_total} matches: {n_match} "
        f"differs: {n_diff_content} only-in-one-run: {n_missing}"
    )
    lines = [header, ""]
    for r in results:
        if r.match:
            tag = "MATCH "
        elif r.missing_on_one_side:
            tag = "ONE-OF"
        else:
            tag = "DIFFER"
        lines.append(f" [{tag}] {r.name}")
        if not r.match:
            lines.extend(f" differs: {rel}" for rel in r.differing_files)
            lines.extend(f" only in run1: {rel}" for rel in r.only_in_run1)
            lines.extend(f" only in run2: {rel}" for rel in r.only_in_run2)

    lines.append("")
    if n_total == 0:
        lines.append("RESULT: no programs observed (nothing was cached).")
    elif n_diff_content == 0 and n_missing == 0:
        lines.append(f"RESULT: codegen deterministic — {n_match} program(s) match.")
    elif n_diff_content == 0 and tolerate_missing:
        lines.append(
            f"RESULT: codegen deterministic across the {n_match} shared program(s); "
            f"{n_missing} program(s) cached in only one run (tolerated)."
        )
    else:
        suffix = f" (plus {n_missing} cached in only one run)" if n_missing else ""
        lines.append(
            f"RESULT: NON-DETERMINISTIC CODEGEN — {n_diff_content}/{n_total} "
            f"program(s) differ by content{suffix}."
        )
    return "\n".join(lines) + "\n"


# ---------------------------------------------------------------------------
# Library entry point
# ---------------------------------------------------------------------------


def check_determinism(
    cache1: Path,
    cache2: Path,
    *,
    diffs_dir: Path | None = None,
    report_path: Path | None = None,
    tolerate_missing: bool = True,
    layout: Layout = "next",
) -> list[ProgramResult]:
    """Compare two gt4py caches; write artifacts; raise on mismatch.

    Snapshots both caches with :func:`snapshot_run` (passing ``layout``
    through) and diffs them with :func:`compare`. The optional artifacts are
    written *before* any error is raised, so they are available for failed
    runs too.

    Parameters
    ----------
    cache1, cache2
        Roots of the two caches to compare.
    diffs_dir
        If set, per-file unified diffs are written below this directory.
    report_path
        If set, the human-readable summary is written to this file (UTF-8);
        missing parent directories are created.
    tolerate_missing
        When true (the default), a program cached in only one run is not a
        failure; only content differences raise. When false, both do.
    layout
        Cache layout, forwarded to the snapshot walker and to the empty-cache
        diagnostic.

    Returns
    -------
    list[ProgramResult]
        The comparison results, when no failure was detected.

    Raises
    ------
    NoProgramsObservedError
        Neither cache yielded any program.
    NoSourceFilesObservedError
        Programs were found but none of them carries a single source file,
        so there is nothing to compare.
    DeterminismError
        At least one program counts as failed under the selected mode.
    UnsupportedBackendError
        Not raised here directly; presumably propagated from
        :func:`snapshot_run` (the CLI handles it around this call).
    """
    snap1 = snapshot_run(cache1, layout=layout)
    snap2 = snapshot_run(cache2, layout=layout)
    results = compare(snap1, snap2)

    if diffs_dir is not None:
        write_diffs(results, snap1, snap2, diffs_dir)
    if report_path is not None:
        report_path.parent.mkdir(parents=True, exist_ok=True)
        # The report contains non-ASCII punctuation; do not depend on the
        # locale's default encoding.
        report_path.write_text(
            render_report(results, tolerate_missing=tolerate_missing),
            encoding="utf-8",
        )

    if not results:
        diag1 = _diagnose_empty_cache(cache1, layout=layout)
        diag2 = _diagnose_empty_cache(cache2, layout=layout)
        raise NoProgramsObservedError(
            "no programs observed in either cache:\n"
            f" run1 ({cache1}): {diag1}\n"
            f" run2 ({cache2}): {diag2}"
        )

    # Safety net for the silent-false-positive case: both runs cached programs
    # but every program's file set is empty. Without this check the comparator
    # would see {} == {} for every program and report success despite there
    # being nothing to compare.
    total_files = sum(len(s.files) for s in snap1.values()) + sum(
        len(s.files) for s in snap2.values()
    )
    if total_files == 0:
        if layout == "cartesian":
            hint = (
                "programs were cached but contain none of `m_*.py`, "
                "`bindings.{cpp,cu}`, or `computation.hpp`"
            )
        else:
            hint = "none of them contain any source files under src/"
        raise NoSourceFilesObservedError(
            f"{len(results)} program(s) cached, but {hint}. This almost "
            f"always means dace's build folder mode is `production` rather "
            f"than `development`, which strips the codegen output after "
            f"compilation. Set DACE_compiler_build_folder_mode=development "
            f"(lowercase matters) before running the tests so the codegen "
            f"survives into the cache."
        )

    # True differs: the program is in both runs and its content differs.
    # Missing: the program is in only one run. Under tolerate_missing only
    # true differs fail the check; in strict mode both do.
    n_true_differs = sum(1 for r in results if _differs_by_content(r))
    n_missing = sum(1 for r in results if r.missing_on_one_side)
    n_failed = n_true_differs if tolerate_missing else (n_true_differs + n_missing)

    if n_failed > 0:
        if tolerate_missing:
            msg = (
                f"DaCe codegen is non-deterministic: {n_true_differs}/{len(results)} "
                f"program(s) differ by content (plus {n_missing} cached in only one "
                f"run, ignored under tolerate_missing)"
            )
        else:
            msg = f"DaCe codegen is non-deterministic: {n_failed}/{len(results)} program(s) differ"
        raise DeterminismError(msg, results)
    return results


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command line.

    Parameters
    ----------
    argv
        Argument list without the program name; ``None`` lets argparse read
        ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        With attributes ``run1``, ``run2``, ``layout``, ``diffs_dir``,
        ``report`` and ``tolerate_missing``.
    """
    p = argparse.ArgumentParser(
        prog="dace_deterministic_codegen",
        description=(
            "Compare two gt4py build caches and check whether the DaCe "
            "generated source files are byte-identical between them."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument(
        "--run1",
        required=True,
        type=Path,
        metavar="PATH",
        help=(
            "Path to the first cache root. For --layout next, this is the "
            ".gt4py_cache/ directory. For --layout cartesian, this is the "
            "directory that GT_CACHE_PYTEST_DIR pointed to (the parent of "
            "py_/)."
        ),
    )
    p.add_argument(
        "--run2",
        required=True,
        type=Path,
        metavar="PATH",
        help="Path to the second cache root. Same conventions as --run1.",
    )
    p.add_argument(
        "--layout",
        choices=["next", "cartesian"],
        default="next",
        help=(
            "Cache layout. `next` is gt4py.next's flat "
            "/_/src/... structure (default). `cartesian` "
            "is gt4py.cartesian's deeply nested "
            "/py_///__/ "
            "structure."
        ),
    )
    p.add_argument(
        "--diffs-dir",
        type=Path,
        default=None,
        metavar="PATH",
        help="If set, write per-file unified diffs to this directory.",
    )
    p.add_argument(
        "--report",
        type=Path,
        default=None,
        metavar="PATH",
        help="If set, write the human-readable summary report to this file.",
    )
    p.add_argument(
        "--tolerate-missing",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Whether to skip programs cached in only one of the two runs. "
            "Default: lenient — only content differences in shared programs "
            "raise. Pass --no-tolerate-missing for strict mode, where any "
            "program absent from one cache also counts as a determinism "
            "failure."
        ),
    )
    return p.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """Run the comparison from the command line and return the exit status.

    Exit status: ``0`` when the check passes, ``1`` on a determinism failure
    (the report is printed first), ``2`` for an unsupported backend or when
    no source files were found, ``3`` when no programs were found at all.
    """
    args = parse_args(argv)

    try:
        results = check_determinism(
            args.run1.expanduser().resolve(),
            args.run2.expanduser().resolve(),
            diffs_dir=args.diffs_dir.expanduser().resolve() if args.diffs_dir else None,
            report_path=args.report.expanduser().resolve() if args.report else None,
            tolerate_missing=args.tolerate_missing,
            layout=args.layout,
        )
    except (UnsupportedBackendError, NoSourceFilesObservedError) as e:
        print(f"error: {e}", file=sys.stderr)
        return 2
    except NoProgramsObservedError as e:
        print(f"error: {e}", file=sys.stderr)
        return 3
    except DeterminismError as e:
        print(render_report(e.results, tolerate_missing=args.tolerate_missing))
        print(f"error: {e}", file=sys.stderr)
        return 1

    print(render_report(results, tolerate_missing=args.tolerate_missing))
    return 0


if __name__ == "__main__":
    sys.exit(main())