From 25bd1af14bc73687454b58639b06fda03cf0e00e Mon Sep 17 00:00:00 2001
From: Hans Johnson
Date: Thu, 23 Apr 2026 12:14:31 +0000
Subject: [PATCH 1/2] ENH: Cache ExternalData object store across CI runs

Point ExternalData_OBJECT_STORES at a dedicated persistent directory
($(Pipeline.Workspace)/ExternalData on Azure, ${{ runner.temp }}/ExternalData
on GitHub Actions) that is cached separately from ccache. The release
tarball download is removed entirely: on a cold cache, ExternalData
fetches the blobs a build actually needs from the mirrors in-band; on a
warm cache those objects are restored up front and no network access is
needed.

Applies to all 9 CI configurations:

  .github/workflows/arm.yml (3 matrix jobs)
  .github/workflows/pixi.yml (3 matrix jobs)
  Testing/ContinuousIntegration/AzurePipelinesBatch.yml
  Testing/ContinuousIntegration/AzurePipelinesLinux.yml (3 jobs)
  Testing/ContinuousIntegration/AzurePipelinesLinuxPython.yml
  Testing/ContinuousIntegration/AzurePipelinesMacOS.yml
  Testing/ContinuousIntegration/AzurePipelinesMacOSPython.yml
  Testing/ContinuousIntegration/AzurePipelinesWindows.yml
  Testing/ContinuousIntegration/AzurePipelinesWindowsPython.yml

ExternalData blobs are platform-agnostic, so all jobs within a given CI
system share a single cache entry keyed solely on the set of .cid
content links: externaldata-v1-${{ hashFiles('**/*.cid') }} on GitHub
Actions, '"externaldata" | "v1" | **/*.cid' on Azure. Editing any
content link yields a new immutable entry; restore-keys falls through
to the most recent prior cache under the externaldata-v1- prefix, so
blobs fetched ad-hoc from mirrors during a run persist to the next run
under the fallthrough restore-key.

Kept as sibling directories rather than colocated under CCACHE_DIR so
ccache --cleanup does not consider the ExternalData tree, and so
ccache's SHA-pinned cache invalidation does not force redundant
ExternalData cache writes on every commit.

No step needs to branch on the actions/cache cache-hit output.
actions/cache reports cache-hit='true' only for an exact primary-key
match; a restore-keys fallback still reports cache-hit='false' even
though the data was restored. ExternalData checks the object store per
blob before touching the network, so both cases behave correctly
without inspecting cache-hit.
---
 .github/workflows/arm.yml                     | 32 +++++++++++---
 .github/workflows/pixi.yml                    | 32 ++++++++++----
 .../AzurePipelinesBatch.yml                   | 16 ++++---
 .../AzurePipelinesLinux.yml                   | 44 ++++++++++++-------
 .../AzurePipelinesLinuxPython.yml             | 17 ++++---
 .../AzurePipelinesMacOS.yml                   | 16 ++++---
 .../AzurePipelinesMacOSPython.yml             | 17 ++++---
 .../AzurePipelinesWindows.yml                 | 16 ++++---
 .../AzurePipelinesWindowsPython.yml           | 16 ++++---
 9 files changed, 139 insertions(+), 67 deletions(-)

diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml
index c683b3c52de9..d4c2653e7ee0 100644
--- a/.github/workflows/arm.yml
+++ b/.github/workflows/arm.yml
@@ -30,7 +30,6 @@ concurrency:
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 env:
-  ExternalDataVersion: 5.4.5
   CMakeVersion: "4.0.1"
 
 jobs:
@@ -119,6 +118,22 @@ jobs:
           restore-keys: |
             ccache-v4-${{ runner.os }}-${{ matrix.name }}-
 
+      - name: Restore ExternalData object store
+        id: restore-externaldata
+        uses: actions/cache/restore@v5
+        with:
+          # ExternalData blobs are platform-agnostic; share a single cache
+          # entry across every workflow/job/OS in the repo. The hashFiles
+          # digest yields an immutable key per content-link set; restore-keys
+          # falls through to the most recent prior cache under the same prefix.
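+          # Illustrative key shapes (the digest value is hypothetical):
+          #   exact:       externaldata-v1-9f8e7d6c...
+          #   fallthrough: externaldata-v1-  (newest matching entry wins)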
+          path: ${{ runner.temp }}/ExternalData
+          key: externaldata-v1-${{ hashFiles('**/*.cid') }}
+          restore-keys: |
+            externaldata-v1-
+
       - name: Show ccache configuration, stats and maintenance
         shell: bash
         run: |
@@ -144,14 +159,15 @@ jobs:
           cmakeVersion: ~${{ env.CMakeVersion }}
           ninjaVersion: latest
 
-      - name: Download dashboard script and testing data
+      - name: Download dashboard script
         run: |
           set -x
           git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-          curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v${{ env.ExternalDataVersion }}/InsightData-${{ env.ExternalDataVersion }}.tar.gz -O
-          cmake -E tar xfz InsightData-${{ env.ExternalDataVersion }}.tar.gz
-          cmake -E rename InsightToolkit-${{ env.ExternalDataVersion }}/.ExternalData/CID ${{ github.workspace }}/.ExternalData/CID
+
+      - name: Export ExternalData_OBJECT_STORES
+        shell: bash
+        run: |
+          echo "ExternalData_OBJECT_STORES=${{ runner.temp }}/ExternalData" >> "$GITHUB_ENV"
 
       - name: Configure CTest script
         run: |
@@ -189,6 +205,15 @@ jobs:
           path: ${{ runner.temp }}/ccache
           key: ccache-v4-${{ runner.os }}-${{ matrix.name }}-${{ github.sha }}
 
+      # The ExternalData object store is populated by
+      # .github/workflows/populate-externaldata-cache.yml — a dedicated
+      # workflow whose only job is to prefetch every CID and write the
+      # shared cache entry. Consumer workflows (this one, pixi.yml, the
+      # Azure pipelines) are restore-only. Do not reintroduce a Save
+      # step here: races across platforms overwrite the single shared
+      # key with a fraction of the expected blobs and poison every
+      # subsequent restore.
+
       - name: ccache stats
         if: always()
         run: ccache --show-stats
diff --git a/.github/workflows/pixi.yml b/.github/workflows/pixi.yml
index 1734f5a5eb47..e93b8433faad 100644
--- a/.github/workflows/pixi.yml
+++ b/.github/workflows/pixi.yml
@@ -30,7 +30,6 @@ concurrency:
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 env:
-  ExternalDataVersion: 5.4.5
 
 
 jobs:
@@ -74,6 +73,22 @@ jobs:
         restore-keys: |
           ccache-v4-${{ runner.os }}-pixi-cxx-
 
+    - name: Restore ExternalData object store
+      id: restore-externaldata
+      uses: actions/cache/restore@v5
+      with:
+        # ExternalData blobs are platform-agnostic; share a single cache
+        # entry across every workflow/job/OS in the repo. The hashFiles
+        # digest yields an immutable key per content-link set; restore-keys
+        # falls through to the most recent prior cache under the same prefix.
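+        # GitHub evicts least-recently-used cache entries once the repo
+        # exceeds its quota (10 GB by default); one shared ExternalData
+        # entry keeps that pressure low.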
+        path: ${{ runner.temp }}/ExternalData
+        key: externaldata-v1-${{ hashFiles('**/*.cid') }}
+        restore-keys: |
+          externaldata-v1-
+
     - name: Show ccache configuration, stats and maintenance
       shell: bash
       run: |
@@ -116,11 +131,10 @@ jobs:
     # TIME_REPORT:   overall |       526 seconds |  31GiB |
     # TIME_REPORT: == find ====================================================
 
-    - name: Download testing data
+    - name: Export ExternalData_OBJECT_STORES
+      shell: bash
       run: |
-        curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v${{ env.ExternalDataVersion }}/InsightData-${{ env.ExternalDataVersion }}.tar.gz -O
-        cmake -E tar xfz InsightData-${{ env.ExternalDataVersion }}.tar.gz
-        cmake -E rename InsightToolkit-${{ env.ExternalDataVersion }}/.ExternalData/CID ${{ github.workspace }}/.ExternalData/CID
+        echo "ExternalData_OBJECT_STORES=${{ runner.temp }}/ExternalData" >> "$GITHUB_ENV"
 
     - name: Set up Pixi
       uses: prefix-dev/setup-pixi@v0.8.1
@@ -142,10 +156,6 @@ jobs:
         # Remove object files and static libraries (not needed for testing)
         find build -type f -name "*.o" -delete
         find build -type f -name "*.a" -delete
-        # Remove downloaded data tarballs (already extracted)
-        rm -f InsightData-*.tar.gz
-        # Remove extracted source tarball directory
-        rm -rf InsightToolkit-${{ env.ExternalDataVersion }}
         # Trim ccache to stay within CCACHE_MAXSIZE and remove orphaned entries
         ccache --cleanup 2>/dev/null || true
         echo "****** df -h / -- post cleanup"
@@ -169,6 +179,13 @@ jobs:
         path: ${{ runner.temp }}/ccache
         key: ccache-v4-${{ runner.os }}-pixi-cxx-${{ github.sha }}
 
+    - name: Save ExternalData object store
+      if: ${{ !cancelled() }}
+      uses: actions/cache/save@v5
+      with:
+        path: ${{ runner.temp }}/ExternalData
+        key: externaldata-v1-${{ hashFiles('**/*.cid') }}
+
     - name: ccache stats
       if: always()
       run: ccache --show-stats
diff --git a/Testing/ContinuousIntegration/AzurePipelinesBatch.yml b/Testing/ContinuousIntegration/AzurePipelinesBatch.yml
index da4d120e0f4b..356a904387f5 100644
--- a/Testing/ContinuousIntegration/AzurePipelinesBatch.yml
+++ b/Testing/ContinuousIntegration/AzurePipelinesBatch.yml
@@ -11,7 +11,7 @@ trigger:
 pr: none
 
 variables:
-  ExternalDataVersion: 5.4.5
+  ExternalData_OBJECT_STORES: $(Pipeline.Workspace)/ExternalData
   CCACHE_DIR: $(Pipeline.Workspace)/.ccache
   CCACHE_BASEDIR: $(Build.SourcesDirectory)
   CCACHE_COMPILERCHECK: content
@@ -61,12 +61,16 @@ jobs:
 
   - script: |
       git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: 'Download dashboard script and testing data'
+    displayName: 'Download dashboard script'
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
 
   - task: Cache@2
     inputs:
diff --git a/Testing/ContinuousIntegration/AzurePipelinesLinux.yml b/Testing/ContinuousIntegration/AzurePipelinesLinux.yml
index ea4a16a8e368..5626176f0f40 100644
--- a/Testing/ContinuousIntegration/AzurePipelinesLinux.yml
+++ b/Testing/ContinuousIntegration/AzurePipelinesLinux.yml
@@ -27,7 +27,7 @@ pr:
     - Utilities/Maintenance/*
     - Modules/Remote/*.remote.cmake
 variables:
-  ExternalDataVersion: 5.4.5
+  ExternalData_OBJECT_STORES: $(Pipeline.Workspace)/ExternalData
   CCACHE_DIR: $(Pipeline.Workspace)/.ccache
   CCACHE_BASEDIR: $(Build.SourcesDirectory)
   CCACHE_COMPILERCHECK: content
@@ -67,12 +67,16 @@ jobs:
 
   - bash: |
       set -x
      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: 'Download dashboard script and testing data'
+    displayName: 'Download dashboard script'
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
 
   - task: Cache@2
     inputs:
@@ -168,12 +172,16 @@ jobs:
 
   - bash: |
       set -x
      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: 'Download dashboard script and testing data'
+    displayName: 'Download dashboard script'
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
 
   - task: Cache@2
     inputs:
@@ -273,11 +281,17 @@ jobs:
   - bash: |
       set -x
      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: "Download dashboard script and testing data"
+    displayName: "Download dashboard script"
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
+
   - task: Cache@2
     inputs:
       key: '"ccache-v4" | "$(Agent.OS)" | "LinuxCxx20" | "$(Build.SourceVersion)"'
diff --git a/Testing/ContinuousIntegration/AzurePipelinesLinuxPython.yml b/Testing/ContinuousIntegration/AzurePipelinesLinuxPython.yml
index 9a413c55c1b2..0b59426c8ae3 100644
--- a/Testing/ContinuousIntegration/AzurePipelinesLinuxPython.yml
+++ b/Testing/ContinuousIntegration/AzurePipelinesLinuxPython.yml
@@ -27,7 +27,7 @@ pr:
     - Utilities/Maintenance/*
     - Modules/Remote/*.remote.cmake
 variables:
-  ExternalDataVersion: 5.4.5
+  ExternalData_OBJECT_STORES: $(Pipeline.Workspace)/ExternalData
   CCACHE_DIR: $(Pipeline.Workspace)/.ccache
   CCACHE_BASEDIR: $(Build.SourcesDirectory)
   CCACHE_COMPILERCHECK: content
@@ -67,14 +67,18 @@ jobs:
 
   - bash: |
       set -x
-      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
+      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: 'Download dashboard script and testing data'
+    displayName: 'Download dashboard script'
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
 
   - task: Cache@2
     inputs:
diff --git a/Testing/ContinuousIntegration/AzurePipelinesMacOS.yml b/Testing/ContinuousIntegration/AzurePipelinesMacOS.yml
index 632fe2ebcbd8..071b7aedfe5e 100644
--- a/Testing/ContinuousIntegration/AzurePipelinesMacOS.yml
+++ b/Testing/ContinuousIntegration/AzurePipelinesMacOS.yml
@@ -27,7 +27,7 @@ pr:
     - Utilities/Maintenance/*
     - Modules/Remote/*.remote.cmake
 variables:
-  ExternalDataVersion: 5.4.5
+  ExternalData_OBJECT_STORES: $(Pipeline.Workspace)/ExternalData
   CCACHE_DIR: $(Pipeline.Workspace)/.ccache
   CCACHE_BASEDIR: $(Build.SourcesDirectory)
   CCACHE_COMPILERCHECK: content
@@ -74,12 +74,16 @@ jobs:
 
   - bash: |
      set -x
      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: 'Download dashboard script and testing data'
+    displayName: 'Download dashboard script'
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
 
   - task: Cache@2
     inputs:
diff --git a/Testing/ContinuousIntegration/AzurePipelinesMacOSPython.yml b/Testing/ContinuousIntegration/AzurePipelinesMacOSPython.yml
index b24c265fcf1e..8c6e71a5b9c2 100644
--- a/Testing/ContinuousIntegration/AzurePipelinesMacOSPython.yml
+++ b/Testing/ContinuousIntegration/AzurePipelinesMacOSPython.yml
@@ -27,7 +27,7 @@ pr:
     - Utilities/Maintenance/*
     - Modules/Remote/*.remote.cmake
 variables:
-  ExternalDataVersion: 5.4.5
+  ExternalData_OBJECT_STORES: $(Pipeline.Workspace)/ExternalData
   CCACHE_DIR: $(Pipeline.Workspace)/.ccache
   CCACHE_BASEDIR: $(Build.SourcesDirectory)
   CCACHE_COMPILERCHECK: content
@@ -75,14 +75,20 @@ jobs:
 
   - bash: |
      set -x
-      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
+      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: 'Download dashboard script and testing data'
+    displayName: 'Download dashboard script'
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
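+      # NOTE: unlike the restore-only actions/cache/restore steps on GHA,
+      # Cache@2 also saves this path in a post-job step when the primary
+      # key missed, so Azure jobs repopulate their own cache entries.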
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
 
   - task: Cache@2
     inputs:
diff --git a/Testing/ContinuousIntegration/AzurePipelinesWindows.yml b/Testing/ContinuousIntegration/AzurePipelinesWindows.yml
index cac049e2e15b..ebf198539d04 100644
--- a/Testing/ContinuousIntegration/AzurePipelinesWindows.yml
+++ b/Testing/ContinuousIntegration/AzurePipelinesWindows.yml
@@ -27,7 +27,7 @@ pr:
     - Utilities/Maintenance/*
     - Modules/Remote/*.remote.cmake
 variables:
-  ExternalDataVersion: 5.4.5
+  ExternalData_OBJECT_STORES: $(Pipeline.Workspace)/ExternalData
   CCACHE_DIR: $(Pipeline.Workspace)/.ccache
   CCACHE_BASEDIR: $(Build.SourcesDirectory)
   CCACHE_COMPILERCHECK: content
@@ -62,12 +62,16 @@ jobs:
 
   - script: |
      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: 'Download dashboard script and testing data'
+    displayName: 'Download dashboard script'
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
 
   - task: Cache@2
     inputs:
diff --git a/Testing/ContinuousIntegration/AzurePipelinesWindowsPython.yml b/Testing/ContinuousIntegration/AzurePipelinesWindowsPython.yml
index d55869c6d7c1..b25444603859 100644
--- a/Testing/ContinuousIntegration/AzurePipelinesWindowsPython.yml
+++ b/Testing/ContinuousIntegration/AzurePipelinesWindowsPython.yml
@@ -27,7 +27,7 @@ pr:
     - Utilities/Maintenance/*
     - Modules/Remote/*.remote.cmake
 variables:
-  ExternalDataVersion: 5.4.5
+  ExternalData_OBJECT_STORES: $(Pipeline.Workspace)/ExternalData
   CCACHE_DIR: $(Pipeline.Workspace)/.ccache
   CCACHE_BASEDIR: $(Build.SourcesDirectory)
   CCACHE_COMPILERCHECK: content
@@ -62,12 +62,16 @@ jobs:
 
   - script: |
      git clone -b dashboard --single-branch https://github.com/InsightSoftwareConsortium/ITK.git ITK-dashboard
-
-      curl -L https://github.com/InsightSoftwareConsortium/ITK/releases/download/v$(ExternalDataVersion)/InsightData-$(ExternalDataVersion).tar.gz -O
-      cmake -E tar xfz InsightData-$(ExternalDataVersion).tar.gz
-      cmake -E rename InsightToolkit-$(ExternalDataVersion)/.ExternalData/CID $(Build.SourcesDirectory)/.ExternalData/CID
     workingDirectory: $(Agent.BuildDirectory)
-    displayName: 'Download dashboard script and testing data'
+    displayName: 'Download dashboard script'
+
+  - task: Cache@2
+    inputs:
+      key: '"externaldata" | "v1" | **/*.cid'
+      restoreKeys: |
+        "externaldata" | "v1"
+      path: $(ExternalData_OBJECT_STORES)
+    displayName: 'Restore ExternalData object store'
 
   - task: Cache@2
     inputs:

From 4cbf6cfc75198603d58325a0f4a3bf5ceb7f0213 Mon Sep 17 00:00:00 2001
From: "Hans J. Johnson"
Date: Fri, 24 Apr 2026 13:00:10 -0500
Subject: [PATCH 2/2] ENH: Dedicate populate-externaldata-cache workflow for
 CI prefetch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The GHA ExternalData cache is keyed on hashFiles('**/*.cid'), so the
saved entry should contain an object for every .cid in the tree.
In practice the in-band ExternalData fetch only pulls blobs for the
modules currently selected for compilation and whose tests run; every
other reference stays out of the store and has to be re-downloaded from
the gateways on the next cold boot.

Because the shared key is platform-agnostic, every CI workflow that
wrote it raced to save first. actions/cache/save refuses to overwrite
an existing key, so whichever job finished first won and the other
saves were discarded; the surviving entry held only whatever subset of
blobs that one build happened to fetch. The result observed on an
earlier revision of this PR was a 117 MiB cache entry for a 2.43 GB
corpus (~5%).

Split the writer out into a dedicated workflow:
.github/workflows/populate-externaldata-cache.yml runs on PRs that
touch **/*.cid, on pushes to main and release*, nightly at 05:17 UTC,
and on workflow_dispatch. It is the only workflow that saves the shared
key. After prefetch, a completeness gate counts the unique CIDs
referenced in the tree against the objects on disk and refuses to save
if any are missing, so a partial cache can never be written under the
shared key.

Consumer workflows arm.yml and pixi.yml restore the cache and use it
in-band during the build; they do not save. Azure pipelines retain
their Cache@2 wiring from the previous commit (separate cache
subsystem; follow-up).

Add Utilities/Maintenance/PrefetchCIDContentLinks.py, which walks the
source tree, reads every .cid file, and downloads any missing object
into <store>/cid/<CID> through the same gateway list
CMake/ITKExternalData.cmake uses. The script delegates HTTPS to curl
(available on every supported runner) so TLS verification lives
entirely in the system stack, and uses a ThreadPoolExecutor for
parallel downloads. Idempotent — already-present objects are skipped.
---
 .github/workflows/pixi.yml                    |  13 +-
 .../workflows/populate-externaldata-cache.yml | 102 +++++++++
 .../Maintenance/PrefetchCIDContentLinks.py    | 208 ++++++++++++++++++
 3 files changed, 317 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/populate-externaldata-cache.yml
 create mode 100755 Utilities/Maintenance/PrefetchCIDContentLinks.py

diff --git a/.github/workflows/pixi.yml b/.github/workflows/pixi.yml
index e93b8433faad..3a00c18fb39d 100644
--- a/.github/workflows/pixi.yml
+++ b/.github/workflows/pixi.yml
@@ -179,12 +179,13 @@ jobs:
         path: ${{ runner.temp }}/ccache
         key: ccache-v4-${{ runner.os }}-pixi-cxx-${{ github.sha }}
 
-    - name: Save ExternalData object store
-      if: ${{ !cancelled() }}
-      uses: actions/cache/save@v5
-      with:
-        path: ${{ runner.temp }}/ExternalData
-        key: externaldata-v1-${{ hashFiles('**/*.cid') }}
+    # The ExternalData object store is populated by
+    # .github/workflows/populate-externaldata-cache.yml — a dedicated
+    # workflow whose only job is to prefetch every CID and write the
+    # shared cache entry. Consumer workflows are restore-only. Do not
+    # reintroduce a prefetch-and-save pair here: races across
+    # platforms overwrite the single shared key with a fraction of
+    # the expected blobs and poison every subsequent restore.
 
     - name: ccache stats
       if: always()
       run: ccache --show-stats
diff --git a/.github/workflows/populate-externaldata-cache.yml b/.github/workflows/populate-externaldata-cache.yml
new file mode 100644
index 000000000000..8404b6d97cf3
--- /dev/null
+++ b/.github/workflows/populate-externaldata-cache.yml
@@ -0,0 +1,102 @@
+name: Populate ExternalData Cache
+
+# Single owner of the shared "externaldata-v1-*" GitHub Actions cache
+# entries. Every other workflow restores them but never saves them;
+# see the comments in arm.yml and pixi.yml for the race that a
+# multi-writer design caused.
+#
+# The job prefetches every .cid referenced in the source tree through
+# the same gateway list CMake/ITKExternalData.cmake uses, verifies that
+# all objects landed on disk, and only then saves the cache. If any
+# object is missing the save is skipped so a later run can try again
+# without poisoning the key.
+
+on:
+  # PRs that add or modify .cid references produce a new hashFiles
+  # digest, so the cache needs to be repopulated for that digest.
+  pull_request:
+    paths:
+      - '**/*.cid'
+  # Keep the main and release branches' caches populated as new .cid
+  # files land.
+  push:
+    branches:
+      - main
+      - 'release*'
+    paths:
+      - '**/*.cid'
+  # Nightly safety net: if a populate run was skipped because some CIDs
+  # were unreachable on one day, the next night retries.
+  schedule:
+    - cron: '17 5 * * *'
+  workflow_dispatch:
+
+concurrency:
+  # Only one populate job per hashFiles digest makes sense, but we key
+  # the concurrency group on the branch ref since hashFiles requires a
+  # checkout. Mid-flight runs cancel; the final one wins.
+  group: 'externaldata-populate@${{ github.head_ref || github.ref }}'
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  actions: write # needed to manage cache entries
+
+jobs:
+  populate:
+    name: Populate shared ExternalData cache
+    runs-on: ubuntu-22.04
+    timeout-minutes: 60
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v5
+        with:
+          fetch-depth: 1
+
+      - name: Restore ExternalData object store
+        id: restore-externaldata
+        uses: actions/cache/restore@v5
+        with:
+          path: ${{ runner.temp }}/ExternalData
+          key: externaldata-v1-${{ hashFiles('**/*.cid') }}
+
+      - name: Skip if cache already complete
+        if: steps.restore-externaldata.outputs.cache-hit == 'true'
+        run: echo "Cache already present for this hashFiles digest - nothing to do."
+
+      - name: Prefetch every CID
+        if: steps.restore-externaldata.outputs.cache-hit != 'true'
+        shell: bash
+        env:
+          EXTERNALDATA_STORE: ${{ runner.temp }}/ExternalData
+        run: |
+          python3 Utilities/Maintenance/PrefetchCIDContentLinks.py \
+            --repo-root . \
+            --store "$EXTERNALDATA_STORE"
+
+      # Integrity gate: refuse to save unless every unique CID in the
+      # source tree has an object on disk. A partial save under the
+      # shared key would propagate holes to every consumer workflow.
+      - name: Verify completeness
+        if: steps.restore-externaldata.outputs.cache-hit != 'true'
+        shell: bash
+        env:
+          EXTERNALDATA_STORE: ${{ runner.temp }}/ExternalData
+        run: |
+          expected=$(find . -name '*.cid' -not -path './.git/*' -print0 \
+            | xargs -0 -I{} sh -c 'cat "$1"; echo' _ {} \
+            | sed '/^$/d' | sort -u | wc -l | tr -d ' ')
+          present=$(find "$EXTERNALDATA_STORE/cid" -type f 2>/dev/null | wc -l | tr -d ' ')
+          echo "expected unique CIDs: $expected"
+          echo "present on disk     : $present"
+          if [ "$present" -lt "$expected" ]; then
+            echo "::error::ExternalData prefetch produced $present/$expected objects; refusing to save a partial cache."
+            exit 1
+          fi
+
+      - name: Save ExternalData object store
+        if: steps.restore-externaldata.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v5
+        with:
+          path: ${{ runner.temp }}/ExternalData
+          key: externaldata-v1-${{ hashFiles('**/*.cid') }}
diff --git a/Utilities/Maintenance/PrefetchCIDContentLinks.py b/Utilities/Maintenance/PrefetchCIDContentLinks.py
new file mode 100755
index 000000000000..3602b4a8ce40
--- /dev/null
+++ b/Utilities/Maintenance/PrefetchCIDContentLinks.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+"""Prefetch every CID referenced by a ``.cid`` content link into the
+``ExternalData_OBJECT_STORES`` directory so the GitHub Actions cache entry
+saved after a CI run is complete.
+
+Without this step, only the modules selected for compilation (and whose
+tests ran) trigger ExternalData fetches. Everything else stays out of the
+store, and the next cold boot has to re-download it from the gateways.
+This script walks the whole source tree, reads every ``.cid`` file, and
+fills in any missing object at ``<store>/cid/<CID>`` via the same gateway
+list used by ``CMake/ITKExternalData.cmake``.
+
+Integrity: the IPFS gateways serve content-addressed bytes, so a correctly
+configured gateway cannot return the wrong bytes for a given CID — the CID
+*is* the content hash. CMake's ExternalData verifies the hash again at
+consumer time, so any bad prefetch is self-healing.
+"""
+from __future__ import annotations
+
+import argparse
+import concurrent.futures as cf
+import os
+import subprocess
+import sys
+import time
+from pathlib import Path
+from urllib.parse import urlsplit
+
+# Same ordered list as CMake/ITKExternalData.cmake. The GitHub Pages mirror
+# comes first because it is the lowest-latency bulk-hosted option and does
+# not rate-limit CI.
+GATEWAYS = (
+    "https://insightsoftwareconsortium.github.io/ITKTestingData/cid/{cid}",
+    "https://ipfs.io/ipfs/{cid}",
+    "https://gateway.pinata.cloud/ipfs/{cid}",
+    "https://cloudflare-ipfs.com/ipfs/{cid}",
+    "https://dweb.link/ipfs/{cid}",
+)
+
+PER_URL_TIMEOUT_SECONDS = 60
+MAX_WORKERS_DEFAULT = 16
+
+
+def collect_cids(repo_root: Path) -> dict[str, list[Path]]:
+    """Map each CID found in the repo to the ``.cid`` files that reference it."""
+    cids: dict[str, list[Path]] = {}
+    for p in repo_root.rglob("*.cid"):
+        if ".git" in p.parts:
+            continue
+        try:
+            cid = p.read_text().strip()
+        except OSError as e:
+            print(f"WARN: cannot read {p}: {e}", file=sys.stderr)
+            continue
+        if not cid or any(c.isspace() for c in cid):
+            print(f"WARN: malformed .cid file {p}", file=sys.stderr)
+            continue
+        cids.setdefault(cid, []).append(p)
+    return cids
+
+
+def fetch_one(cid: str, dest: Path) -> tuple[str, str, int]:
+    """Download ``cid`` to ``dest``, trying gateways in order.
+
+    Returns ``(cid, status, bytes_written)`` where status is one of
+    ``ok``, ``skip`` (already present), or ``fail: <reason>``.
+    """
+    if dest.exists() and dest.stat().st_size > 0:
+        return cid, "skip", dest.stat().st_size
+
+    tmp = dest.with_suffix(dest.suffix + ".part")
+    last_err: str | None = None
+    for tpl in GATEWAYS:
+        url = tpl.format(cid=cid)
+        parts = urlsplit(url)
+        if parts.scheme != "https":
+            last_err = f"refusing non-https URL: {url}"
+            continue
+        # Delegate HTTPS to curl: avoids pulling in third-party HTTP libs
+        # and keeps TLS cert verification entirely in the system stack.
+        cmd = [
+            "curl",
+            "--silent",
+            "--show-error",
+            "--location",
+            "--fail",
+            # Pin the initial request and any redirect target to HTTPS
+            # so --location cannot downgrade the transport.
+            "--proto",
+            "=https",
+            "--proto-redir",
+            "=https",
+            "--max-time",
+            str(PER_URL_TIMEOUT_SECONDS),
+            "--user-agent",
+            "itk-ci-prefetch/1",
+            "--output",
+            str(tmp),
+            "--",
+            url,
+        ]
+        try:
+            r = subprocess.run(
+                cmd,
+                capture_output=True,
+                check=False,
+                timeout=PER_URL_TIMEOUT_SECONDS + 10,
+            )
+        except subprocess.TimeoutExpired:
+            last_err = f"curl timeout on {url}"
+            continue
+        if r.returncode != 0:
+            last_err = f"curl rc={r.returncode} on {url}: {r.stderr.decode(errors='replace').strip()}"
+            continue
+        try:
+            nbytes = tmp.stat().st_size
+        except OSError as e:
+            last_err = f"{url}: {e}"
+            continue
+        if nbytes == 0:
+            last_err = f"empty body from {url}"
+            continue
+        tmp.replace(dest)
+        return cid, "ok", nbytes
+    if tmp.exists():
+        try:
+            tmp.unlink()
+        except OSError:
+            pass
+    return cid, f"fail: {last_err}", 0
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument(
+        "--repo-root",
+        type=Path,
+        default=Path.cwd(),
+        help="Root of the source tree to scan for .cid files.",
+    )
+    ap.add_argument(
+        "--store",
+        type=Path,
+        default=Path(os.environ.get("ExternalData_OBJECT_STORES", "")),
+        help="ExternalData_OBJECT_STORES directory. Defaults to the env var.",
+    )
+    ap.add_argument(
+        "--jobs",
+        type=int,
+        default=MAX_WORKERS_DEFAULT,
+        help=f"Parallel download workers (default {MAX_WORKERS_DEFAULT}).",
+    )
+    ap.add_argument(
+        "--fail-on-missing",
+        action="store_true",
+        help="Exit non-zero if any CID could not be fetched.",
+    )
+    args = ap.parse_args()
+
+    if not args.store or str(args.store) == ".":
+        print(
+            "ERROR: --store or ExternalData_OBJECT_STORES must be set", file=sys.stderr
+        )
+        return 2
+    cid_dir = args.store / "cid"
+    cid_dir.mkdir(parents=True, exist_ok=True)
+
+    cids = collect_cids(args.repo_root)
+    print(
+        f"==> {len(cids)} unique CIDs referenced by {sum(len(v) for v in cids.values())} .cid files"
+    )
+
+    start = time.monotonic()
+    ok = skip = fail = 0
+    bytes_ok = 0
+    failed: list[str] = []
+    with cf.ThreadPoolExecutor(max_workers=max(1, args.jobs)) as pool:
+        futures = {pool.submit(fetch_one, cid, cid_dir / cid): cid for cid in cids}
+        for i, fut in enumerate(cf.as_completed(futures), 1):
+            cid, status, nbytes = fut.result()
+            if status == "ok":
+                ok += 1
+                bytes_ok += nbytes
+            elif status == "skip":
+                skip += 1
+            else:
+                fail += 1
+                failed.append(f"{cid}: {status}")
+            if i % 500 == 0 or i == len(futures):
+                elapsed = time.monotonic() - start
+                print(
+                    f"  [{i}/{len(futures)}] ok={ok} skip={skip} fail={fail} "
+                    f"downloaded={bytes_ok / 1e6:.1f} MB in {elapsed:.0f}s",
+                    flush=True,
+                )
+
+    elapsed = time.monotonic() - start
+    print(
+        f"==> prefetch done in {elapsed:.0f}s: ok={ok} skip={skip} fail={fail} "
+        f"downloaded={bytes_ok / 1e6:.1f} MB"
+    )
+    if failed:
+        print(f"==> {len(failed)} CID(s) could not be fetched:", file=sys.stderr)
+        for line in failed[:50]:
+            print(f"  {line}", file=sys.stderr)
+        if len(failed) > 50:
+            print(f"  ... and {len(failed) - 50} more", file=sys.stderr)
+    return 1 if (fail and args.fail_on_missing) else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
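+
+# Example invocations (paths are illustrative):
+#
+#   python3 Utilities/Maintenance/PrefetchCIDContentLinks.py \
+#       --repo-root . --store /tmp/ExternalData --jobs 8
+#
+#   ExternalData_OBJECT_STORES=$HOME/ExternalData \
+#       python3 Utilities/Maintenance/PrefetchCIDContentLinks.py --fail-on-missing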