blockstor/.github/workflows/pull-request.yml at main · cozystack/blockstor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
name: Pull Request

# Single consolidated CI pipeline for PRs. Fires on every PR open/push/reopen
# and runs lint, unit, contract, and integration tests in parallel; the e2e
# jobs start once lint and unit tests pass. On failure the e2e jobs open an
# SSH breakpoint (whenever BREAKPOINT_ENDPOINT is set — it is not label-gated)
# so maintainers can attach to a wedged sandbox and resume the workflow with
# `breakpoint resume`.
#
# Runner topology:
#   - detect-changes, lint, unit-test, contract, integration → GitHub-hosted
#     `ubuntu-latest` (ephemeral).
#   - e2e, e2e-piraeus → `oracle-vm-24cpu-96gb-x86-64` (needs /dev/kvm for
#     the Talos+QEMU stand), or `self-hosted` when the PR carries the
#     `debug` label. Adjust the `runs-on:` expression to match the ephemeral
#     runner pool once ARC / namespace.so / equivalent is wired up.
#
# Repository variables (Settings → Secrets and variables → Actions → Variables):
#   - BREAKPOINT_ENDPOINT (required for the breakpoint step; if unset, the
#     step is skipped — forks cannot reach the rendezvous server anyway
#     because variables are not exposed in fork-PR workflows).
#
# Labels that change behaviour:
#   - debug   → pins the e2e job to a self-hosted runner so a maintainer can
#               attach via the host (kubectl/docker on the runner host
#               directly, no SSH dance through the rendezvous server). The
#               breakpoint step itself fires on every e2e failure regardless
#               of label — it just needs BREAKPOINT_ENDPOINT to be set.

on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened

# A newer push to the same PR supersedes any run still in flight, so rapid
# force-push iterations do not spend runner minutes on stale commits.
concurrency:
  cancel-in-progress: true
  group: pr-${{ github.workflow }}-${{ github.event.pull_request.number }}

# Start from an empty token scope; each job declares what it needs.
permissions: {}

jobs:
  detect-changes:
    name: Detect changes
    # Gate job: publishes `code` = 'true' when the PR touches anything other
    # than documentation/assets, so the downstream jobs can skip docs-only PRs.
    runs-on: ubuntu-latest
    # The workflow-level `permissions: {}` strips every scope from the token.
    # paths-filter lists the PR's changed files through the GitHub API, and
    # its README documents `pull-requests: read` as required for that.
    permissions:
      pull-requests: read
    outputs:
      code: ${{ steps.filter.outputs.code }}
    steps:
      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
        id: filter
        with:
          # The filter below is made of negated globs only. Under the default
          # quantifier (`some`) a file matches as soon as ANY pattern accepts
          # it — e.g. docs/x.md is accepted by '!**/*.svg' — so `code` would
          # be 'true' for every PR. `every` requires a file to pass ALL
          # exclusions before it counts as a code change.
          predicate-quantifier: 'every'
          filters: |
            code:
              - '!docs/**'
              - '!**/*.md'
              - '!**/*.svg'
              - '!img/**'
              - '!.github/ISSUE_TEMPLATE/**'
              - '!LICENSE'

  lint:
    name: Lint
    # Static-analysis gate: checks the linter configuration, then lints the
    # tree. Skipped when detect-changes reports no code changes.
    needs: [detect-changes]
    if: ${{ needs.detect-changes.outputs.code == 'true' }}
    runs-on: ubuntu-latest
    timeout-minutes: 15
    permissions:
      contents: read
    steps:
      - name: Clone the code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          # Do not leave the token behind in the checkout's git config.
          persist-credentials: false

      - name: Setup Go
        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          # The toolchain version is read from go.mod.
          go-version-file: go.mod

      - name: Check linter configuration
        run: make lint-config

      - name: Run linter
        run: make lint

  unit-test:
    name: Unit tests
    # Runs the `make test` target on a GitHub-hosted runner. Skipped when
    # detect-changes reports no code changes.
    needs: [detect-changes]
    if: ${{ needs.detect-changes.outputs.code == 'true' }}
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
    steps:
      - name: Clone the code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          # Do not leave the token behind in the checkout's git config.
          persist-credentials: false

      - name: Setup Go
        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          # The toolchain version is read from go.mod.
          go-version-file: go.mod

      - name: Running tests
        # `go mod tidy` runs first, then the test target.
        run: |
          go mod tidy
          make test

  contract:
    name: Contract tests
    # Builds the contract image from tests/contract/ and runs the Go suite
    # tagged `contract`. Skipped when detect-changes reports no code changes.
    needs: [detect-changes]
    if: ${{ needs.detect-changes.outputs.code == 'true' }}
    runs-on: ubuntu-latest
    timeout-minutes: 10
    permissions:
      contents: read
    steps:
      - name: Clone
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          # Do not leave the token behind in the checkout's git config.
          persist-credentials: false

      - name: Setup Go
        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          # The toolchain version is read from go.mod.
          go-version-file: go.mod

      - name: Build contract image
        run: |
          docker build -t blockstor-drbd-contract:local \
            -f tests/contract/Dockerfile tests/contract/

      - name: Contract tests
        # The output is piped through `tee` to keep a log for the artifact
        # upload. `set -o pipefail` makes the pipeline report go test's exit
        # status rather than tee's; otherwise a failing suite would go green.
        run: |
          set -o pipefail
          go test -tags=contract -count=1 -timeout=5m -v ./tests/contract/... \
            | tee contract.log

      - name: Upload logs on failure
        # Only publish the log when an earlier step in this job failed.
        if: failure()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: contract-logs
          path: contract.log

  integration:
    name: Integration tests
    # Runs the Go suite tagged `integration` against envtest binaries, with
    # the `linstor` CLI installed on the runner. Skipped when detect-changes
    # reports no code changes.
    needs: [detect-changes]
    if: ${{ needs.detect-changes.outputs.code == 'true' }}
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
    steps:
      - name: Clone
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          # Do not leave the token behind in the checkout's git config.
          persist-credentials: false

      - name: Setup Go
        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          # The toolchain version is read from go.mod.
          go-version-file: go.mod

      - name: Install linstor-client (python-linstor)
        # Mirrors the install step in .github/workflows/integration.yml (the
        # rationale for the install path lives there) so PR runs match push
        # runs. The version is pinned to v1.27.1 because the integration
        # harness asserts on `linstor_client.VERSION`
        # (tests/integration/group_h_test.go).
        run: |
          python3 -m pip install --break-system-packages --upgrade \
            python-linstor==1.27.1 argcomplete
          python3 -m pip install --break-system-packages --no-deps \
            https://github.com/LINBIT/linstor-client/archive/refs/tags/v1.27.1.tar.gz
          linstor --version

      - name: Install envtest binaries
        # envtest (controller-runtime) needs kube-apiserver + etcd binaries.
        # setup-envtest is installed from the release branch that matches
        # our controller-runtime (v0.23.x); `@latest` would resolve to the
        # v0.24.x submodule, which requires Go >= 1.26. The resolved asset
        # path is exported to later steps through GITHUB_ENV.
        run: |
          go install sigs.k8s.io/controller-runtime/tools/setup-envtest@release-0.23
          echo "KUBEBUILDER_ASSETS=$(setup-envtest use --print path 1.34.x)" >> "$GITHUB_ENV"

      - name: go mod tidy
        run: go mod tidy

      - name: Build (ensures harness compiles)
        run: go build ./...

      - name: Integration tests
        # The output is piped through `tee` to keep a log for the artifact
        # upload. `set -o pipefail` makes the pipeline report go test's exit
        # status rather than tee's; otherwise a failing suite would go green.
        run: |
          set -o pipefail
          go test -tags=integration -count=1 -timeout=15m ./tests/integration/... | tee integration.log

      - name: Upload logs on failure
        # Only publish logs when an earlier step in this job failed.
        if: failure()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: integration-logs
          path: |
            integration.log
            /tmp/envtest-*.log

  e2e:
    name: E2E (lane ${{ matrix.lane }})
    # Real-DRBD QEMU/Talos e2e — the same suite maintainers run on the
    # dev stand (stand/up.sh + blockstor + pools + tests/e2e/*.sh), driven
    # by stand/ci-e2e.sh. Runs on the CNCF-provided Oracle pool which
    # exposes /dev/kvm (nested virt), exactly like cozystack/cozystack's
    # own e2e job. A labelled `debug` PR lands on a long-lived
    # `self-hosted` runner so the breakpoint step below has somewhere
    # stable to attach SSH. Swap to oracle-vm-32cpu-128gb-x86-64 if a
    # future, heavier scenario set needs more RAM/CPU.
    #
    # Parallel lanes: a 6-lane matrix (6 runners in parallel). To widen
    # coverage, add entries to `matrix.lane` (e.g. 1 2 3 ... 10) AND bump
    # `env.LANES` to match — ci-e2e.sh round-robins the SCENARIOS list across LANES,
    # so each runner automatically picks up its 1/N share. Nothing else
    # changes. (A future optimisation is a single build job that pushes
    # images to a registry the lanes pull from, instead of each lane
    # rebuilding — cozystack does this; kept simple here on purpose.)
    runs-on: ${{ contains(github.event.pull_request.labels.*.name, 'debug') && 'self-hosted' || 'oracle-vm-24cpu-96gb-x86-64' }}
    # Starts only after lint and unit-test have succeeded, and only when
    # detect-changes reported code changes.
    needs: [detect-changes, lint, unit-test]
    if: needs.detect-changes.outputs.code == 'true'
    # Whole tests/e2e suite (~59) sharded over LANES; at 6 lanes that's
    # ~10 scenarios/runner plus build+provision. Lane count is a balance
    # between two opposite failure modes:
    #   - TOO FEW lanes (4) packs the heavy DRBD-destructive scenarios
    #     (~15/cluster) onto one ephemeral cluster, which accumulates
    #     enough residue to wedge its last scenarios (replicas stuck
    #     non-UpToDate);
    #   - TOO MANY lanes (8) over-subscribes the oracle runner POOL —
    #     booting 8 four-node Talos+QEMU clusters at once starves the
    #     late-scheduled runners for CPU/memory, so lanes 7/8 repeatedly
    #     died on infra ("N nodes never Ready", and "runner lost
    #     communication with the server"), not on product faults.
    # 6 lanes scatters the heavy scenarios across distinct fresh clusters
    # (round-robin i%6) while keeping the simultaneous-cluster load the
    # pool can actually sustain. Capped near GitHub's 360-min max.
    timeout-minutes: 350
    strategy:
      # A red lane must not cancel the remaining lanes.
      fail-fast: false
      matrix:
        lane: [1, 2, 3, 4, 5, 6]
    env:
      # Number of parallel lanes = number of runners (matrix.lane above).
      # This is a capacity/cost knob ONLY, not a coverage knob: no
      # scenario list is hardcoded here — ci-e2e.sh discovers the whole
      # suite via `make e2e-list` (= ls tests/e2e/*.sh) and shards it
      # across LANES, so a newly-added test file runs automatically with
      # zero workflow edits. Add runners by bumping matrix.lane + LANES
      # together (e.g. 6 lanes ≈ the dev-stand-proven ~6 scenarios/lane).
      LANES: 6
      # Scenarios that require the upstream piraeus stack (the LinstorCluster
      # CRD + linstor-csi) are excluded from the blockstor-only matrix
      # clusters — there piraeus is absent and, now that their silent-skip
      # guards are gone, they would hard-fail. They run in the dedicated
      # `e2e-piraeus` job below, which installs piraeus in EXTERNAL mode
      # against blockstor's apiserver (no upstream Java linstor-controller).
      # ci-e2e.sh drops these from the discovered suite.
      # node-replace-hardware regressed post-merge of #48 (worker-3 satellite
      # does not re-register after re-labeling on the replaced node); root
      # cause is in the satellite/controller path, not in CI infrastructure,
      # and reverting #48 isn't viable. Track separately; restore once fixed.
      # observability-capacity-correlation requires piraeus's LinstorCluster CRD,
      # which is only installed in the e2e-piraeus job below — keep it out of the
      # 6-lane matrix or it hard-fails at the prerequisite check.
      E2E_EXCLUDE: "rwx-ganesha observability-three-way observability-capacity-correlation csi-pvc-replicated-rwo csi-pvc-local node-replace-hardware"
    permissions:
      contents: read
      # The breakpoint step below publishes a Check Run ("Breakpoint Open").
      checks: write
    steps:
      - name: Clone the code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Setup Go
        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: go.mod

      # Fail fast with an explicit error annotation when the runner has no
      # /dev/kvm, instead of failing later inside the e2e run.
      - name: Verify KVM is available
        run: |
          test -e /dev/kvm || { echo "::error::/dev/kvm missing — this runner lacks (nested) virtualization"; exit 1; }
          ls -l /dev/kvm

      - name: Run QEMU e2e (lane ${{ matrix.lane }} of ${{ env.LANES }})
        run: make ci-e2e LANE=${{ matrix.lane }} LANES=${{ env.LANES }}

      # Upload whatever results/logs exist for this lane, whether the run
      # passed or failed; a lane that produced no files is not an error.
      - name: Upload e2e logs
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: e2e-logs-lane-${{ matrix.lane }}
          path: |
            /tmp/e2e-ci-lane${{ matrix.lane }}.results
            /tmp/e2e-ci-lane${{ matrix.lane }}-*.log
          if-no-files-found: ignore

      # Open an SSH breakpoint to the failing e2e runner so maintainers
      # can attach, inspect the Talos+QEMU stand and blockstor state, and
      # resume with `breakpoint resume`. Fires on every e2e failure (no
      # label opt-in)
      # — the rationale is that an e2e failure already burned the runner
      # minutes and a maintainer almost always wants to inspect the wedged
      # cluster before tear-down. Forks can't reach the rendezvous server
      # because repository variables are not exposed to fork-PR workflows;
      # the step is silently skipped in that case (BREAKPOINT_ENDPOINT
      # comes through empty).
      #
      # Uses cozystack/breakpoint-action (fork of namespacelabs/breakpoint-action)
      # pinned by SHA. The fork adds pause-idle mode (initial grace period
      # for the first SSH connection, idle-aware exit afterwards), endpoint
      # output, and a dedicated Check Run "Breakpoint Open" that carries
      # the SSH endpoint in output.summary while the breakpoint is paused.
      - name: Breakpoint on E2E failure
        if: |
          failure() &&
          vars.BREAKPOINT_ENDPOINT != ''
        # cozystack/breakpoint-action v2-cozy.1
        # mode: pause-idle defaults: grace-period=20m, idle-timeout=10m
        uses: cozystack/breakpoint-action@a6f3a6f87be398ad63b6577351e3398e53f578e4
        with:
          mode: pause-idle
          endpoint: ${{ vars.BREAKPOINT_ENDPOINT }}
          authorized-users: androndo, Arsolitt, IvanHunters, kvaps, lexfrei, lllamnyp, mattia-eleuteri, matthieu-robin, myasnikovdaniil, sircthulhu, tym83
          check-run-name: "Breakpoint Open"
          github-token: ${{ github.token }}
          check-run-summary-template: |
            ## 🔴 SSH breakpoint open — paused for debug

            ```
            {endpoint}
            ```

            Inspect the wedged stand (Talos+QEMU — there is no Kind here):
            ```
            cd ~/_work/blockstor/blockstor
            cat /tmp/e2e-ci-lane*.results          # the lane + which scenario FAILed
            N=$(ls .work | grep -o 'ci-lane[0-9]*' | head -1)
            export KUBECONFIG=$PWD/.work/$N/kubeconfig
            kubectl get pods -A; kubectl get rd,resources -A
            cat /tmp/e2e-$N-<scenario>.log          # full log of the failing scenario
            ```

            Resume from inside: `breakpoint resume`. Otherwise the breakpoint
            exits 10 minutes after the last SSH session disconnects.

  e2e-piraeus:
    name: E2E (piraeus interop)
    # The piraeus-dependent scenarios (observability-three-way,
    # observability-capacity-correlation, rwx-ganesha, plus csi-pvc-local
    # and csi-pvc-replicated-rwo — the full list is the SCENARIOS argument
    # of the run step below) exercise the
    # LinstorCluster-CRD + linstor-csi surface that the upstream piraeus-
    # operator owns. They run here on a cluster that has piraeus installed
    # in EXTERNAL mode against blockstor's apiserver (INSTALL_PIRAEUS=1 →
    # ci-e2e.sh runs `make piraeus`, which now creates LinstorCluster with
    # spec.externalController.url + spec.apiTLS pointing at blockstor:3371).
    # No upstream Java linstor-controller runs in this job — the LINSTOR
    # state of record is blockstor's apiserver. rwx-ganesha is currently
    # SKIPped (coexistence-only; needs an external-mode port).
    #
    # `needs: e2e` runs this AFTER the 6-lane matrix releases its runners, so
    # the oracle pool is never asked for a 7th simultaneous cluster (the
    # over-subscription that killed lanes 7/8 at 8 lanes). `!cancelled()` lets
    # it run for visibility even if a matrix lane went red.
    runs-on: ${{ contains(github.event.pull_request.labels.*.name, 'debug') && 'self-hosted' || 'oracle-vm-24cpu-96gb-x86-64' }}
    needs: [detect-changes, lint, unit-test, e2e]
    # `!cancelled()` replaces the implicit "all needs succeeded" check, so
    # the lint and unit-test results are tested explicitly here; the e2e
    # result is deliberately not part of the condition.
    if: ${{ !cancelled() && needs.detect-changes.outputs.code == 'true' && needs.lint.result == 'success' && needs.unit-test.result == 'success' }}
    timeout-minutes: 150
    env:
      # Read by ci-e2e.sh: deploy piraeus after blockstor.
      INSTALL_PIRAEUS: "1"
    permissions:
      contents: read
      # The breakpoint step below publishes a Check Run
      # ("Breakpoint Open (piraeus)").
      checks: write
    steps:
      - name: Clone the code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Setup Go
        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: go.mod

      # Fail fast with an explicit error annotation when the runner has no
      # /dev/kvm, instead of failing later inside the e2e run.
      - name: Verify KVM is available
        run: |
          test -e /dev/kvm || { echo "::error::/dev/kvm missing — this runner lacks (nested) virtualization"; exit 1; }
          ls -l /dev/kvm

      - name: Run piraeus-interop e2e
        # LANE=1 LANES=1 → the single lane runs every listed scenario;
        # ci-e2e.sh honours INSTALL_PIRAEUS=1 (job env) to deploy piraeus
        # after blockstor so these scenarios have the stack they drive.
        # csi-pvc-local runs BEFORE csi-pvc-replicated-rwo by design:
        # the local scenario is stateless (storage-only PVC, no DRBD,
        # no cross-node migration), so any cleanup races it surfaces
        # cannot contaminate the next scenario. The replicated one
        # creates Pods on two workers in sequence and its cleanup,
        # even after the synchronous Pod-delete fix, races
        # pv-controller's ControllerPublishVolume retries — putting
        # it last keeps that storm out of the local scenario's window.
        # observability-capacity-correlation re-enabled: issue #45 closed
        # by gating the real linstor-csi single-node create path on
        # `POST /v1/resource-definitions/{rd}/resources/{node}` (the
        # endpoint linstor-csi's `manual` scheduler hits when the SC
        # sets `nodeList` + `placementCount=1`). The level-1
        # PVC-stays-Pending assertion now passes on the dev stand.
        run: make ci-e2e LANE=1 LANES=1 SCENARIOS="rwx-ganesha observability-three-way observability-capacity-correlation csi-pvc-local csi-pvc-replicated-rwo"

      # Upload whatever results/logs exist, whether the run passed or
      # failed; a run that produced no files is not an error.
      - name: Upload e2e logs
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: e2e-logs-piraeus
          path: |
            /tmp/e2e-ci-lane1.results
            /tmp/e2e-ci-lane1-*.log
          if-no-files-found: ignore

      # Same SHA-pinned breakpoint action and authorized-users list as the
      # e2e matrix job, with its own Check Run name. Runs only when an
      # earlier step failed and BREAKPOINT_ENDPOINT is non-empty.
      - name: Breakpoint on piraeus-interop failure
        if: |
          failure() &&
          vars.BREAKPOINT_ENDPOINT != ''
        uses: cozystack/breakpoint-action@a6f3a6f87be398ad63b6577351e3398e53f578e4
        with:
          mode: pause-idle
          endpoint: ${{ vars.BREAKPOINT_ENDPOINT }}
          authorized-users: androndo, Arsolitt, IvanHunters, kvaps, lexfrei, lllamnyp, mattia-eleuteri, matthieu-robin, myasnikovdaniil, sircthulhu, tym83
          check-run-name: "Breakpoint Open (piraeus)"
          github-token: ${{ github.token }}
          check-run-summary-template: |
            ## 🔴 SSH breakpoint open (piraeus interop) — paused for debug

            ```
            {endpoint}
            ```

            ```
            cd ~/_work/blockstor/blockstor
            cat /tmp/e2e-ci-lane1.results
            export KUBECONFIG=$PWD/.work/ci-lane1/kubeconfig
            kubectl get pods -A; kubectl get rd,resources -A
            ```

            Resume from inside: `breakpoint resume`.