Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 143 additions & 0 deletions .github/workflows/step-duration-regression.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
name: Step Duration Regression

# Fails CI if the action's setup or post step exceeds STEP_MAX_SECONDS.
# Catches regressions of the dangling-Http2Session class of bug that
# silently added ~30s per step.

on:
workflow_dispatch:
pull_request:
push:
branches: [main]
schedule:
- cron: "17 6 * * *"

env:
STEP_MAX_SECONDS: "5"
SETUP_STEP_NAME: "Setup Docker Builder under test"

jobs:
exercise-action:
name: Run build with setup-docker-builder
runs-on: blacksmith
steps:
- name: Checkout
uses: actions/checkout@v6

- name: ${{ env.SETUP_STEP_NAME }}
uses: ./
with:
buildx-version: v0.23.0

- name: Trivial build
run: |
cat > Dockerfile.test <<'EOF'
FROM alpine:3.20
RUN echo "step-duration-regression"
EOF
docker buildx build -f Dockerfile.test --load -t setup-docker-builder-regression:local .

validate-step-durations:
name: Validate step durations
needs: exercise-action
if: always()
runs-on: blacksmith
permissions:
actions: read
contents: read
steps:
- name: Assert setup and post steps stay under threshold
env:
GH_TOKEN: ${{ github.token }}
RUN_ID: ${{ github.run_id }}
REPO: ${{ github.repository }}
MAX_SECONDS: ${{ env.STEP_MAX_SECONDS }}
SETUP_STEP_NAME: ${{ env.SETUP_STEP_NAME }}
TARGET_JOB_NAME: "Run build with setup-docker-builder"
run: |
set -euo pipefail

echo "Fetching steps for run ${RUN_ID} in ${REPO}..."
jobs_json=$(gh api \
-H "Accept: application/vnd.github+json" \
"repos/${REPO}/actions/runs/${RUN_ID}/jobs?per_page=100")

setup_name="${SETUP_STEP_NAME}"
post_name="Post ${SETUP_STEP_NAME}"

# Returns step duration in seconds, or "MISSING".
step_duration_seconds() {
local job="$1"
local step="$2"

local line
line=$(echo "$jobs_json" | jq -r \
--arg job "$job" --arg step "$step" '
.jobs[] | select(.name == $job) | .steps[] | select(.name == $step) |
"\(.started_at) \(.completed_at)"
')

local started completed
started=$(echo "$line" | awk '{print $1}')
completed=$(echo "$line" | awk '{print $2}')

if [[ -z "${started:-}" || -z "${completed:-}" \
|| "${started}" == "null" || "${completed}" == "null" ]]; then
echo "MISSING"
return
fi

local s c
s=$(date -u -d "$started" +%s)
c=$(date -u -d "$completed" +%s)
echo $(( c - s ))
}

echo ""
echo "=== ${TARGET_JOB_NAME} steps ==="
echo "$jobs_json" | jq -r --arg job "$TARGET_JOB_NAME" '
.jobs[] | select(.name == $job) | .steps[] |
" step=\"\(.name)\" started=\(.started_at) completed=\(.completed_at) conclusion=\(.conclusion)"
'

setup_duration=$(step_duration_seconds "$TARGET_JOB_NAME" "$setup_name")
post_duration=$(step_duration_seconds "$TARGET_JOB_NAME" "$post_name")

echo ""
echo "Setup step (\"${setup_name}\"): ${setup_duration}s"
echo "Post step (\"${post_name}\"): ${post_duration}s"
echo "Threshold: ${MAX_SECONDS}s"

{
echo "## Step Durations"
echo ""
echo "| Step | Duration | Threshold |"
echo "|---|---:|---:|"
echo "| Setup (\`${setup_name}\`) | **${setup_duration}s** | ${MAX_SECONDS}s |"
echo "| Post (\`${post_name}\`) | **${post_duration}s** | ${MAX_SECONDS}s |"
} >> "$GITHUB_STEP_SUMMARY"

fail=0

assert_under_threshold() {
local label="$1" value="$2"
if [[ "$value" == "MISSING" ]]; then
echo "::error::Could not find ${label} step in job \"${TARGET_JOB_NAME}\". Did the previous job fail before the step ran?"
fail=1
return
fi
if (( value > MAX_SECONDS )); then
echo "::error::${label} step took ${value}s (> ${MAX_SECONDS}s threshold). Check for dangling open handles (Http2Session, axios keep-alive, fs watcher) at the end of the action body."
fail=1
fi
}

assert_under_threshold "setup" "$setup_duration"
assert_under_threshold "post" "$post_duration"

if (( fail )); then
exit 1
fi

echo ""
echo "Both steps are within threshold."
2 changes: 1 addition & 1 deletion dist/index.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/index.js.map

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,11 @@ void actionsToolkit.run(
}

stateHelper.setTmpDir(Context.tmpDir());

// Close the gRPC client and force-exit so a leaked open handle (HTTP/2
// session, axios keep-alive, etc.) cannot delay the step by ~30s.
reporter.closeBlacksmithAgentClient();
setImmediate(() => process.exit(process.exitCode ?? 0));
},
// post action - cleanup
async () => {
Expand Down Expand Up @@ -973,6 +978,10 @@ void actionsToolkit.run(
"Expose ID not found in state, skipping sticky disk commit",
);
}

// See main step: close gRPC client + force-exit to avoid ~30s hang.
reporter.closeBlacksmithAgentClient();
});
setImmediate(() => process.exit(process.exitCode ?? 0));
},
);
58 changes: 53 additions & 5 deletions src/reporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ import * as core from "@actions/core";
import axios from "axios";
import axiosRetry from "axios-retry";
import { create } from "@bufbuild/protobuf";
import { createClient } from "@connectrpc/connect";
import { createGrpcTransport } from "@connectrpc/connect-node";
import { Client, createClient } from "@connectrpc/connect";
import {
createGrpcTransport,
Http2SessionManager,
} from "@connectrpc/connect-node";
import {
MetricSchema,
Metric_MetricType,
Expand Down Expand Up @@ -44,15 +47,60 @@ const createBlacksmithAPIClient = () => {
return client;
};

export function createBlacksmithAgentClient() {
// Cached so we open a single HTTP/2 session per process. The session must
// be torn down via closeBlacksmithAgentClient() before the action exits,
// otherwise it keeps the Node event loop alive for ~30s.
let cachedAgentSessionManager: Http2SessionManager | undefined;
let cachedAgentClient: Client<typeof StickyDiskService> | undefined;
let cachedAgentBaseUrl: string | undefined;

export function createBlacksmithAgentClient(): Client<
typeof StickyDiskService
> {
const baseUrl = `http://192.168.127.1:${process.env.BLACKSMITH_STICKY_DISK_GRPC_PORT || "5557"}`;

if (cachedAgentClient && cachedAgentBaseUrl === baseUrl) {
return cachedAgentClient;
}

if (cachedAgentSessionManager) {
try {
cachedAgentSessionManager.abort();
} catch {
// best-effort
}
}

core.info(
`Creating Blacksmith agent client with port: ${process.env.BLACKSMITH_STICKY_DISK_GRPC_PORT || "5557"}`,
);

cachedAgentSessionManager = new Http2SessionManager(baseUrl);
const transport = createGrpcTransport({
baseUrl: `http://192.168.127.1:${process.env.BLACKSMITH_STICKY_DISK_GRPC_PORT || "5557"}`,
baseUrl,
sessionManager: cachedAgentSessionManager,
});
cachedAgentClient = createClient(StickyDiskService, transport);
cachedAgentBaseUrl = baseUrl;

return cachedAgentClient;
}

return createClient(StickyDiskService, transport);
// Must be called before the action exits. See cache comment above.
// Safe to call multiple times.
export function closeBlacksmithAgentClient(): void {
if (cachedAgentSessionManager) {
try {
cachedAgentSessionManager.abort();
} catch (error) {
core.debug(
`Failed to abort Blacksmith agent session: ${(error as Error).message}`,
);
}
cachedAgentSessionManager = undefined;
}
cachedAgentClient = undefined;
cachedAgentBaseUrl = undefined;
}

export async function reportBuildPushActionFailure(
Expand Down
Loading