From 62ac48609c72f88db590eee75812a4581fbfac14 Mon Sep 17 00:00:00 2001 From: 404prefrontalcortexnotfound <106208474+404prefrontalcortexnotfound@users.noreply.github.com> Date: Sun, 21 Jun 2026 19:30:01 +1000 Subject: [PATCH] ci: add attribution-check gate (vendored from skills-and-governance) --- .github/workflows/attribution-check.yml | 77 ++++++++ rules/attribution-denylist.txt | 35 ++++ scripts/agent-attribution-scrub.sh | 245 ++++++++++++++++++++++++ scripts/ci-attribution-check.sh | 172 +++++++++++++++++ 4 files changed, 529 insertions(+) create mode 100644 .github/workflows/attribution-check.yml create mode 100644 rules/attribution-denylist.txt create mode 100755 scripts/agent-attribution-scrub.sh create mode 100755 scripts/ci-attribution-check.sh diff --git a/.github/workflows/attribution-check.yml b/.github/workflows/attribution-check.yml new file mode 100644 index 0000000..ab86c1f --- /dev/null +++ b/.github/workflows/attribution-check.yml @@ -0,0 +1,77 @@ +name: attribution check + +# Fails a PR if any disallowed AI / model / private-identity attribution appears +# in the PR's commit messages OR in tracked file content. CHECK-ONLY: this never +# rewrites a remote commit object — remediation is done locally by the author. +# +# Issue: Decent-Tako/skills-and-governance#72 (child of epic #68). +# Engine: scripts/agent-attribution-scrub.sh + rules/attribution-denylist.txt. +# +# This workflow is BOTH: +# 1. The live gate for THIS repo (pull_request trigger below), and +# 2. A reusable workflow (workflow_call) other repos in the same checkout can +# reference. NOTE: workflow_call shares the CALLER's checkout, and the +# scrubber's --check-repo/--check-staged scan the tree the SCRIPT lives in. +# For a DIFFERENT repo to scan its OWN files, it must VENDOR the script + +# denylist into itself and call this workflow's standalone copy — see +# docs/agents/attribution-ci-rollout.md. 
workflow_call here is provided for +# same-repo reuse / composition; cross-repo adopters vendor. + +on: + pull_request: + workflow_call: + inputs: + skip-repo-scan: + description: "Skip the file-content tree scan; check commit messages only (for repos with un-cleaned legacy file content)." + type: boolean + default: false + required: false + +permissions: + contents: read + +jobs: + attribution: + name: no disallowed attribution + runs-on: ubuntu-latest + steps: + # fetch-depth: 0 — the commit-message loop walks the full PR range; a + # shallow checkout would not contain the base commit to diff against. + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run attribution scrubber (check-only) over commits + files + env: + # On pull_request these are the real base/head SHAs. When they are empty + # (not a pull_request event) the run step falls back to origin/$DEFAULT..HEAD. + PR_BASE: ${{ github.event.pull_request.base.sha }} + PR_HEAD: ${{ github.event.pull_request.head.sha }} + SKIP_REPO_SCAN: ${{ inputs.skip-repo-scan && '1' || '0' }} + # Injected via env (NOT ${{ }} inside run:) so a fork whose default-branch + # name contains shell metacharacters cannot achieve RCE on the runner. + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + shell: bash + run: | + set -euo pipefail + BASE="${PR_BASE:-}" + HEAD="${PR_HEAD:-}" + if [ -z "$BASE" ] || [ -z "$HEAD" ]; then + # Not a pull_request event: diff the default branch against HEAD. + DEFAULT="${DEFAULT_BRANCH:-}" + DEFAULT="${DEFAULT:-main}" + # Fetch the default branch into a remote-tracking ref. An explicit + # refspec guarantees origin/$DEFAULT exists (a bare `git fetch origin + # $DEFAULT` only populates FETCH_HEAD). `--depth=0` is INVALID (git + # shallow depth must be >= 1) — checkout above already used + # fetch-depth: 0, so a full-history fetch is correct here. 
+ git fetch --no-tags origin "+refs/heads/$DEFAULT:refs/remotes/origin/$DEFAULT" + BASE="origin/$DEFAULT" + HEAD="HEAD" + fi + # Fail loudly if either endpoint is not actually present — a silent + # absent ref would otherwise make the diff vacuously "clean". + git cat-file -t "$BASE" >/dev/null 2>&1 || { echo "::error::base ref $BASE not present"; exit 2; } + git cat-file -t "$HEAD" >/dev/null 2>&1 || { echo "::error::head ref $HEAD not present"; exit 2; } + echo "Base: $BASE Head: $HEAD SKIP_REPO_SCAN=$SKIP_REPO_SCAN" + bash scripts/ci-attribution-check.sh "$BASE" "$HEAD" diff --git a/rules/attribution-denylist.txt b/rules/attribution-denylist.txt new file mode 100644 index 0000000..802c239 --- /dev/null +++ b/rules/attribution-denylist.txt @@ -0,0 +1,35 @@ +# Generic agent-attribution deny patterns (ERE, one per line). +# +# Consumed by scripts/agent-attribution-scrub.sh via `grep -E -f`. +# Each line is an extended regular expression matched against a single text line. +# Blank lines and lines beginning with `#` are ignored by the scrubber. +# +# SCOPE OF THIS FILE: generic AI/model/vendor authorship credits ONLY. +# Patterns are anchored to commit-trailer / credit syntax so ordinary prose that +# merely mentions a model name (e.g. "we evaluated Claude vs Gemini") is NOT +# flagged. A bare vendor word on its own line is intentionally not a pattern. +# +# DO NOT add private identity names, personal persona names, or symbolic +# signatures here. Committing those would make this scrubber leak the very names +# it is meant to suppress (epic Decent-Tako/skills-and-governance#68). Private +# patterns are loaded at runtime from a gitignored file — see +# rules/attribution-private-denylist.txt.example and the scrubber header. + +# --- Co-authored-by / Authored-by trailers naming an AI/model/vendor --- +# Matches the trailer at start-of-line (optional leading whitespace) where the +# value references a known AI vendor/model family. 
+^[[:space:]]*[Cc][Oo]-?[Aa][Uu][Tt][Hh][Oo][Rr][Ee][Dd]-?[Bb][Yy]:.*(Claude|Anthropic|GPT|OpenAI|ChatGPT|Codex|Gemini|Google[ -]?(AI|DeepMind)?|Bard|Grok|xAI|Copilot|Llama|Mistral|Cohere|Perplexity|Devin|Cursor) +^[[:space:]]*[Aa][Uu][Tt][Hh][Oo][Rr][Ee][Dd]-?[Bb][Yy]:.*(Claude|Anthropic|GPT|OpenAI|ChatGPT|Codex|Gemini|Bard|Grok|xAI|Copilot|Llama|Mistral|Cohere|Perplexity|Devin|Cursor) + +# Vendor noreply / service e-mail addresses used as a co-author identity. +^[[:space:]]*[Cc][Oo]-?[Aa][Uu][Tt][Hh][Oo][Rr][Ee][Dd]-?[Bb][Yy]:.*<[^>]*@(anthropic|openai|google|deepmind|x\.ai|mistral|cohere)\.[a-z]+> + +# --- "Generated with / Generated by" assistant boilerplate footers --- +# Catches both the punctuated marketing footer and the trailer-style key. +Generated[ -][Ww]ith \[?(Claude|ChatGPT|GPT|Gemini|Grok|Copilot|Codex|Cursor|Devin) +^[[:space:]]*[Gg][Ee][Nn][Ee][Rr][Aa][Tt][Ee][Dd]-?[Bb][Yy]:.*(Claude|Anthropic|GPT|OpenAI|ChatGPT|Codex|Gemini|Grok|xAI|Copilot|Cursor|Devin) +(Co-?authored|written|generated|created|drafted) (with|by) (Claude|ChatGPT|GPT-?[0-9]|OpenAI|Anthropic|Gemini|Grok|Copilot)( Code)?\b + +# --- Explicit model/vendor author credit lines --- +# e.g. "Author: Claude Opus 4.8", "Model: gpt-4o", "Reviewed-by: Gemini". +^[[:space:]]*(Author|Model|Reviewed-?[Bb]y|Signed-?off-?[Bb]y):.*(Claude|Anthropic|GPT-?[0-9]|OpenAI|ChatGPT|Codex|Gemini|Grok|xAI|Copilot) diff --git a/scripts/agent-attribution-scrub.sh b/scripts/agent-attribution-scrub.sh new file mode 100755 index 0000000..de2fbe4 --- /dev/null +++ b/scripts/agent-attribution-scrub.sh @@ -0,0 +1,245 @@ +#!/usr/bin/env bash +# +# agent-attribution-scrub.sh +# +# Strip or reject AI/model/private-identity authorship credits from commit +# messages and repository files. Enforces rules/attribution.md. +# +# This is the shared scrubber behind epic Decent-Tako/skills-and-governance#68 +# (issue #70). 
A commit-msg hook (#71) calls `--check-message` to REJECT a +# disallowed trailer; `--fix-message` removes it in place; `--check-staged` and +# `--check-repo` audit file content. +# +# Deny patterns come from two sources: +# 1. rules/attribution-denylist.txt — GENERIC AI/vendor/model credits, +# committed. Anchored to trailer/credit syntax so ordinary prose mentioning +# a model name is NOT flagged. +# 2. A runtime PRIVATE denylist, loaded IF PRESENT, NEVER committed (so the +# scrubber cannot leak the private names it suppresses). All present sources +# below are combined (patterns from each are applied). Locations: +# a. rules/attribution-private-denylist.txt (repo-local, gitignored) +# b. $HOME/.config/agent-attribution/private-denylist.txt +# c. $AGENT_ATTRIBUTION_PRIVATE_DENYLIST (explicit path override) +# Template: rules/attribution-private-denylist.txt.example +# +# Usage: +# agent-attribution-scrub.sh --check-message +# agent-attribution-scrub.sh --fix-message +# agent-attribution-scrub.sh --check-staged +# agent-attribution-scrub.sh --check-repo +# +# Exit codes: +# 0 clean (no disallowed attribution found) / fix applied +# 1 disallowed attribution found (check modes) +# 2 usage / environment error (bad args, missing file, no denylist) +# +# Portable to macOS bash 3.2 + BSD grep: no associative arrays; patterns are +# iterated via `grep -E -f`, never shell arrays; no GNU-only grep features. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +GENERIC_DENYLIST="${AGENT_ATTRIBUTION_DENYLIST:-$REPO_ROOT/rules/attribution-denylist.txt}" + +# Files excluded from --check-repo / --check-staged content scans: they legitimately +# contain attribution strings (the policy doc's anti-pattern examples, the denylist +# patterns themselves, and the deliberate test fixtures). Without this, the scanner +# would flag its own machinery. Paths are repo-root-relative, matched as ERE. 
+SELF_SCAN_EXCLUDE_RE='^(rules/attribution\.md|rules/attribution-denylist\.txt|rules/attribution-private-denylist\.txt(\.example)?|scripts/agent-attribution-scrub\.sh|hooks/INSTALL\.md|docs/agents/attribution-ci-rollout\.md|tests/test_agent_attribution_scrub\.py|tests/test_ci_attribution_check\.py)$' + +usage() { + sed -n '/^# Usage:/,/^# Portable/p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//;/^Portable/d' +} + +die() { printf 'agent-attribution-scrub: %s\n' "$*" >&2; exit 2; } + +# Echo the list of denylist files to use (generic + any present private ones). +# A missing private file is silently skipped; a missing generic file is fatal. +collect_denylists() { + if [ ! -f "$GENERIC_DENYLIST" ]; then + die "generic denylist not found: $GENERIC_DENYLIST" + fi + printf '%s\n' "$GENERIC_DENYLIST" + + local p + for p in \ + "$REPO_ROOT/rules/attribution-private-denylist.txt" \ + "$HOME/.config/agent-attribution/private-denylist.txt" \ + "${AGENT_ATTRIBUTION_PRIVATE_DENYLIST:-}"; do + if [ -n "$p" ] && [ -f "$p" ]; then + printf '%s\n' "$p" + fi + done +} + +# Build a single cleaned pattern file (comments + blank lines stripped) into the +# global PATTERN_FILE. grep -f treats EVERY line of a pattern file as a regex, +# including `#` comments and blank lines — and a blank line matches everything — +# so the raw denylists must never be handed to grep directly. The caller is +# responsible for `rm -f "$PATTERN_FILE"` when done (or rely on its mktemp slot). +PATTERN_FILE="" +build_pattern_file() { + PATTERN_FILE="$(mktemp "${TMPDIR:-/tmp}/attr-scrub-pat.XXXXXX")" + local f + while IFS= read -r f; do + [ -n "$f" ] || continue + # Drop blank lines and # comments; keep real patterns. + grep -v -E '^[[:space:]]*(#|$)' "$f" >>"$PATTERN_FILE" || true + done <" + echo " - File content: delete the attribution credit, or attribute to Ben /" + echo " decent.tech per rules/attribution.md." 
+    echo " - Policy: rules/attribution.md (private identities + AI trailers stripped"
+    echo " from client-touching surfaces)."
+  } >&2
+}
+
+# --- mode: --check-message ---------------------------------------
+# Non-zero exit if the message text contains disallowed attribution.
+#
+# Arguments:
+#   $1  path to a commit-message file, or '-' to read the message from stdin
+# Returns:
+#   0  no deny pattern matched any line of the message
+#   1  at least one line matched (numbered hits + remediation go to stderr)
+# Exits 2 (via die) when the argument/file is missing or grep itself fails.
+check_message() {
+  local src="${1:-}"
+  [ -n "$src" ] || die "--check-message requires a file or '-'"
+  local text
+  if [ "$src" = "-" ]; then
+    text="$(cat)"
+  else
+    [ -f "$src" ] || die "message file not found: $src"
+    text="$(cat "$src")"
+  fi
+
+  build_pattern_file
+  local hits
+  # grep returns 1 on no-match; capture without tripping set -e.
+  set +e
+  hits="$(printf '%s\n' "$text" | grep -n -E -f "$PATTERN_FILE")"
+  local rc=$?
+  set -e
+  rm -f "$PATTERN_FILE"
+
+  # grep status >1 is an ERROR (e.g. an invalid ERE in a denylist), not
+  # "no match". Treating it as clean would make the gate fail open, so abort
+  # with the environment-error exit code, the same way fix_message does.
+  if [ "$rc" -gt 1 ]; then
+    die "grep failed while checking commit message (invalid denylist pattern?)"
+  fi
+
+  if [ "$rc" -eq 0 ]; then
+    echo "DISALLOWED attribution in commit message:" >&2
+    printf '%s\n' "$hits" | sed 's/^/ /' >&2
+    print_remediation
+    return 1
+  fi
+  return 0
+}
+
+# --- mode: --fix-message -------------------------------------------
+# Remove disallowed attribution lines in place; leave body intact. Idempotent.
+fix_message() {
+  local file="${1:-}"
+  [ -n "$file" ] || die "--fix-message requires a file"
+  [ -f "$file" ] || die "message file not found: $file"
+
+  build_pattern_file
+  local tmp
+  tmp="$(mktemp "${TMPDIR:-/tmp}/attr-scrub.XXXXXX")"
+
+  # grep -v of the deny patterns keeps every non-attribution line. -v exits 0 as
+  # long as >=1 line survives; guard set -e and treat "all lines stripped" (rc 1)
+  # as an empty result rather than a failure.
+  set +e
+  grep -v -E -f "$PATTERN_FILE" "$file" >"$tmp"
+  local rc=$?
+  set -e
+  rm -f "$PATTERN_FILE"
+  if [ "$rc" -gt 1 ]; then
+    rm -f "$tmp"
+    die "grep failed while filtering $file"
+  fi
+
+  # Collapse a trailing run of blank lines left where a trailer block was, then
+  # ensure exactly one terminating newline.
+  awk 'BEGIN{n=0} {lines[NR]=$0} END{
+    last=NR
+    while (last>0 && lines[last] ~ /^[[:space:]]*$/) last--
+    for(i=1;i<=last;i++) print lines[i]
+  }' "$tmp" >"$tmp.2"
+
+  mv "$tmp.2" "$file"
+  rm -f "$tmp"
+  return 0
+}
+
+# --- shared content scanner ------------------------------------------------
+# Scan a newline-delimited list of repo-relative file paths (on stdin) for
+# disallowed attribution, honouring SELF_SCAN_EXCLUDE_RE. Returns 1 on any hit.
+#
+# Input:   stdin, one repo-root-relative path per line (blank lines ignored)
+# Output:  per offending file, a "DISALLOWED attribution in <path>:" header
+#          followed by the numbered matching lines, all on stderr
+# Returns: 0 when no scanned file matched, 1 when at least one did.
+# Exits 2 (via die) when grep itself fails on a file.
+scan_files() {
+  build_pattern_file
+  local found=1 # 1 == clean (mirrors grep: we flip at the end)
+  local rel abs hits rc
+  while IFS= read -r rel; do
+    [ -n "$rel" ] || continue
+    # Skip our own machinery + the policy doc.
+    if printf '%s\n' "$rel" | grep -q -E "$SELF_SCAN_EXCLUDE_RE"; then
+      continue
+    fi
+    abs="$REPO_ROOT/$rel"
+    [ -f "$abs" ] || continue
+    set +e
+    # -I: treat binary files as non-matching so coincidental bytes in images,
+    # compiled artifacts, etc. cannot produce false "Binary file matches" hits.
+    hits="$(grep -I -n -E -f "$PATTERN_FILE" "$abs")"
+    rc=$?
+    set -e
+    # grep status >1 is an ERROR (invalid denylist pattern, unreadable file),
+    # not "no match". Counting it as clean would let the gate fail open, so
+    # clean up and abort with the environment-error exit code instead.
+    if [ "$rc" -gt 1 ]; then
+      rm -f "$PATTERN_FILE"
+      die "grep failed while scanning $rel"
+    fi
+    if [ "$rc" -eq 0 ]; then
+      found=0
+      echo "DISALLOWED attribution in $rel:" >&2
+      printf '%s\n' "$hits" | sed 's/^/ /' >&2
+    fi
+  done
+  rm -f "$PATTERN_FILE"
+
+  if [ "$found" -eq 0 ]; then
+    print_remediation
+    return 1
+  fi
+  return 0
+}
+
+# --- mode: --check-staged --------------------------------------------------
+check_staged() {
+  cd "$REPO_ROOT"
+  git rev-parse --git-dir >/dev/null 2>&1 || die "--check-staged must run inside a git repo"
+  # Added/copied/modified/renamed staged files (text content only).
+  git diff --cached --name-only --diff-filter=ACMR | scan_files
+}
+
+# --- mode: --check-repo ----------------------------------------------------
+check_repo() {
+  cd "$REPO_ROOT"
+  git rev-parse --git-dir >/dev/null 2>&1 || die "--check-repo must run inside a git repo"
+  # All tracked files in the working tree (ignored/untracked excluded by design).
+ git ls-files | scan_files +} + +main() { + local mode="${1:-}" + case "$mode" in + --check-message) shift; check_message "${1:-}" ;; + --fix-message) shift; fix_message "${1:-}" ;; + --check-staged) shift; check_staged ;; + --check-repo) shift; check_repo ;; + -h|--help|help|"") usage; [ -n "$mode" ] && exit 0 || exit 2 ;; + *) die "unknown mode: $mode (try --help)" ;; + esac +} + +main "$@" diff --git a/scripts/ci-attribution-check.sh b/scripts/ci-attribution-check.sh new file mode 100755 index 0000000..b038370 --- /dev/null +++ b/scripts/ci-attribution-check.sh @@ -0,0 +1,172 @@ +#!/usr/bin/env bash +# +# ci-attribution-check.sh +# +# CI-side wrapper around agent-attribution-scrub.sh. Runs the shared scrubber in +# CHECK-ONLY mode over (a) every commit message in a PR range and (b) the file +# content of the tree, FAILING the job with precise remediation if any +# disallowed AI/model/private-identity attribution is present. +# +# This wrapper NEVER rewrites a commit object. It only reads commit messages +# (`git log --format=%B`) and tracked file content. Remote history is untouched; +# the remediation is performed by the human/agent locally (commit --amend / +# interactive rebase), not by CI. +# +# Issue: Decent-Tako/skills-and-governance#72 (CI + ruleset attribution +# enforcement), child of epic #68. +# +# Usage: +# ci-attribution-check.sh +# +# merge-base side of the PR (e.g. origin/main or +# github.event.pull_request.base.sha) +# PR tip (e.g. github.event.pull_request.head.sha or HEAD) +# +# Environment: +# SCRUB path to agent-attribution-scrub.sh +# (default: alongside this script in scripts/) +# SKIP_REPO_SCAN if set to "1", skip the file-content tree scan and check +# only commit messages. Use this in repos that have not yet +# been cleaned of PRE-EXISTING attribution in files, so the +# gate still blocks NEW leaks in commit messages without +# failing on unrelated legacy content. (See rollout doc.) 
+# +# Exit codes: +# 0 clean +# 1 disallowed attribution found (commit message and/or file content) +# 2 usage / environment error +# +# Portable to bash 3.2 + BSD/GNU grep (no associative arrays, no mapfile). + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRUB="${SCRUB:-$SCRIPT_DIR/agent-attribution-scrub.sh}" + +die() { printf 'ci-attribution-check: %s\n' "$*" >&2; exit 2; } + +[ $# -eq 2 ] || die "usage: ci-attribution-check.sh " +BASE_REF="$1" +HEAD_REF="$2" + +[ -x "$SCRUB" ] || die "scrubber not found or not executable: $SCRUB" + +# Single reusable scratch file for per-commit scrubber output; cleaned on exit. +tmp="$(mktemp "${TMPDIR:-/tmp}/attr-msg.XXXXXX")" +trap 'rm -f "$tmp"' EXIT + +fail=0 + +# --------------------------------------------------------------------------- +# (a) Commit messages across the PR range. +# +# We resolve the symmetric-difference base so a PR that lags main does not have +# main's commits re-scanned. Every commit is checked; we DO NOT stop at the +# first offender — "precise remediation" means naming every bad commit. +# --------------------------------------------------------------------------- +set +e +range_base="$(git merge-base "$BASE_REF" "$HEAD_REF" 2>&1)" +mb_rc=$? +set -e +if [ "$mb_rc" -ne 0 ]; then + die "cannot compute merge-base of '$BASE_REF' and '$HEAD_REF' (shallow checkout? need fetch-depth: 0): ${range_base}" +fi + +commits="$(git rev-list "$range_base..$HEAD_REF")" + +if [ -z "$commits" ]; then + echo "No commits in range $BASE_REF..$HEAD_REF — nothing to check for commit messages." +else + echo "Checking commit messages in range ${range_base}..${HEAD_REF}" + bad_commits="" + # Iterate via heredoc-fed `read` (NOT `for sha in $commits`, which word-splits + # on IFS, nor `… | while`, whose subshell would discard fail/bad_commits). + while IFS= read -r sha || [ -n "$sha" ]; do + [ -n "$sha" ] || continue + # Feed the full commit message to the scrubber on stdin (check-only). 
+ if ! git log -1 --format=%B "$sha" | "$SCRUB" --check-message - >"$tmp" 2>&1; then + short="$(git log -1 --format=%s "$sha")" + echo "" + echo "::error::Disallowed attribution in commit message of $sha" + echo " commit: $sha ${short}" + sed 's/^/ /' "$tmp" + bad_commits="$bad_commits $sha" + fail=1 + fi + done <:" headers on stderr). + set +e + "$SCRUB" --check-repo >"$tmp" 2>&1 + repo_rc=$? + set -e + cat "$tmp" + if [ "$repo_rc" -ne 0 ]; then + # One ::error:: per named file, mirroring the commit-message path's GHA + # annotations; fall back to a generic line if no file header was parsed. + named=0 + while IFS= read -r f; do + [ -n "$f" ] || continue + echo "::error file=$f::Disallowed attribution in tracked file: $f" + named=1 + done <&2 <<'EOF' + +============================================================================ +ATTRIBUTION CHECK FAILED — this PR carries disallowed AI / model / private- +identity attribution. CI is CHECK-ONLY and will not rewrite your commits. + +Fix it locally, then force-push the branch: + + Commit MESSAGE trailers (e.g. "Co-Authored-By: ", "Generated with ..."): + - Newest commit: git commit --amend (delete the trailer line) + - Older commit(s): git rebase -i (reword each flagged commit) + or use the scrubber's in-place fixer on a message file (run the vendored + copy of agent-attribution-scrub.sh that ships alongside this script): + agent-attribution-scrub.sh --fix-message + + FILE content: + - Delete the attribution credit, or attribute to Ben / decent.tech + per rules/attribution.md. + +Policy: rules/attribution.md. This gate exists because commit messages are +immutable once merged — strip attribution on the way IN, not after. +============================================================================ +EOF + exit 1 +fi + +echo "" +echo "Attribution check passed." +exit 0