
name: Sync Knowledge
on:
  schedule:
    - cron: '0 8 * * 1-5' # Weekday mornings at 08:00 UTC
  workflow_dispatch:
    inputs:
      source_id:
        description: 'Sync a specific source by ID (leave empty for all)'
        type: string
        required: false
        default: ''
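      # Manual-run sketch (workflow and input names taken from this file;
      # the <id> value is a placeholder for an entry in sync-sources.json):
      #   gh workflow run "Sync Knowledge" -f source_id=<id>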
permissions:
  contents: write
  pull-requests: write
jobs:
  sync-repo-knowledge:
    runs-on: ubuntu-latest
    env:
      FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Sync sources
        id: sync
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE_FILTER: ${{ inputs.source_id || '' }}
          WORKFLOW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          set -euo pipefail
          CONFIG=".github/sync-sources.json"
          STATE=".github/sync-state.json"
          HAS_CHANGES="false"
          PR_TITLE=""
          PR_BODY=""
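          # Rough shape of the two JSON files, inferred from the jq queries in this
          # workflow (values are illustrative placeholders, not real data):
          #   sync-sources.json: { "sources": [ { "id": "...", "repo": "owner/name",
          #     "branch": "main", "sourcePath": "...", "targetPath": "...",
          #     "exclude": [...], "generate": {...}, "derivedFiles": [...] } ] }
          #   sync-state.json:   { "sources": { "<id>": { "lastSyncedCommit": "<sha>",
          #     "lastSyncedAt": "<timestamp>" } } }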
          # Read all source entries (or filter to one)
          if [ -n "$SOURCE_FILTER" ]; then
            sources=$(jq -c --arg id "$SOURCE_FILTER" '.sources[] | select(.id == $id)' "$CONFIG")
          else
            sources=$(jq -c '.sources[]' "$CONFIG")
          fi
          if [ -z "$sources" ]; then
            echo "No matching sources found."
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
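          # Note: the pipe below runs the loop in a subshell, so per-source results are
          # handed to later steps via $STATE and /tmp files rather than shell variables.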
echo "$sources" | while IFS= read -r source; do
id=$(echo "$source" | jq -r '.id')
repo=$(echo "$source" | jq -r '.repo')
branch=$(echo "$source" | jq -r '.branch')
source_path=$(echo "$source" | jq -r '.sourcePath')
target_path=$(echo "$source" | jq -r '.targetPath')
generate_enabled=$(echo "$source" | jq -r '.generate.enabled // false')
echo "::group::Processing source: $id"
echo " repo=$repo branch=$branch"
echo " source=$source_path -> target=$target_path"
# --- Get latest commit SHA ---
latest_sha=$(gh api "repos/$repo/commits/$branch" --jq '.sha')
echo " Latest commit: $latest_sha"
# --- Read last synced SHA ---
last_sha=$(jq -r --arg id "$id" '.sources[$id].lastSyncedCommit // ""' "$STATE")
echo " Last synced: ${last_sha:-<never>}"
# --- Skip if unchanged ---
if [ "$latest_sha" = "$last_sha" ]; then
echo " No new commits. Skipping."
echo "::endgroup::"
continue
fi
# --- Sparse checkout source repo ---
src_dir=$(mktemp -d)
echo " Cloning $repo (sparse: $source_path)..."
git clone --filter=blob:none --sparse --branch "$branch" --depth 1 \
"https://github.com/$repo.git" "$src_dir" 2>/dev/null
(cd "$src_dir" && git sparse-checkout set "$source_path")
# --- Build rsync exclude args ---
exclude_args=""
for pattern in $(echo "$source" | jq -r '.exclude[]? // empty'); do
exclude_args="$exclude_args --exclude=$pattern"
done
# --- Sync files ---
mkdir -p "$target_path"
rsync -av --delete $exclude_args "$src_dir/$source_path/" "$target_path/"
rm -rf "$src_dir"
# --- Get commit log ---
commit_log=""
if [ -n "$last_sha" ]; then
echo " Fetching commit log ${last_sha:0:7}..${latest_sha:0:7}..."
commit_log=$(gh api "repos/$repo/compare/${last_sha}...${latest_sha}" \
--jq '[.commits[] | "- [`\(.sha[0:7])`](\(.html_url)) \(.commit.message | split("\n") | .[0])"] | join("\n")' \
2>/dev/null || echo " (Could not fetch commit log)")
              # Filter to commits that touch sourcePath
              # (gh api's --jq flag does not accept jq's --arg, so pipe the response through jq)
              changed_files=$(gh api "repos/$repo/compare/${last_sha}...${latest_sha}" 2>/dev/null \
                | jq --arg sp "$source_path" '[.files[] | select(.filename | startswith($sp)) | .filename] | length' \
                || echo "?")
            else
              commit_log="Initial sync from $repo ($branch)"
              changed_files="all"
            fi
            # --- Update sync state ---
            now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
            jq --arg id "$id" --arg sha "$latest_sha" --arg ts "$now" \
              '.sources[$id] = {"lastSyncedCommit": $sha, "lastSyncedAt": $ts}' \
              "$STATE" > "${STATE}.tmp" && mv "${STATE}.tmp" "$STATE"
            echo " Synced. Updated state to $latest_sha"
            # --- Build PR metadata ---
            # Count commits
            if [ -n "$last_sha" ]; then
              commit_count=$(gh api "repos/$repo/compare/${last_sha}...${latest_sha}" \
                --jq '.commits | length' 2>/dev/null || echo "?")
            else
              commit_count="initial"
            fi
            # Save per-source PR info to temp files (for aggregation after loop)
            echo "$id" >> /tmp/sync_changed_ids
            {
              echo "## Source: \`$id\`"
              echo ""
              echo "**Repository:** [$repo](https://github.com/$repo) (branch: \`$branch\`)"
              echo "**Path:** \`$source_path\` → \`$target_path\`"
              echo "**Commits:** $commit_count new (${last_sha:0:7}..${latest_sha:0:7})"
              echo "**Files in source path changed:** $changed_files"
              echo ""
              echo "### Commit Log"
              echo ""
              echo "$commit_log"
              echo ""
              echo "---"
            } >> "/tmp/sync_pr_body_$id"
            echo "::endgroup::"
          done
          # --- Aggregate results ---
          if [ -f /tmp/sync_changed_ids ]; then
            changed_ids=$(tr '\n' ',' < /tmp/sync_changed_ids | sed 's/,$//')
            count=$(wc -l < /tmp/sync_changed_ids | tr -d ' ')
            # Combine PR bodies
            full_body="# Knowledge Sync Report"$'\n\n'
            full_body+="**Synced sources:** $changed_ids"$'\n'
            full_body+="**Workflow run:** [View run]($WORKFLOW_RUN_URL)"$'\n\n'
            for id_file in /tmp/sync_pr_body_*; do
              full_body+=$(cat "$id_file")
              full_body+=$'\n'
            done
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
            echo "pr_title=[Knowledge Sync] $changed_ids: $count source(s) updated" >> "$GITHUB_OUTPUT"
            # Write PR body to file (too long for env var)
            echo "$full_body" > /tmp/pr_body.md
          else
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          fi
      # --- Generate experts for sources that need it ---
      - name: Setup Node.js
        if: steps.sync.outputs.has_changes == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: '22'
      - name: Generate experts (if configured)
        if: steps.sync.outputs.has_changes == 'true'
        env:
          GH_TOKEN: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
          SOURCE_FILTER: ${{ inputs.source_id || '' }}
        run: |
          set -euo pipefail
          CONFIG=".github/sync-sources.json"
          # Find sources with generate.enabled = true
          if [ -n "$SOURCE_FILTER" ]; then
            gen_sources=$(jq -c --arg id "$SOURCE_FILTER" \
              '.sources[] | select(.id == $id and .generate.enabled == true)' "$CONFIG")
          else
            gen_sources=$(jq -c '.sources[] | select(.generate.enabled == true)' "$CONFIG")
          fi
          if [ -z "$gen_sources" ]; then
            echo "No sources require expert generation."
            exit 0
          fi
          # Install Copilot CLI
          echo "Installing GitHub Copilot CLI..."
          npm install -g @github/copilot
echo "$gen_sources" | while IFS= read -r source; do
id=$(echo "$source" | jq -r '.id')
target_path=$(echo "$source" | jq -r '.targetPath')
template_path=$(echo "$source" | jq -r '.generate.templatePath')
prompt_path=$(echo "$source" | jq -r '.generate.promptPath')
output_path=$(echo "$source" | jq -r '.generate.outputPath')
skill_name=$(echo "$source" | jq -r '.generate.skillName // .id')
echo "::group::Generating experts for: $id"
# Read prompt template and perform variable substitution
prompt_template=$(cat "$prompt_path")
# List synced source files for context
source_files=$(find "$target_path" -type f -name '*.md' | head -200 | sort)
# Substitute template variables
prompt="${prompt_template//\{\{SOURCE_PATH\}\}/$target_path}"
prompt="${prompt//\{\{OUTPUT_PATH\}\}/$output_path}"
prompt="${prompt//\{\{TEMPLATE_PATH\}\}/$template_path}"
prompt="${prompt//\{\{SKILL_NAME\}\}/$skill_name}"
# Append file listing
prompt="$prompt"$'\n\n'"## Source files found:"$'\n'"$source_files"
echo " Running Copilot CLI to generate experts..."
mkdir -p "$output_path"
# Run Copilot CLI — graceful degradation on failure
copilot -p "$prompt" --allow-all-tools 2>&1 || {
echo "::warning::Copilot CLI failed for $id. PR will include synced content only."
}
echo "::endgroup::"
done
      # --- Assess and update derived files ---
      - name: Assess derived files
        id: derived
        if: steps.sync.outputs.has_changes == 'true'
        env:
          GH_TOKEN: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
          HAS_PAT: ${{ secrets.PAT && 'true' || 'false' }}
          SOURCE_FILTER: ${{ inputs.source_id || '' }}
        run: |
          set -euo pipefail
          CONFIG=".github/sync-sources.json"
          PROMPT_TEMPLATE=".github/templates/assess-derived-file-prompt.md"
          VALIDATE_SCRIPT=".github/scripts/validate-derived.sh"
          chmod +x "$VALIDATE_SCRIPT"
          # Collect sources that had changes (written by sync step)
          if [ ! -f /tmp/sync_changed_ids ]; then
            echo "No changed sources. Skipping derived file assessment."
            exit 0
          fi
          # Check if Copilot CLI is available (requires PAT)
          HAS_COPILOT="false"
          if [ "$HAS_PAT" = "true" ]; then
            if command -v copilot &>/dev/null; then
              HAS_COPILOT="true"
            else
              echo "Installing GitHub Copilot CLI..."
              npm install -g @github/copilot 2>/dev/null && HAS_COPILOT="true" || true
            fi
          fi
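          # If the CLI is still unavailable, each derived file below is recorded as
          # skipped in the report rather than failing the job.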
          # Initialize derived files report
          echo "## Derived Files" > /tmp/derived_report.md
          echo "" >> /tmp/derived_report.md
          echo "| File | Status | Details |" >> /tmp/derived_report.md
          echo "|------|--------|---------|" >> /tmp/derived_report.md
          derived_updated="false"
          while IFS= read -r changed_id; do
            # Get derivedFiles for this source
            derived_files=$(jq -c --arg id "$changed_id" \
              '.sources[] | select(.id == $id) | .derivedFiles[]?' "$CONFIG")
            [ -z "$derived_files" ] && continue
            echo "$derived_files" | while IFS= read -r df; do
              df_path=$(echo "$df" | jq -r '.path')
              df_desc=$(echo "$df" | jq -r '.description')
              context_paths=$(echo "$df" | jq -r '.contextPaths[]')
              echo "::group::Assessing derived file: $df_path"
              if [ ! -f "$df_path" ]; then
                echo " Derived file does not exist. Skipping."
                echo "| \`$df_path\` | ⏭️ Skipped | File does not exist |" >> /tmp/derived_report.md
                echo "::endgroup::"
                continue
              fi
              if [ "$HAS_COPILOT" != "true" ]; then
                echo " Copilot CLI not available. Skipping assessment."
                echo "| \`$df_path\` | ⏭️ Skipped | Copilot CLI not available (set PAT secret) |" >> /tmp/derived_report.md
                echo "::endgroup::"
                continue
              fi
              # Build directory listing from contextPaths
              dir_listing=""
              for ctx in $context_paths; do
                if [ -d "$ctx" ]; then
                  dir_listing+="$ctx/"$'\n'
                  dir_listing+=$(find "$ctx" -type f -name '*.md' | sort | sed "s|^| |")
                  dir_listing+=$'\n'
                fi
              done
              # Get changed files from git diff (staged + unstaged)
              changed_files_list=$(git diff --name-only HEAD -- 2>/dev/null || echo "(initial sync)")
              # Read current derived file
              current_content=$(cat "$df_path")
              # Build prompt from template
              prompt=$(cat "$PROMPT_TEMPLATE")
              prompt="${prompt//\{\{DERIVED_PATH\}\}/$df_path}"
              prompt="${prompt//\{\{DESCRIPTION\}\}/$df_desc}"
              prompt="${prompt//\{\{CURRENT_CONTENT\}\}/$current_content}"
              prompt="${prompt//\{\{DIR_LISTING\}\}/$dir_listing}"
              prompt="${prompt//\{\{CHANGED_FILES\}\}/$changed_files_list}"
              # Back up current file
              cp "$df_path" "${df_path}.bak"
              # Run Copilot CLI for assessment
              echo " Running Copilot CLI assessment..."
              copilot_output=$(copilot -p "$prompt" --allow-all-tools 2>&1) || {
                echo "::warning::Copilot CLI failed for $df_path. Keeping existing file."
                echo "| \`$df_path\` | ⚠️ Error | Copilot CLI failed |" >> /tmp/derived_report.md
                rm -f "${df_path}.bak"
                echo "::endgroup::"
                continue
              }
              # Parse verdict
              if echo "$copilot_output" | grep -q "VERDICT: COMPATIBLE"; then
                echo " Derived file is compatible. No changes needed."
                echo "| \`$df_path\` | ✅ Compatible | No changes needed |" >> /tmp/derived_report.md
                rm -f "${df_path}.bak"
                echo "::endgroup::"
                continue
              fi
              # Extract regenerated content
              if echo "$copilot_output" | grep -q "REGENERATED_FILE_START"; then
                regen_content=$(echo "$copilot_output" | sed -n '/REGENERATED_FILE_START/,/REGENERATED_FILE_END/p' | sed '1d;$d')
                if [ -z "$regen_content" ]; then
                  echo "::warning::Empty regeneration output for $df_path. Keeping existing file."
                  echo "| \`$df_path\` | ⚠️ Error | Empty regeneration output |" >> /tmp/derived_report.md
                  mv "${df_path}.bak" "$df_path"
                  echo "::endgroup::"
                  continue
                fi
                # Write regenerated content
                echo "$regen_content" > "$df_path"
                # Validate
                if bash "$VALIDATE_SCRIPT" "$df_path"; then
                  echo " Regenerated and validated successfully."
                  # Get a short summary of changes
                  diff_stat=$(diff --brief "${df_path}.bak" "$df_path" 2>/dev/null && echo "identical" || echo "updated")
                  echo "| \`$df_path\` | ✅ Updated | Regenerated and validated |" >> /tmp/derived_report.md
                  rm -f "${df_path}.bak"
                  echo "true" > /tmp/derived_updated
                else
                  echo "::warning::Validation failed for regenerated $df_path. Reverting to previous version."
                  mv "${df_path}.bak" "$df_path"
                  echo "| \`$df_path\` | ⚠️ Kept old | Validation failed on regenerated version |" >> /tmp/derived_report.md
                fi
              else
                echo " Verdict was INCOMPATIBLE but no regenerated content found."
                echo "| \`$df_path\` | ⚠️ Kept old | Incompatible but regeneration not produced |" >> /tmp/derived_report.md
                mv "${df_path}.bak" "$df_path"
              fi
              echo "::endgroup::"
            done
          done < /tmp/sync_changed_ids
          # Append derived report to PR body
          if [ -f /tmp/pr_body.md ] && [ -f /tmp/derived_report.md ]; then
            echo "" >> /tmp/pr_body.md
            cat /tmp/derived_report.md >> /tmp/pr_body.md
          fi
          # Safety cleanup: remove any stray .bak files
          find . -name '*.bak' -delete 2>/dev/null || true
      # --- Create Pull Request ---
      - name: Create Pull Request
        if: steps.sync.outputs.has_changes == 'true'
        uses: peter-evans/create-pull-request@v7
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: 'sync: update knowledge from upstream sources'
          branch: sync/knowledge
          delete-branch: true
          title: ${{ steps.sync.outputs.pr_title }}
          body-path: /tmp/pr_body.md
          labels: knowledge-sync