diff --git a/plugins/arckit-claude/hooks/autoresearch-tracer.mjs b/plugins/arckit-claude/hooks/autoresearch-tracer.mjs
index f3460171..1e91f7cc 100644
--- a/plugins/arckit-claude/hooks/autoresearch-tracer.mjs
+++ b/plugins/arckit-claude/hooks/autoresearch-tracer.mjs
@@ -13,7 +13,8 @@
  * Output: JSON with trace data (stored to file, not returned)
  */
 
-import { writeFileSync, mkdirSync, existsSync } from 'node:fs';
+import { writeFileSync, mkdirSync, existsSync, readdirSync, readFileSync } from 'node:fs';
+import { execSync } from 'node:child_process';
 import { join, dirname } from 'node:path';
 import { fileURLToPath } from 'node:url';
 
@@ -56,7 +57,7 @@ export function captureAutoresearchTrace(input) {
     verifier: input.verifier || {},
     metadata: {
       traceId: `iter-${input.iteration || 0}`,
-      worktree: process.env.AUTORESEARCH_WORTREE || process.cwd(),
+      worktree: process.env.AUTORESEARCH_WORKTREE || process.env.AUTORESEARCH_WORTREE || process.cwd(),
       gitCommit: getGitCommit()
     }
   };
@@ -72,8 +73,10 @@ export function captureAutoresearchTrace(input) {
  */
 function getGitCommit() {
   try {
-    const { execSync } = require('node:child_process');
-    return execSync('git rev-parse --short HEAD', { encoding: 'utf8' }).trim();
+    return execSync('git rev-parse --short HEAD', {
+      encoding: 'utf8',
+      stdio: ['ignore', 'pipe', 'ignore']
+    }).trim();
   } catch {
     return 'unknown';
   }
@@ -135,12 +138,10 @@ export function loadAllTraces(target, mode) {
     return [];
   }
 
-  const { readdirSync } = require('node:fs');
   const files = readdirSync(tracesDir).filter(f => f.endsWith('.json'));
 
   return files.map(file => {
     try {
-      const { readFileSync } = require('node:fs');
       return JSON.parse(readFileSync(join(tracesDir, file), 'utf8'));
     } catch {
       return null;
@@ -152,17 +153,35 @@ export function loadAllTraces(target, mode) {
  * Main entry point for hook execution
  * Reads from stdin, processes, saves trace
+ *
+ * stdin itself is read by main(); this function parses the raw payload and saves the trace.
+ * @param {string} raw - Raw JSON text of the hook payload
+ * @returns {{traceSaved: boolean, traceId: string}|null} Save summary, or null when raw is blank
+ * @throws {SyntaxError} When raw is not valid JSON
  */
-function main() {
-  let input;
-  try {
-    input = JSON.parse(require('node:fs').readFileSync(0, 'utf8'));
-  } catch {
-    // No input, exit silently
-    return;
+export function processTraceInput(raw) {
+  if (!raw.trim()) {
+    return null;
   }
 
+  const input = JSON.parse(raw);
   const trace = captureAutoresearchTrace(input);
-  console.log(JSON.stringify({ traceSaved: true, traceId: trace.metadata.traceId }));
+  return { traceSaved: true, traceId: trace.metadata.traceId };
+}
+
+/**
+ * CLI entry point: reads the payload from stdin, prints the save result, exits 1 on failure
+ */
+function main() {
+  try {
+    const result = processTraceInput(readFileSync(0, 'utf8'));
+    if (result) {
+      console.log(JSON.stringify(result));
+    }
+  } catch (error) {
+    const reason = error instanceof SyntaxError ? 'Invalid JSON input' : 'Trace capture failed';
+    console.error(`${reason}: ${error.message}`);
+    process.exit(1);
+  }
 }
 
 // Run if executed directly
diff --git a/plugins/arckit-claude/hooks/harness-proposer.mjs b/plugins/arckit-claude/hooks/harness-proposer.mjs
index faba8228..218ced0d 100644
--- a/plugins/arckit-claude/hooks/harness-proposer.mjs
+++ b/plugins/arckit-claude/hooks/harness-proposer.mjs
@@ -435,7 +435,7 @@ function generateToolProposal(proposal, currentContent, mechanism) {
       break;
     default:
       proposal.changes = [{
-        type: 'add',
+        type: 'add_tool',
         tools: ['WebSearch'],
         content: 'Enable WebSearch for external research'
       }];
@@ -699,14 +699,37 @@ function modifyToolConfig(content, change) {
   const config = JSON.parse(content);
 
   switch (change.type) {
+    case 'restrict':
+      config.toolRestrictions = config.toolRestrictions || {};
+      change.tools.forEach(t => {
+        config.toolRestrictions[t] = {
+          action: change.action || 'require_justification',
+          reason: change.content || ''
+        };
+      });
+      break;
     case 'disable':
       if (config.mcpServers) {
-        config.mcpServers = config.mcpServers.filter(s =>
-          !change.tools.includes(s));
+        if (Array.isArray(config.mcpServers)) {
+          config.mcpServers = config.mcpServers.filter(s =>
+            !change.tools.includes(s));
+        } else {
+          change.tools.forEach(t => {
+            delete config.mcpServers[t];
+          });
+        }
       }
+      config.disabledTools = Array.from(new Set([
+        ...(config.disabledTools || []),
+        ...change.tools
+      ]));
       break;
     case 'add_tool':
-      if (config.mcpServers) {
+      config.allowedTools = Array.from(new Set([
+        ...(config.allowedTools || []),
+        ...change.tools
+      ]));
+      if (Array.isArray(config.mcpServers)) {
         change.tools.forEach(t => {
           if (!config.mcpServers.includes(t)) {
             config.mcpServers.push(t);
diff --git a/plugins/arckit-claude/hooks/harness-validator.mjs b/plugins/arckit-claude/hooks/harness-validator.mjs
index 913ce83f..bd265ce0 100644
--- a/plugins/arckit-claude/hooks/harness-validator.mjs
+++ b/plugins/arckit-claude/hooks/harness-validator.mjs
@@ -13,7 +13,7 @@
  * This module runs regression tests on held-out tasks to ensure robust improvement.
  */
 
-import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
+import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from 'node:fs';
 import { join, dirname } from 'node:path';
 import { fileURLToPath } from 'node:url';
 
@@ -29,6 +29,7 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
  * @param {Array} options.heldInTasks - Held-in task IDs
  * @param {Array} options.heldOutTasks - Held-out task IDs
  * @param {string} options.candidateHarnessPath - Path to candidate harness
+ * @param {Object} options.taskResults - Pre-scored validation results keyed by split/task
  * @param {Object} options.baselineScores - Baseline scores {heldIn: X.X, heldOut: Y.Y}
  * @param {number} options.minDelta - Minimum delta for acceptance (default: 0.3)
  * @returns {Object} Validation result
@@ -40,6 +41,7 @@ export function validateHarness(options) {
     heldInTasks = [],
     heldOutTasks = [],
     candidateHarnessPath,
+    taskResults = {},
     baselineScores = { heldIn: 0, heldOut: 0 },
     minDelta = 0.3,
     iteration
@@ -62,16 +64,31 @@ export function validateHarness(options) {
   // Validate held-in tasks
   const heldInResults = [];
   for (const task of heldInTasks) {
-    const taskResult = executeAndScoreTask(task, candidateHarnessPath, mode, command);
+    const taskResult = executeAndScoreTask(task, {
+      split: 'heldIn',
+      taskResults,
+      candidateHarnessPath,
+      mode,
+      command
+    });
     heldInResults.push(taskResult);
   }
 
   // Validate held-out tasks
   const heldOutResults = [];
   for (const task of heldOutTasks) {
-    const taskResult = executeAndScoreTask(task, candidateHarnessPath, mode, command);
+    const taskResult = executeAndScoreTask(task, {
+      split: 'heldOut',
+      taskResults,
+      candidateHarnessPath,
+      mode,
+      command
+    });
     heldOutResults.push(taskResult);
   }
+
+  const missingScores = [...heldInResults, ...heldOutResults]
+    .filter(r => !Number.isFinite(r.score));
 
   // Calculate average scores
   const heldInScores = heldInResults.map(r => r.score).filter(s => s !== null && s !== undefined);
@@ -87,6 +104,23 @@ export function validateHarness(options) {
   // Calculate deltas
   result.deltas.heldIn = result.candidateScores.heldIn - baselineScores.heldIn;
   result.deltas.heldOut = result.candidateScores.heldOut - baselineScores.heldOut;
+
+  if (heldInTasks.length === 0 || heldOutTasks.length === 0) {
+    result.reason = 'Rejected: held-in and held-out task lists are both required';
+    result.heldInResults = heldInResults;
+    result.heldOutResults = heldOutResults;
+    saveValidationResult(result);
+    return result;
+  }
+
+  if (missingScores.length > 0) {
+    const taskList = missingScores.map(r => `${r.split}:${r.taskId}`).join(', ');
+    result.reason = `Rejected: missing scored validation result for ${taskList}`;
+    result.heldInResults = heldInResults;
+    result.heldOutResults = heldOutResults;
+    saveValidationResult(result);
+    return result;
+  }
 
   // Apply conservative acceptance rule (Zhang et al., 2026, Algorithm 1)
   const deltaInNonNegative = result.deltas.heldIn >= 0;
@@ -122,27 +156,66 @@ export function validateHarness(options) {
 /**
  * Execute a task and score the result
  */
-function executeAndScoreTask(taskId, harnessPath, mode, command) {
-  // This is a placeholder for the actual execution
-  // In practice, this would:
-  // 1. Set up the scratch project with task fixtures
-  // 2. Apply the candidate harness
-  // 3. Execute the command/agent/hook
-  // 4. Run structural checks
-  // 5. Run LLM-as-judge scoring
-  // 6. Return the score
-
-  // For now, return a mock result
-  // In real implementation, this would call the actual execution pipeline
+function executeAndScoreTask(task, context) {
+  const { split, taskResults, candidateHarnessPath, mode, command } = context;
+  const taskId = typeof task === 'string' ? task : task?.id || task?.taskId;
+  const scoredResult = resolveTaskResult(task, split, taskResults);
+
   return {
-    taskId,
+    taskId: taskId || 'unknown',
+    split,
     executedAt: new Date().toISOString(),
-    structural: 'PASS', // or 'FAIL'
-    score: Math.random() * 3 + 7, // Random score between 7-10
-    tracePath: `.arckit/autoresearch-traces/${command}/${mode}/iteration-${Date.now()}-${taskId}.json`
+    harnessPath: candidateHarnessPath,
+    structural: scoredResult?.structural || scoredResult?.status || 'UNSCORED',
+    score: Number.isFinite(scoredResult?.score) ? scoredResult.score : null,
+    tracePath: scoredResult?.tracePath || `.arckit/autoresearch-traces/${command}/${mode}/${taskId || 'unknown'}.json`,
+    error: Number.isFinite(scoredResult?.score) ? undefined : 'No scored validation result supplied'
   };
 }
 
+/**
+ * Find the pre-scored result for a task
+ *
+ * Lookup order: a finite score on the task object itself, then a JSON result
+ * file (task.resultPath, or the task string when it names an existing path),
+ * then taskResults[split][taskId], then taskResults[taskId].
+ *
+ * @param {string|Object} task - Task ID, result-file path, or task object
+ * @param {string} split - Split the task belongs to ('heldIn' or 'heldOut')
+ * @param {Object} taskResults - Pre-scored results keyed by split and/or task ID
+ * @returns {Object|null} Scored result, or null when none is available
+ */
+function resolveTaskResult(task, split, taskResults) {
+  if (task && typeof task === 'object' && Number.isFinite(task.score)) {
+    return task;
+  }
+
+  // An unreadable or unparsable file must not mask an entry in taskResults,
+  // so return early only when the file actually produced a result.
+  const resultPath = typeof task === 'string' ? task : task?.resultPath;
+  const fileResult = resultPath && existsSync(resultPath) ? loadResultFile(resultPath) : null;
+  if (fileResult) {
+    return fileResult;
+  }
+
+  const taskId = typeof task === 'string' ? task : task?.id || task?.taskId;
+  return taskResults?.[split]?.[taskId] || taskResults?.[taskId] || null;
+}
+
+/**
+ * Read and parse a JSON result file
+ *
+ * @param {string} path - Path to the result file
+ * @returns {*} Parsed JSON content, or null when the file cannot be read or parsed
+ */
+function loadResultFile(path) {
+  try {
+    return JSON.parse(readFileSync(path, 'utf8'));
+  } catch {
+    return null;
+  }
+}
+
 /**
  * Save validation result to file
  */
@@ -173,7 +246,6 @@ export function loadValidationResults(command, mode) {
     return [];
   }
 
-  const { readdirSync } = require('node:fs');
   const files = readdirSync(resultsDir).filter(f => f.endsWith('.json'));
 
   return files.map(file => {
diff --git a/plugins/arckit-claude/hooks/weakness-miner.mjs b/plugins/arckit-claude/hooks/weakness-miner.mjs
index 67cac3bb..746c1368 100644
--- a/plugins/arckit-claude/hooks/weakness-miner.mjs
+++ b/plugins/arckit-claude/hooks/weakness-miner.mjs
@@ -259,14 +259,14 @@ function getOrCreateCluster(command, mode, signature, iteration) {
   if (!cluster) {
     cluster = createNewCluster(signature, iteration);
     clusters.push(cluster);
-    saveClusters(clustersPath, clusters);
   } else {
     // Update existing cluster
     cluster.count++;
     cluster.traces.push(iteration);
-    cluster.frequency = calculateFrequency(clusters);
-    saveClusters(clustersPath, clusters);
   }
+
+  calculateFrequency(clusters);
+  saveClusters(clustersPath, clusters);
 
   return cluster;
 }
@@ -388,10 +388,10 @@ function determineSeverity(signature) {
  */
 function calculateFrequency(clusters) {
   const total = clusters.reduce((sum, c) => sum + c.count, 0);
-  return clusters.map(c => ({
-    ...c,
-    frequency: total > 0 ? c.count / total : 0
-  }));
+  for (const cluster of clusters) {
+    cluster.frequency = total > 0 ? cluster.count / total : 0;
+  }
+  return clusters;
 }
 
 /**
diff --git a/tests/plugin/self-harness.test.mjs b/tests/plugin/self-harness.test.mjs
new file mode 100644
index 00000000..129e31c4
--- /dev/null
+++ b/tests/plugin/self-harness.test.mjs
@@ -0,0 +1,172 @@
+#!/usr/bin/env node
+/**
+ * Focused tests for the Self-Harness autoresearch utilities.
+ *
+ * Run with: node tests/plugin/self-harness.test.mjs
+ */
+
+import { mkdirSync, mkdtempSync, writeFileSync, readFileSync, rmSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import test from 'node:test';
+import assert from 'node:assert/strict';
+
+import { processTraceInput } from '../../plugins/arckit-claude/hooks/autoresearch-tracer.mjs';
+import { validateHarness } from '../../plugins/arckit-claude/hooks/harness-validator.mjs';
+import { applyProposal } from '../../plugins/arckit-claude/hooks/harness-proposer.mjs';
+import { mineWeaknesses } from '../../plugins/arckit-claude/hooks/weakness-miner.mjs';
+
+function withTempDir(prefix, fn) {
+  const root = mkdtempSync(join(tmpdir(), prefix));
+  const originalCwd = process.cwd();
+  try {
+    process.chdir(root);
+    return fn(root);
+  } finally {
+    process.chdir(originalCwd);
+    rmSync(root, { recursive: true, force: true });
+  }
+}
+
+test('autoresearch tracer processes JSON input and writes a trace file', () => {
+  withTempDir('arckit-selfharness-trace-', (root) => {
+    const result = processTraceInput(JSON.stringify({
+      command: 'requirements',
+      iteration: 7,
+      mode: 'prompt',
+      toolCalls: [{ name: 'Read', path: 'fixtures/project.md' }],
+      tokenCount: 123,
+      durationMs: 456,
+      artifacts: ['projects/001/ARC-001-REQ-v1.0.md'],
+      verifier: { passed: true }
+    }));
+
+    assert.deepEqual(result, { traceSaved: true, traceId: 'iter-7' });
+
+    const tracePath = join(root, '.arckit', 'autoresearch-traces', 'requirements', 'prompt', 'iteration-007.json');
+    const trace = JSON.parse(readFileSync(tracePath, 'utf8'));
+    assert.equal(trace.target, 'requirements');
+    assert.equal(trace.execution.tokenCount, 123);
+  });
+});
+
+test('harness validator rejects unscored tasks instead of inventing scores', () => {
+  withTempDir('arckit-selfharness-validator-', () => {
+    const result = validateHarness({
+      command: 'requirements',
+      mode: 'prompt',
+      heldInTasks: ['001'],
+      heldOutTasks: ['002'],
+      baselineScores: { heldIn: 8, heldOut: 8 },
+      iteration: 1
+    });
+
+    assert.equal(result.accepted, false);
+    assert.match(result.reason, /missing scored validation result/);
+    assert.equal(result.heldInResults[0].score, null);
+    assert.equal(result.heldOutResults[0].score, null);
+  });
+});
+
+test('harness validator accepts explicit scored task results deterministically', () => {
+  withTempDir('arckit-selfharness-validator-', () => {
+    const options = {
+      command: 'requirements',
+      mode: 'prompt',
+      heldInTasks: ['001'],
+      heldOutTasks: ['002'],
+      taskResults: {
+        heldIn: { '001': { structural: 'PASS', score: 8.5 } },
+        heldOut: { '002': { structural: 'PASS', score: 8.1 } }
+      },
+      baselineScores: { heldIn: 8, heldOut: 8 },
+      iteration: 2
+    };
+
+    const first = validateHarness(options);
+    const second = validateHarness(options);
+
+    assert.equal(first.accepted, true);
+    assert.equal(second.accepted, true);
+    assert.deepEqual(first.candidateScores, second.candidateScores);
+    assert.deepEqual(first.deltas, second.deltas);
+  });
+});
+
+test('harness validator falls back to taskResults when a task ID collides with an existing path', () => {
+  withTempDir('arckit-selfharness-validator-', () => {
+    mkdirSync('001');
+    const result = validateHarness({
+      command: 'requirements',
+      mode: 'prompt',
+      heldInTasks: ['001'],
+      heldOutTasks: ['002'],
+      taskResults: {
+        heldIn: { '001': { structural: 'PASS', score: 8.5 } },
+        heldOut: { '002': { structural: 'PASS', score: 8.1 } }
+      },
+      baselineScores: { heldIn: 8, heldOut: 8 },
+      iteration: 3
+    });
+
+    assert.equal(result.heldInResults[0].score, 8.5);
+  });
+});
+
+test('weakness miner keeps cluster frequency numeric after repeated failures', () => {
+  withTempDir('arckit-selfharness-miner-', () => {
+    writeFileSync('trace.json', JSON.stringify({
+      execution: { toolCalls: [], durationMs: 0, artifactsCreated: [] },
+      output: 'short'
+    }));
+
+    const verifier = { passed: false, failures: ['Document Control'] };
+    mineWeaknesses('requirements', 1, 'prompt', 'trace.json', verifier, 0, 8);
+    mineWeaknesses('requirements', 2, 'prompt', 'trace.json', verifier, 0, 8);
+
+    const clustersPath = join('.arckit', 'autoresearch-traces', 'requirements', 'prompt', 'clusters.json');
+    const clusters = JSON.parse(readFileSync(clustersPath, 'utf8'));
+    assert.equal(clusters.length, 1);
+    assert.equal(typeof clusters[0].frequency, 'number');
+    assert.equal(clusters[0].frequency, 1);
+    assert.deepEqual(clusters[0].traces, [1, 2]);
+  });
+});
+
+test('harness proposer applies tool config changes to object-shaped MCP config', () => {
+  withTempDir('arckit-selfharness-proposer-', () => {
+    const configPath = 'mcp.json';
+    writeFileSync(configPath, JSON.stringify({
+      mcpServers: {
+        'uk-tenders': { type: 'http', url: 'https://example.test/mcp' },
+        'aws-knowledge': { type: 'http', url: 'https://example.test/aws' }
+      }
+    }, null, 2));
+
+    applyProposal({
+      changes: [
+        {
+          type: 'restrict',
+          tools: ['Bash'],
+          action: 'require_justification',
+          content: 'Explain why Bash is necessary'
+        },
+        {
+          type: 'disable',
+          tools: ['uk-tenders']
+        },
+        {
+          type: 'add_tool',
+          tools: ['WebSearch']
+        }
+      ]
+    }, configPath);
+
+    const config = JSON.parse(readFileSync(configPath, 'utf8'));
+    assert.equal(config.mcpServers['uk-tenders'], undefined);
+    assert.ok(config.mcpServers['aws-knowledge']);
+    assert.equal(config.toolRestrictions.Bash.action, 'require_justification');
+    assert.deepEqual(config.disabledTools, ['uk-tenders']);
+    assert.deepEqual(config.allowedTools, ['WebSearch']);
+  });
+});