Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 24 additions & 14 deletions plugins/arckit-claude/hooks/autoresearch-tracer.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
* Output: JSON with trace data (stored to file, not returned)
*/

import { writeFileSync, mkdirSync, existsSync } from 'node:fs';
import { writeFileSync, mkdirSync, existsSync, readdirSync, readFileSync } from 'node:fs';
import { execSync } from 'node:child_process';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

Expand Down Expand Up @@ -56,7 +57,7 @@ export function captureAutoresearchTrace(input) {
verifier: input.verifier || {},
metadata: {
traceId: `iter-${input.iteration || 0}`,
worktree: process.env.AUTORESEARCH_WORTREE || process.cwd(),
worktree: process.env.AUTORESEARCH_WORKTREE || process.env.AUTORESEARCH_WORTREE || process.cwd(),
gitCommit: getGitCommit()
}
};
Expand All @@ -72,8 +73,10 @@ export function captureAutoresearchTrace(input) {
*/
function getGitCommit() {
try {
const { execSync } = require('node:child_process');
return execSync('git rev-parse --short HEAD', { encoding: 'utf8' }).trim();
return execSync('git rev-parse --short HEAD', {
encoding: 'utf8',
stdio: ['ignore', 'pipe', 'ignore']
}).trim();
} catch {
return 'unknown';
}
Expand Down Expand Up @@ -135,12 +138,10 @@ export function loadAllTraces(target, mode) {
return [];
}

const { readdirSync } = require('node:fs');
const files = readdirSync(tracesDir).filter(f => f.endsWith('.json'));

return files.map(file => {
try {
const { readFileSync } = require('node:fs');
return JSON.parse(readFileSync(join(tracesDir, file), 'utf8'));
} catch {
return null;
Expand All @@ -152,17 +153,26 @@ export function loadAllTraces(target, mode) {
* Main entry point for hook execution
* Reads from stdin, processes, saves trace
*/
function main() {
let input;
try {
input = JSON.parse(require('node:fs').readFileSync(0, 'utf8'));
} catch {
// No input, exit silently
return;
export function processTraceInput(raw) {
if (!raw.trim()) {
return null;
}

const input = JSON.parse(raw);
const trace = captureAutoresearchTrace(input);
console.log(JSON.stringify({ traceSaved: true, traceId: trace.metadata.traceId }));
return { traceSaved: true, traceId: trace.metadata.traceId };
}

/**
 * Hook entry point.
 *
 * Reads the whole of stdin (file descriptor 0) as UTF-8 and hands it to
 * processTraceInput. When that returns a result (it returns null for blank
 * input), the result is printed to stdout as a single JSON line.
 *
 * Any failure is reported on stderr and the process exits with status 1.
 * Only a SyntaxError (what JSON.parse throws) is labelled as invalid JSON;
 * other failures, such as being unable to read stdin or to capture the
 * trace, get a generic label so the message does not misreport the cause.
 */
function main() {
  try {
    const result = processTraceInput(readFileSync(0, 'utf8'));
    if (result) {
      console.log(JSON.stringify(result));
    }
  } catch (error) {
    // JSON.parse signals malformed input with SyntaxError; everything else
    // originated from reading stdin or from capturing/saving the trace.
    const label = error instanceof SyntaxError
      ? 'Invalid JSON input'
      : 'Failed to capture trace';
    console.error(`${label}: ${error.message}`);
    process.exit(1);
  }
}

// Run if executed directly
Expand Down
31 changes: 27 additions & 4 deletions plugins/arckit-claude/hooks/harness-proposer.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ function generateToolProposal(proposal, currentContent, mechanism) {
break;
default:
proposal.changes = [{
type: 'add',
type: 'add_tool',
tools: ['WebSearch'],
content: 'Enable WebSearch for external research'
}];
Expand Down Expand Up @@ -699,14 +699,37 @@ function modifyToolConfig(content, change) {
const config = JSON.parse(content);

switch (change.type) {
case 'restrict':
config.toolRestrictions = config.toolRestrictions || {};
change.tools.forEach(t => {
config.toolRestrictions[t] = {
action: change.action || 'require_justification',
reason: change.content || ''
};
});
break;
case 'disable':
if (config.mcpServers) {
config.mcpServers = config.mcpServers.filter(s =>
!change.tools.includes(s));
if (Array.isArray(config.mcpServers)) {
config.mcpServers = config.mcpServers.filter(s =>
!change.tools.includes(s));
} else {
change.tools.forEach(t => {
delete config.mcpServers[t];
});
}
}
config.disabledTools = Array.from(new Set([
...(config.disabledTools || []),
...change.tools
]));
break;
case 'add_tool':
if (config.mcpServers) {
config.allowedTools = Array.from(new Set([
...(config.allowedTools || []),
...change.tools
]));
if (Array.isArray(config.mcpServers)) {
change.tools.forEach(t => {
if (!config.mcpServers.includes(t)) {
config.mcpServers.push(t);
Expand Down
94 changes: 74 additions & 20 deletions plugins/arckit-claude/hooks/harness-validator.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* This module runs regression tests on held-out tasks to ensure robust improvement.
*/

import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

Expand All @@ -29,6 +29,7 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
* @param {Array} options.heldInTasks - Held-in task IDs
* @param {Array} options.heldOutTasks - Held-out task IDs
* @param {string} options.candidateHarnessPath - Path to candidate harness
* @param {Object} options.taskResults - Pre-scored validation results keyed by split/task
* @param {Object} options.baselineScores - Baseline scores {heldIn: X.X, heldOut: Y.Y}
* @param {number} options.minDelta - Minimum delta for acceptance (default: 0.3)
* @returns {Object} Validation result
Expand All @@ -40,6 +41,7 @@ export function validateHarness(options) {
heldInTasks = [],
heldOutTasks = [],
candidateHarnessPath,
taskResults = {},
baselineScores = { heldIn: 0, heldOut: 0 },
minDelta = 0.3,
iteration
Expand All @@ -62,16 +64,31 @@ export function validateHarness(options) {
// Validate held-in tasks
const heldInResults = [];
for (const task of heldInTasks) {
const taskResult = executeAndScoreTask(task, candidateHarnessPath, mode, command);
const taskResult = executeAndScoreTask(task, {
split: 'heldIn',
taskResults,
candidateHarnessPath,
mode,
command
});
heldInResults.push(taskResult);
}

// Validate held-out tasks
const heldOutResults = [];
for (const task of heldOutTasks) {
const taskResult = executeAndScoreTask(task, candidateHarnessPath, mode, command);
const taskResult = executeAndScoreTask(task, {
split: 'heldOut',
taskResults,
candidateHarnessPath,
mode,
command
});
heldOutResults.push(taskResult);
}

const missingScores = [...heldInResults, ...heldOutResults]
.filter(r => !Number.isFinite(r.score));

// Calculate average scores
const heldInScores = heldInResults.map(r => r.score).filter(s => s !== null && s !== undefined);
Expand All @@ -87,6 +104,23 @@ export function validateHarness(options) {
// Calculate deltas
result.deltas.heldIn = result.candidateScores.heldIn - baselineScores.heldIn;
result.deltas.heldOut = result.candidateScores.heldOut - baselineScores.heldOut;

if (heldInTasks.length === 0 || heldOutTasks.length === 0) {
result.reason = 'Rejected: held-in and held-out task lists are both required';
result.heldInResults = heldInResults;
result.heldOutResults = heldOutResults;
saveValidationResult(result);
return result;
}

if (missingScores.length > 0) {
const taskList = missingScores.map(r => `${r.split}:${r.taskId}`).join(', ');
result.reason = `Rejected: missing scored validation result for ${taskList}`;
result.heldInResults = heldInResults;
result.heldOutResults = heldOutResults;
saveValidationResult(result);
return result;
}

// Apply conservative acceptance rule (Zhang et al., 2026, Algorithm 1)
const deltaInNonNegative = result.deltas.heldIn >= 0;
Expand Down Expand Up @@ -122,27 +156,48 @@ export function validateHarness(options) {
/**
* Execute a task and score the result
*/
function executeAndScoreTask(taskId, harnessPath, mode, command) {
// This is a placeholder for the actual execution
// In practice, this would:
// 1. Set up the scratch project with task fixtures
// 2. Apply the candidate harness
// 3. Execute the command/agent/hook
// 4. Run structural checks
// 5. Run LLM-as-judge scoring
// 6. Return the score

// For now, return a mock result
// In real implementation, this would call the actual execution pipeline
function executeAndScoreTask(task, context) {
const { split, taskResults, candidateHarnessPath, mode, command } = context;
const taskId = typeof task === 'string' ? task : task?.id || task?.taskId;
const scoredResult = resolveTaskResult(task, split, taskResults);

return {
taskId,
taskId: taskId || 'unknown',
split,
executedAt: new Date().toISOString(),
structural: 'PASS', // or 'FAIL'
score: Math.random() * 3 + 7, // Random score between 7-10
tracePath: `.arckit/autoresearch-traces/${command}/${mode}/iteration-${Date.now()}-${taskId}.json`
harnessPath: candidateHarnessPath,
structural: scoredResult?.structural || scoredResult?.status || 'UNSCORED',
score: Number.isFinite(scoredResult?.score) ? scoredResult.score : null,
tracePath: scoredResult?.tracePath || `.arckit/autoresearch-traces/${command}/${mode}/${taskId || 'unknown'}.json`,
error: scoredResult ? undefined : 'No scored validation result supplied'
};
}

/**
 * Find the pre-scored validation result for a task.
 *
 * Lookup order:
 *   1. An object task that already carries a finite `score` is returned as-is.
 *   2. An object task with a `resultPath` is loaded from that file.
 *   3. A string task that names an existing path is loaded from that file.
 *   4. Otherwise the task id (the string itself, or `id` / `taskId` of the
 *      object) is looked up in `taskResults[split]`, then in `taskResults`.
 *
 * @param {string|Object} task - Task id, result-file path, or task object
 * @param {string} split - Split key used for the nested lookup
 * @param {Object} taskResults - Results keyed by split/task or by task
 * @returns {Object|null} The scored result, or null when none is found
 */
function resolveTaskResult(task, split, taskResults) {
  const isStringTask = typeof task === 'string';
  const isObjectTask = Boolean(task) && typeof task === 'object';

  if (isObjectTask) {
    if (Number.isFinite(task.score)) {
      return task;
    }
    if (task.resultPath) {
      return loadResultFile(task.resultPath);
    }
  } else if (isStringTask && existsSync(task)) {
    return loadResultFile(task);
  }

  const lookupKey = isStringTask ? task : task?.id || task?.taskId;

  const splitScoped = taskResults?.[split]?.[lookupKey];
  if (splitScoped) {
    return splitScoped;
  }

  const flat = taskResults?.[lookupKey];
  return flat || null;
}

/**
 * Read a JSON result file, best-effort.
 *
 * @param {string} path - Location of the JSON file to read
 * @returns {*} The parsed JSON value, or null when the file cannot be read
 *   or does not contain valid JSON
 */
function loadResultFile(path) {
  let parsed = null;
  try {
    const raw = readFileSync(path, 'utf8');
    parsed = JSON.parse(raw);
  } catch {
    // Unreadable or malformed files are treated as "no result".
    parsed = null;
  }
  return parsed;
}

/**
* Save validation result to file
*/
Expand Down Expand Up @@ -173,7 +228,6 @@ export function loadValidationResults(command, mode) {
return [];
}

const { readdirSync } = require('node:fs');
const files = readdirSync(resultsDir).filter(f => f.endsWith('.json'));

return files.map(file => {
Expand Down
14 changes: 7 additions & 7 deletions plugins/arckit-claude/hooks/weakness-miner.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -259,14 +259,14 @@ function getOrCreateCluster(command, mode, signature, iteration) {
if (!cluster) {
cluster = createNewCluster(signature, iteration);
clusters.push(cluster);
saveClusters(clustersPath, clusters);
} else {
// Update existing cluster
cluster.count++;
cluster.traces.push(iteration);
cluster.frequency = calculateFrequency(clusters);
saveClusters(clustersPath, clusters);
}

calculateFrequency(clusters);
saveClusters(clustersPath, clusters);

return cluster;
}
Expand Down Expand Up @@ -388,10 +388,10 @@ function determineSeverity(signature) {
*/
function calculateFrequency(clusters) {
const total = clusters.reduce((sum, c) => sum + c.count, 0);
return clusters.map(c => ({
...c,
frequency: total > 0 ? c.count / total : 0
}));
for (const cluster of clusters) {
cluster.frequency = total > 0 ? cluster.count / total : 0;
}
return clusters;
}

/**
Expand Down
Loading
Loading