Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Shipped, user-visible capabilities.
- find/search/read files
- edit/create/delete files
- scan/edit code via AST-based tools
- git status/diff/log/show/add/commit
- git diff/log/show/add/commit
- shell command execution
- scoped test execution via detected test runner
- web search/fetch
Expand Down
5 changes: 4 additions & 1 deletion docs/soul.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ I am Acolyte, a terminal-first coding agent built for practical execution. I red
5. For reviews, prioritize concrete repo-specific findings and keep output short.
6. Prefer actionable patch recommendations over generic policy essays.
7. Prefer one clear next action over multi-option menus unless the user asks to compare alternatives.
8. Prefer short direct prose by default; do not use dash bullets unless the user asks for a list or the content is inherently list-shaped.
9. Let tool output carry execution details; add prose only when it contributes missing context.
10. Say one short thing before acting, then let the work speak for itself.

## Execution behavior
1. Implement requested changes directly when intent is clear.
Expand All @@ -80,4 +83,4 @@ I am Acolyte, a terminal-first coding agent built for practical execution. I red
2. Batch mode exists for minimal scripting use.
3. Centralized memory enables continuity across machines.
4. Tool reliability and memory quality are higher priority than feature count.
5. Default behavior is language and toolchain agnostic unless the user asks for stack-specific handling.
5. Default behavior is language and toolchain agnostic unless the user asks for stack-specific handling.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"start": "bun run src/cli.ts",
"dev": "ACOLYTE_SERVER_RESTART=1 ./scripts/with-server.sh serve:watch bun run src/cli.ts",
"run": "bun run src/cli.ts run",
"prompt:show": "bun run scripts/show-prompt.ts",
"behavior:run": "bun run scripts/run-behavior.ts",
"perf:run": "bun run scripts/run-perf.ts",
"serve": "bun --env-file=.env run src/server.ts",
Expand Down
79 changes: 50 additions & 29 deletions scripts/behavior-scenarios.ts
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,22 @@ function validateTwoFileDepsRenameTrace(traceLines: string[]): string[] {
(line) => line.includes("tool=file-edit") && line.includes("path=src/cli-skill.ts"),
).length;

const firstReadTools = toolCallLines
.slice(0, 2)
.filter((line) => line.includes("tool=file-read"))
.map((line) => (line.includes("src/cli-run.ts") ? "run" : line.includes("src/cli-skill.ts") ? "skill" : "other"));
if (!(firstReadTools.includes("run") && firstReadTools.includes("skill"))) {
issues.push("first two tool calls should read src/cli-run.ts and src/cli-skill.ts");
const firstTool = toolCallLines[0];
const hasBatchedInitialRead =
firstTool?.includes("tool=file-read") &&
firstTool.includes("src/cli-run.ts") &&
firstTool.includes("src/cli-skill.ts");
const firstTwoReadsCoverBothFiles =
toolCallLines
.slice(0, 2)
.filter((line) => line.includes("tool=file-read"))
.some((line) => line.includes("src/cli-run.ts")) &&
toolCallLines
.slice(0, 2)
.filter((line) => line.includes("tool=file-read"))
.some((line) => line.includes("src/cli-skill.ts"));
if (!hasBatchedInitialRead && !firstTwoReadsCoverBothFiles) {
issues.push("initial direct reads should cover src/cli-run.ts and src/cli-skill.ts before editing");
}
if (toolCallLines.some((line) => line.includes("tool=file-find"))) {
issues.push("two-file deps rename should not use file-find");
Expand Down Expand Up @@ -284,12 +294,11 @@ async function createBoundedReturnFixWorkspace(workspace: string): Promise<void>
'import { scopedCallLog } from "./tool-guards";',
'import { WRITE_TOOL_SET } from "./tool-registry";',
"",
"export function acceptedLifecycleSignal(ctx: RunContext): string | undefined {",
"export function resetAcceptedLifecycleSignal(ctx: RunContext): void {",
" const signal = ctx.result?.signal;",
" if (!signal) return undefined;",
" if (ctx.currentError) return undefined;",
' if (signal === "no_op" && taskHasWrites(ctx)) return undefined;',
' if (signal === "done" || signal === "no_op" || signal === "blocked") return signal;',
" return undefined;",
"}",
"",
Expand All @@ -303,15 +312,14 @@ async function createBoundedReturnFixWorkspace(workspace: string): Promise<void>
" ctx.lifecycleState.repeatedFailure = { ...previous, count: previous.count + 1 };",
"}",
"",
"function failureSignatureForError(ctx: RunContext): string | undefined {",
"function clearFailureSignature(ctx: RunContext): void {",
" if (!ctx.currentError) return undefined;",
" return ctx.currentError.message.trim() || undefined;",
" return undefined;",
"}",
"",
"function normalizeFailureMessage(message: string | undefined): string | undefined {",
"function resetFailureMessage(message: string | undefined): void {",
" if (!message) return undefined;",
' const normalized = message.replace(/\\s+/g, " ").trim();',
" return normalized.length > 0 ? normalized : undefined;",
" return undefined;",
"}",
"",
].join("\n"),
Expand All @@ -325,13 +333,13 @@ async function validateBoundedReturnFixWorkspace(workspace: string): Promise<str
issues.push("src/lifecycle-state.ts should not keep return undefined; statements");
}
if (!content.includes("if (!signal) return;")) {
issues.push("acceptedLifecycleSignal should use bare return for missing signal");
issues.push("resetAcceptedLifecycleSignal should use bare return for missing signal");
}
if (!content.includes("if (!ctx.currentError) return;")) {
issues.push("failureSignatureForError should use bare return for missing currentError");
issues.push("clearFailureSignature should use bare return for missing currentError");
}
if (!content.includes("if (!message) return;")) {
issues.push("normalizeFailureMessage should use bare return for missing message");
issues.push("resetFailureMessage should use bare return for missing message");
}
return issues;
}
Expand Down Expand Up @@ -369,14 +377,15 @@ async function validatePostSuccessStopWorkspace(workspace: string): Promise<stri
function validatePostSuccessStopTrace(traceLines: string[]): string[] {
const issues: string[] = [];
const toolCallLines = traceLines.filter((line) => line.includes("event=lifecycle.tool.call"));
const toolResultLines = traceLines.filter((line) => line.includes("event=lifecycle.tool.result"));
const firstTool = toolCallLines[0];
if (!firstTool || !firstTool.includes("tool=file-read") || !firstTool.includes("src/provider-config.ts")) {
issues.push("first tool call should be file-read on src/provider-config.ts");
}
if (toolCallLines.some((line) => line.includes("tool=file-find") || line.includes("tool=file-search"))) {
issues.push("post-success stop scenario should not use file-find or file-search");
}
const readCalls = toolCallLines.filter(
const readCalls = toolResultLines.filter(
(line) => line.includes("tool=file-read") && line.includes("src/provider-config.ts"),
).length;
if (readCalls > 1) {
Expand Down Expand Up @@ -420,29 +429,41 @@ function validateBoundedReturnFixTrace(traceLines: string[]): string[] {
const sameFileEditCalls = toolCalls.filter(
(call) => call.tool === "file-edit" && call.path === "src/lifecycle-state.ts",
).length;
if (sameFileEditCalls > 2) {
issues.push(`bounded single-file scenario should use at most 2 file-edit calls, saw ${sameFileEditCalls}`);
if (sameFileEditCalls === 0) {
issues.push("bounded single-file scenario should update src/lifecycle-state.ts");
}

const successfulWriteCalls = traceLines.filter(
(line) =>
line.includes("event=lifecycle.tool.output") &&
line.includes("tool=file-edit") &&
line.includes('preview="Edit src/lifecycle-state.ts'),
).length;
if (successfulWriteCalls !== 1) {
issues.push(
`bounded single-file scenario should use exactly 1 successful write, saw ${successfulWriteCalls ?? "?"}`,
);
}

const verifyModeIndex = traceLines.findIndex(
(line) => line.includes("event=lifecycle.mode.changed") && line.includes("to=verify"),
);
const firstVerifyCommandIndex = traceLines.findIndex(
(line) => line.includes("event=lifecycle.tool.call") && line.includes("tool=shell-run"),
const firstVerifyReviewIndex = traceLines.findIndex(
(line) =>
line.includes("event=lifecycle.tool.result") &&
(line.includes("tool=code-scan") || line.includes("tool=test-run")) &&
line.includes("src/lifecycle-state.ts"),
);
if (verifyModeIndex >= 0 && firstVerifyCommandIndex > verifyModeIndex) {
const verifyPrelude = traceLines.slice(verifyModeIndex + 1, firstVerifyCommandIndex);
if (verifyModeIndex >= 0 && firstVerifyReviewIndex > verifyModeIndex) {
const verifyPrelude = traceLines.slice(verifyModeIndex + 1, firstVerifyReviewIndex);
const badVerifyPrelude = verifyPrelude.some(
(line) =>
line.includes("event=lifecycle.tool.call") &&
(line.includes("tool=file-read") ||
line.includes("tool=file-search") ||
line.includes("tool=code-scan") ||
line.includes("tool=git-diff")) &&
line.includes("event=lifecycle.tool.result") &&
(line.includes("tool=file-read") || line.includes("tool=file-search") || line.includes("tool=git-diff")) &&
line.includes("src/lifecycle-state.ts"),
);
if (badVerifyPrelude) {
issues.push("verify mode should run the verify command before rereading or diffing src/lifecycle-state.ts");
issues.push("verify mode should review src/lifecycle-state.ts before rereading or diffing it");
}
}

Expand Down
54 changes: 52 additions & 2 deletions scripts/run-behavior.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { describe, expect, test } from "bun:test";
import { BEHAVIOR_SCENARIO_LIST, parseBehaviorScenarioId } from "./behavior-scenarios";
import { analyzeBehavior, parseArgs, summarizeTrace } from "./run-behavior";
import { BEHAVIOR_SCENARIO_BY_ID, BEHAVIOR_SCENARIO_LIST, parseBehaviorScenarioId } from "./behavior-scenarios";
import { analyzeBehavior, parseArgs, summarizeTrace, summarizeTranscript } from "./run-behavior";

describe("run-behavior args", () => {
test("parseArgs applies defaults", () => {
Expand Down Expand Up @@ -49,6 +49,34 @@ describe("behavior scenarios", () => {
expect(BEHAVIOR_SCENARIO_LIST.some((scenario) => scenario.id === "scoped-code-edit-rename-target")).toBe(true);
expect(BEHAVIOR_SCENARIO_LIST.some((scenario) => scenario.id === "structured-code-edit-replace")).toBe(true);
});

// The trace opens with one file-read call whose `paths` payload names both
// src/cli-run.ts and src/cli-skill.ts; the validator must not report the
// "initial direct reads" issue for it.
test("two-file-deps-rename trace accepts a single batched initial file-read", () => {
  const batchedReadTrace = [
    'ts event=lifecycle.tool.call tool=file-read paths="[{\\"path\\":\\"src/cli-run.ts\\"},{\\"path\\":\\"src/cli-skill.ts\\"}]"',
    "ts event=lifecycle.tool.call tool=file-edit path=src/cli-run.ts",
    "ts event=lifecycle.tool.call tool=file-edit path=src/cli-skill.ts",
    "ts event=lifecycle.summary lifecycle_signal=done has_error=false",
  ];
  const scenario = BEHAVIOR_SCENARIO_BY_ID["two-file-deps-rename"];
  // validateTrace is optional on a scenario; a missing validator means "no issues".
  const reported = scenario.validateTrace?.(batchedReadTrace) ?? [];

  expect(reported).not.toContain(
    "initial direct reads should cover src/cli-run.ts and src/cli-skill.ts before editing",
  );
});

// The trace holds two file-edit calls: the first is followed by a
// `lifecycle.tool.output` preview ("Edit src/lifecycle-state.ts …"), the second
// by a `lifecycle.tool.error`. Even though the summary reports write_calls=2,
// the validator must treat this as exactly one successful write.
test("bounded-return-fix trace counts successful writes from edit previews, not total write attempts", () => {
  const issues =
    BEHAVIOR_SCENARIO_BY_ID["bounded-return-fix"].validateTrace?.([
      'ts event=lifecycle.tool.call tool=file-read paths="[{\\"path\\":\\"src/lifecycle-state.ts\\"}]"',
      "ts event=lifecycle.tool.call tool=file-edit path=src/lifecycle-state.ts",
      'ts event=lifecycle.tool.output tool=file-edit preview="Edit src/lifecycle-state.ts (+8 -8)"',
      "ts event=lifecycle.tool.call tool=file-edit path=src/lifecycle-state.ts",
      'ts event=lifecycle.tool.error tool=file-edit error="find block not found"',
      "ts event=lifecycle.summary write_calls=2 lifecycle_signal=done has_error=false",
    ]) ?? [];

  // Match on the stable part of the message rather than the literal "saw 2":
  // the issue text embeds the observed count, so an exact-string check would
  // silently pass if the validator miscounted as 0 or 3.
  const successfulWriteIssues = issues.filter((issue) =>
    issue.includes("should use exactly 1 successful write"),
  );
  expect(successfulWriteIssues).toEqual([]);
});
});

describe("behavior analysis", () => {
Expand All @@ -69,6 +97,28 @@ describe("behavior analysis", () => {
expect(trace?.lifecycleSignal).toBe("done");
});

// Runs a short ANSI-decorated session through summarizeTranscript and pins
// every counter: one assistant line before the first tool line, one after the
// Edit line, and three tool lines (Read, Edit, Scan Code).
test("summarizeTranscript counts assistant preamble, tool lines, and post-write chatter", () => {
  const stdoutLines = [
    "\u001b[2mStarted server on port 52930 (pid 9658)\u001b[22m",
    "❯ Update src/foo.ts",
    "• Checking src/foo.ts, then updating it.",
    "\u001b[2m• Read src/foo.ts\u001b[22m",
    "\u001b[2m• Edit src/foo.ts (+1 -1)\u001b[22m",
    "Updated src/foo.ts.",
    "\u001b[2m• Scan Code src/foo.ts\u001b[22m",
  ];

  const summary = summarizeTranscript(stdoutLines.join("\n"));

  expect(summary).toEqual({
    assistantMessages: 2,
    assistantMessagesBeforeFirstTool: 1,
    assistantMessagesAfterFirstWrite: 1,
    toolMessages: 3,
    firstWriteSeen: true,
  });
});

test("analyzeBehavior scores a clean bounded run highly", () => {
const analysis = analyzeBehavior({
exitCode: 0,
Expand Down
63 changes: 62 additions & 1 deletion scripts/run-behavior.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { mkdir, mkdtemp, readFile, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { z } from "zod";
import { stripAnsi } from "../src/tui/serialize";
import {
BEHAVIOR_SCENARIO_BY_ID,
BEHAVIOR_SCENARIO_LIST,
Expand Down Expand Up @@ -38,6 +39,14 @@ const behaviorTraceSummarySchema = z.object({
otherErrorCount: z.number().int().nonnegative().optional(),
});

/**
 * Shape of the transcript summary that summarizeTranscript returns: four
 * non-negative integer counters plus a flag recording whether a write tool
 * line appeared in the transcript.
 */
const behaviorTranscriptSummarySchema = z.object({
// Transcript lines counted as assistant prose (not a tool line, not indented).
assistantMessages: z.number().int().nonnegative(),
// Assistant lines seen before the first tool line.
assistantMessagesBeforeFirstTool: z.number().int().nonnegative(),
// Assistant lines seen after the first write tool line (Edit/Create/Delete/Commit/Add).
assistantMessagesAfterFirstWrite: z.number().int().nonnegative(),
// Lines recognized as tool activity by isToolTranscriptLine.
toolMessages: z.number().int().nonnegative(),
// True once any write tool line has been seen.
firstWriteSeen: z.boolean(),
});

const behaviorAnalysisSchema = z.object({
score: z.number().min(0).max(1),
verdict: z.enum(["strong", "mixed", "weak"]),
Expand All @@ -57,6 +66,7 @@ const behaviorOutputSchema = z.object({
stdout: z.string(),
stderr: z.string(),
trace: behaviorTraceSummarySchema.optional(),
transcript: behaviorTranscriptSummarySchema.optional(),
analysis: behaviorAnalysisSchema,
});

Expand Down Expand Up @@ -114,6 +124,51 @@ function countToolCalls(taskLines: string[], toolNames: string[]): number {
).length;
}

/** Labels that open a tool-activity line in the rendered transcript. */
const TOOL_TRANSCRIPT_LABELS = [
  "Read",
  "Edit",
  "Scan Code",
  "Search",
  "Find",
  "Create",
  "Delete",
  "Run",
  "Test",
  "Fetch",
  "Web Search",
  "Add",
  "Commit",
  "Diff",
  "Log",
  "Show",
] as const;

/** Matches a single ASCII word character (letter, digit, or underscore). */
const WORD_CHARACTER = /\w/;

/**
 * Reports whether a transcript line starts with one of the known tool labels
 * as a whole word.
 *
 * @param line - Transcript line with any bullet prefix already removed.
 * @returns `true` when the line begins with a tool label that is followed by
 *   the end of the line or a non-word character; `false` otherwise
 *   (e.g. "Reading …" or "Run2" do not match).
 */
function isToolTranscriptLine(line: string): boolean {
  return TOOL_TRANSCRIPT_LABELS.some((label) => {
    if (!line.startsWith(label)) return false;
    // charAt yields "" past the end of the string, which is not a word character,
    // so a line consisting of just the label counts as a match.
    return !WORD_CHARACTER.test(line.charAt(label.length));
  });
}

/**
 * Summarizes the CLI transcript captured on stdout into message counters.
 *
 * The text is passed through `stripAnsi`, split into lines, and right-trimmed.
 * Empty lines, the "Started server on port" banner, and prompt lines
 * (starting with "❯") are dropped. Each remaining line, with a leading "• "
 * bullet removed, is classified as:
 * - a tool line when `isToolTranscriptLine` accepts it;
 * - ignored when it starts with a space;
 * - an assistant message otherwise.
 *
 * @param stdout - Raw stdout captured from the run.
 * @returns The counters validated against `behaviorTranscriptSummarySchema`,
 *   or `undefined` when no transcript lines remain after filtering.
 */
export function summarizeTranscript(stdout: string): z.infer<typeof behaviorTranscriptSummarySchema> | undefined {
  // Built once per call instead of once per transcript line.
  const writeToolLine = /^(Edit|Create|Delete|Commit|Add)\b/;

  const lines = stripAnsi(stdout)
    .split("\n")
    .map((line) => line.trimEnd())
    .filter(
      (line) =>
        line.length > 0 &&
        !line.startsWith("Started server on port") &&
        // trimEnd() above reduces an empty prompt ("❯ ") to a bare "❯", which
        // the "❯ " prefix check alone would miss and count as assistant prose.
        line !== "❯" &&
        !line.startsWith("❯ "),
    );
  if (lines.length === 0) return undefined;

  let assistantMessages = 0;
  let assistantMessagesBeforeFirstTool = 0;
  let assistantMessagesAfterFirstWrite = 0;
  let toolMessages = 0;
  let seenFirstTool = false;
  let seenFirstWrite = false;

  for (const rawLine of lines) {
    const line = rawLine.startsWith("• ") ? rawLine.slice(2) : rawLine;
    if (isToolTranscriptLine(line)) {
      toolMessages += 1;
      seenFirstTool = true;
      if (writeToolLine.test(line)) seenFirstWrite = true;
      continue;
    }

    // Lines that start with a space are counted as neither tool nor assistant output.
    if (line.startsWith(" ")) continue;
    assistantMessages += 1;
    if (!seenFirstTool) assistantMessagesBeforeFirstTool += 1;
    if (seenFirstWrite) assistantMessagesAfterFirstWrite += 1;
  }

  return behaviorTranscriptSummarySchema.parse({
    assistantMessages,
    assistantMessagesBeforeFirstTool,
    assistantMessagesAfterFirstWrite,
    toolMessages,
    firstWriteSeen: seenFirstWrite,
  });
}

async function readLogLines(logPath: string): Promise<string[]> {
try {
const raw = await readFile(logPath, "utf8");
Expand Down Expand Up @@ -142,7 +197,6 @@ export function summarizeTrace(lines: string[]): z.infer<typeof behaviorTraceSum
"file-find",
"file-search",
"code-scan",
"git-status",
"git-diff",
"git-log",
"git-show",
Expand Down Expand Up @@ -387,6 +441,7 @@ async function runScenario(
const afterLines = await readLogLines(behaviorEnv.daemonLogPath);
const traceLines = afterLines.slice(beforeLines.length);
const trace = summarizeTrace(traceLines);
const transcript = summarizeTranscript(result.stdout);
const correctnessIssues = [...(await scenario.validate(workspace)), ...(scenario.validateTrace?.(traceLines) ?? [])];

return behaviorOutputSchema.parse({
Expand All @@ -401,6 +456,7 @@ async function runScenario(
stdout: result.stdout,
stderr: result.stderr,
trace,
transcript,
analysis: analyzeBehavior({
exitCode: result.exitCode,
expectedChangeCount: scenario.expectedChanges.length,
Expand All @@ -422,6 +478,11 @@ function printRun(run: BehaviorRun): void {
`trace: task=${run.trace.taskId ?? "unknown"} model_calls=${run.trace.modelCalls ?? "?"} total_tools=${run.trace.totalToolCalls ?? "?"} unique_tools=${run.trace.uniqueToolCount ?? "?"} read=${run.trace.readCalls ?? "?"} search=${run.trace.searchCalls ?? "?"} write=${run.trace.writeCalls ?? "?"} pre_write_discovery=${run.trace.preWriteDiscoveryCalls ?? "?"} signal=${run.trace.lifecycleSignal ?? "?"} regenerations=${run.trace.regenerationCount ?? "?"} regen_limit_hit=${run.trace.regenerationLimitHit ?? "?"} guard_blocked=${run.trace.guardBlockedCount ?? "?"} guard_flags=${run.trace.guardFlagSetCount ?? "?"} has_error=${run.trace.hasError ?? "?"}`,
);
}
if (run.transcript) {
console.log(
`transcript: assistant=${run.transcript.assistantMessages} before_first_tool=${run.transcript.assistantMessagesBeforeFirstTool} after_first_write=${run.transcript.assistantMessagesAfterFirstWrite} tools=${run.transcript.toolMessages}`,
);
}
console.log(`analysis: ${run.analysis.reasons.join("; ")}`);
if (run.analysis.correctnessIssues.length > 0) {
console.log(`correctness: ${run.analysis.correctnessIssues.join("; ")}`);
Expand Down
16 changes: 16 additions & 0 deletions scripts/show-prompt.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { describe, expect, test } from "bun:test";
import { parseArgs } from "./show-prompt";

// Covers parseArgs from ./show-prompt: a bare mode, a mode combined with
// --workspace, and rejection of an unrecognized flag.
describe("parseArgs", () => {
  test("parses work mode", () => {
    const parsed = parseArgs(["work"]);

    expect(parsed).toEqual({ mode: "work", workspace: undefined });
  });

  test("parses verify mode with workspace", () => {
    const argv = ["verify", "--workspace", "/tmp/demo"];

    expect(parseArgs(argv)).toEqual({ mode: "verify", workspace: "/tmp/demo" });
  });

  test("rejects unknown flags", () => {
    // Wrapped in a thunk so expect() can observe the throw.
    const parseWithBogusFlag = () => parseArgs(["work", "--bogus"]);

    expect(parseWithBogusFlag).toThrow("Unknown argument: --bogus");
  });
});
Loading
Loading