KunAgent · XingYu-Zhong · Jun 29, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
diff --git a/kun/README.md b/kun/README.md
@@ -51,6 +51,27 @@ Run from the `kun/` directory.
 - `npm run serve` – start the runtime after a build.
 - `npm run dev` – rebuild in watch mode.
 
+- `npm run benchmark:replay -- --suite <file>` – build, then run a read-only HTTP/SSE agent replay suite.
+
+### Agent replay benchmark
+
+Start a Kun runtime, set `KUN_RUNTIME_URL` and `KUN_RUNTIME_TOKEN`, then run the five-task smoke set:
+
+```bash
+npm run benchmark:replay -- --suite benchmarks/agent-core.json --tag smoke --output replay-smoke.json
+```
+
+Run all 20 tasks twice and compare with an earlier report:
+
+```bash
+npm run benchmark:replay -- --suite benchmarks/agent-core.json --repeat 2 \
+  --baseline replay-baseline.json --output replay-current.json --fail-on-regression
+```
+
+Replay threads always use the `read-only` sandbox and disable interactive input. Reports include success rate,
+time to first token (TTFT), full latency, tool time, SSE delivery delay, token/cache/cost counters, and Kun process
+peak RSS. The runtime token is accepted only through `KUN_RUNTIME_TOKEN`, so it does not leak through process arguments.
+
 ## CLI
 
 `kun serve` accepts the following flags:

diff --git a/kun/benchmarks/agent-core.json b/kun/benchmarks/agent-core.json
@@ -0,0 +1,130 @@
+{
+  "version": 1,
+  "name": "kun-agent-core",
+  "defaults": {
+    "reasoningEffort": "off",
+    "timeoutMs": 300000
+  },
+  "tasks": [
+    {
+      "id": "architecture-summary",
+      "tags": ["smoke", "architecture"],
+      "prompt": "Read the repository and explain the active Renderer -> preload -> main -> Kun runtime data path. Cite the most relevant file paths. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "runtime-entrypoint",
+      "tags": ["smoke", "runtime"],
+      "prompt": "Find the Kun serve-mode composition root and summarize how stores, model clients, tools, and the agent loop are assembled. Cite exact file paths. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "renderer-send-flow",
+      "tags": ["smoke", "frontend"],
+      "prompt": "Trace a chat message from the renderer composer through the preload/main bridge to the Kun turn endpoint. Return a concise ordered call path with files. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "sse-replay",
+      "tags": ["smoke", "runtime"],
+      "prompt": "Explain how Kun SSE event replay avoids duplicates and cursor rewind after reconnect or restart. Cite the implementation and tests. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "mcp-lifecycle",
+      "tags": ["smoke", "mcp"],
+      "prompt": "Inspect MCP startup, tool discovery, execution, and reconnect behavior. Identify the main reliability boundaries and cite the implementation files. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "cache-prefix",
+      "tags": ["cache"],
+      "prompt": "Explain what makes Kun's immutable prompt prefix stable and list dynamic data that must remain outside it. Cite code and documentation. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "provider-url-contract",
+      "tags": ["provider"],
+      "prompt": "Trace how baseUrl and endpointFormat affect provider URL construction and request bodies across chat and auxiliary model calls. Cite all important consumers. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "attachment-flow",
+      "tags": ["attachments"],
+      "prompt": "Trace an image or local file attachment from renderer selection to model input or fallback. Identify the cross-layer contract fields and failure points. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "approval-flow",
+      "tags": ["runtime", "security"],
+      "prompt": "Trace a tool approval request from agent loop creation through SSE/UI resolution back to tool execution. Cite routes, gates, and renderer handlers. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "goal-resume",
+      "tags": ["runtime", "goal"],
+      "prompt": "Explain how active goals survive runtime restart, how orphaned turns are reconciled, and where auto-resume is triggered. Cite tests if present. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "subagent-permissions",
+      "tags": ["subagent", "security"],
+      "prompt": "Explain how subagent tool policies inherit or restrict built-in tools, MCP servers, and skills without escalating the parent permissions. Cite enforcement points. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "settings-persistence",
+      "tags": ["settings"],
+      "prompt": "Trace a Kun settings change from renderer state through validation/persistence to managed runtime restart. Highlight rollback behavior. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "test-selection",
+      "tags": ["quality"],
+      "prompt": "Identify how the verify_changes tool selects and runs validation after edits. Explain its safety limits and output contract. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "build-pipeline",
+      "tags": ["build"],
+      "prompt": "Summarize the development, typecheck, test, build, and packaging pipeline for Kun. Cite package scripts and packaging configuration. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "security-boundaries",
+      "tags": ["security"],
+      "prompt": "Map the main trust boundaries for renderer IPC, filesystem tools, command execution, MCP, and secrets. Cite concrete enforcement files. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "runtime-hotspots",
+      "tags": ["performance"],
+      "prompt": "Inspect runtime event persistence, SSE replay, tool execution, and context assembly. Identify three evidence-based performance or memory hotspots with file references. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "thread-persistence",
+      "tags": ["storage"],
+      "prompt": "Explain how thread/session data is persisted and indexed across file and hybrid SQLite stores, including usage carryover. Cite implementation files. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "model-capabilities",
+      "tags": ["provider"],
+      "prompt": "Explain how model capabilities control image input, tool calling, reasoning effort, endpoint format, and context limits. Cite schemas and request construction. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "frontend-chunking",
+      "tags": ["frontend", "performance"],
+      "prompt": "Inspect renderer lazy loading and identify which Workbench surfaces are split into separate chunks and which heavy chat dependencies still load eagerly. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    },
+    {
+      "id": "failure-recovery",
+      "tags": ["runtime", "reliability"],
+      "prompt": "Map how the desktop app detects an unhealthy Kun child, budgets restarts, distinguishes settings restarts from crashes, and reports status to the renderer. Do not modify files.",
+      "expect": { "requiredAnyTools": ["read", "grep", "find", "ls"] }
+    }
+  ]
+}
diff --git a/kun/package.json b/kun/package.json
@@ -61,6 +61,7 @@
     "test": "vitest run",
     "test:watch": "vitest",
     "transcript:diff": "node ./scripts/transcript-diff.mjs",
+    "benchmark:replay": "npm run build && node ./dist/cli/replay-entry.js",
     "serve": "node ./dist/cli/serve-entry.js",
     "dev": "tsc -p tsconfig.build.json --watch"
   },

diff --git a/kun/src/adapters/hybrid/hybrid-thread-store.ts b/kun/src/adapters/hybrid/hybrid-thread-store.ts
@@ -932,6 +932,10 @@ function mergeTurnMetadata(previous: Turn, next: Turn): Turn {
     attachmentIds: mergeStringArrays(previous.attachmentIds, next.attachmentIds),
     activeSkillIds: mergeStringArrays(previous.activeSkillIds, next.activeSkillIds),
     injectedMemoryIds: mergeStringArrays(previous.injectedMemoryIds, next.injectedMemoryIds),
+    injectedMemorySummaries:
+      next.injectedMemorySummaries.length > 0
+        ? next.injectedMemorySummaries
+        : previous.injectedMemorySummaries,
     items: mergeTurnItems(previous.items, next.items)
   }
 }
@@ -971,6 +975,7 @@ function turnFromItems(threadId: string, turnId: string, items: TurnItem[], fall
     attachmentIds: attachmentIdsFromItems(items),
     activeSkillIds: [],
     injectedMemoryIds: [],
+    injectedMemorySummaries: [],
     createdAt,
     finishedAt: hasOpenItem ? undefined : items[items.length - 1]?.finishedAt ?? fallbackTime,
     items

diff --git a/kun/src/adapters/tool/background-shell-tool.ts b/kun/src/adapters/tool/background-shell-tool.ts
@@ -0,0 +1,142 @@
+import { LocalToolHost, type LocalTool } from './local-tool-host.js'
+import { withToolBoundary } from './builtin-tool-utils.js'
+import type { BackgroundShellRecordInput } from './builtin-tool-types.js'
+import {
+  isBashSessionId,
+  listBashSessionRecords,
+  pollBashSession,
+  readBashSessionPayload,
+  stopBashSessionById,
+  writeBashSessionStdin
+} from './builtin-bash-tool.js'
+
+export type BackgroundShellToolOptions = { // options accepted by createBackgroundShellTool
+  listBackgroundSessions?: (threadId?: string) => readonly BackgroundShellRecordInput[] // optional lister used by action "list"; when absent, listBashSessionRecords is called instead
+}
+
+/** Clamps a requested wait to 1-60 whole seconds; any value that is not a finite number becomes 10. */
+function normalizeYieldSeconds(value: unknown): number {
+  const seconds = typeof value === 'number' && Number.isFinite(value) ? Math.floor(value) : 10
+  return Math.min(60, Math.max(1, seconds))
+}
+
+export function createBackgroundShellTool(options: BackgroundShellToolOptions = {}): LocalTool { // builds the 'background_shell' tool definition; the handler body runs inside withToolBoundary
+  return LocalToolHost.defineTool({
+    name: 'background_shell',
+    description:
+      'Manage shell sessions started with bash background=true. The bash tool assigns an 8-character session_id when starting a background command; use that id here. action="list" lists running sessions by default (set include_finished=true to also show completed/stopped/failed sessions; optional thread_only). action="read" returns a non-blocking output snapshot. action="poll" waits up to yield_seconds for more output or exit. action="write" sends stdin via input. action="stop" terminates a running session.',
+    inputSchema: {
+      type: 'object',
+      properties: {
+        action: {
+          type: 'string',
+          enum: ['list', 'read', 'poll', 'write', 'stop']
+        },
+        session_id: {
+          type: 'string',
+          description: 'Required for read, poll, write, and stop. The 8-character id returned by bash when background=true.'
+        },
+        yield_seconds: { type: 'number' }, // normalized by normalizeYieldSeconds: whole seconds in 1-60, default 10
+        include_finished: { type: 'boolean', default: false },
+        thread_only: { type: 'boolean', default: true },
+        input: { type: 'string' }
+      },
+      required: ['action'],
+      additionalProperties: false
+    },
+    policy: 'auto',
+    toolKind: 'tool_call',
+    execute: async (args, context) =>
+      withToolBoundary(async () => {
+        const action = typeof args.action === 'string' ? args.action.trim() : '' // non-string action becomes '' and falls through to the checks below
+        if (action === 'list') {
+          const threadOnly = args.thread_only !== false // anything other than an explicit false keeps the listing thread-scoped
+          const threadId = threadOnly ? context.threadId : undefined // undefined is passed to the lister when thread_only is false
+          let sessions = options.listBackgroundSessions // prefer the injected lister; otherwise fall back to listBashSessionRecords
+            ? [...options.listBackgroundSessions(threadId)]
+            : await listBashSessionRecords(threadId)
+          if (args.include_finished !== true) { // non-running sessions are dropped unless include_finished is exactly true
+            sessions = sessions.filter((session) => session.status === 'running')
+          }
+          return {
+            output: {
+              sessions: sessions.map((session) => ({
+                session_id: session.id,
+                command: session.command,
+                cwd: session.cwd,
+                shell: session.shell,
+                status: session.status,
+                started_at: session.startedAt,
+                ...(session.finishedAt ? { finished_at: session.finishedAt } : {}),
+                exit_code: session.exitCode,
+                output: session.output,
+                ...(session.outputTruncated ? { output_truncated: true } : {}),
+                ...(session.outputFilePath ? { output_file: session.outputFilePath } : {}),
+                detached: session.detached
+              })),
+              running: sessions.filter((session) => session.status === 'running').length // number of running sessions among those returned
+            }
+          }
+        }
+
+        const sessionId = typeof args.session_id === 'string' ? args.session_id.trim() : '' // every non-list action is checked for a session id first
+        if (!sessionId) { // NOTE(review): an unknown action with no session_id gets this error, not the unsupported-action one
+          return { output: { error: 'session_id is required' }, isError: true }
+        }
+        if (!isBashSessionId(sessionId)) {
+          return {
+            output: {
+              error: 'session_id must be the 8-character id returned by bash when background=true',
+              session_id: sessionId
+            },
+            isError: true
+          }
+        }
+
+        if (action === 'read') {
+          const payload = await readBashSessionPayload(sessionId)
+          if (!payload) {
+            return { output: { error: 'background shell session not found', session_id: sessionId }, isError: true }
+          }
+          return { output: payload, isError: payload.status === 'failed' } // a failed session is surfaced as a tool error
+        }
+
+        if (action === 'stop') {
+          const stopped = await stopBashSessionById(sessionId)
+          const payload = await readBashSessionPayload(sessionId) // read after the stop attempt; its status decides isError below
+          if (!payload) {
+            return {
+              output: { error: 'background shell session not found', session_id: sessionId, stopped },
+              isError: true
+            }
+          }
+          return {
+            output: { ...payload, stop_sent: stopped },
+            isError: payload.status === 'running' || payload.status === 'failed' // error if still running after the stop attempt, or failed
+          }
+        }
+
+        if (action === 'write') {
+          const payload = await writeBashSessionStdin(
+            sessionId,
+            typeof args.input === 'string' ? args.input : '', // non-string or missing input is sent as an empty string
+            normalizeYieldSeconds(args.yield_seconds)
+          )
+          if (!payload) {
+            return { output: { error: 'background shell session not found', session_id: sessionId }, isError: true }
+          }
+          return { output: payload, isError: payload.status === 'failed' }
+        }
+
+        if (action === 'poll') {
+          const payload = await pollBashSession(sessionId, normalizeYieldSeconds(args.yield_seconds))
+          if (!payload) {
+            return { output: { error: 'background shell session not found', session_id: sessionId }, isError: true }
+          }
+          return { output: payload, isError: payload.status === 'failed' }
+        }
+
+        return { output: { error: `unsupported background_shell action: ${action}` }, isError: true } // reached for any action outside list/read/stop/write/poll
+      })
+  })
+}