diff --git a/src/pipeline/contribute.ts b/src/pipeline/contribute.ts
index 298359b..ce8abdf 100644
--- a/src/pipeline/contribute.ts
+++ b/src/pipeline/contribute.ts
@@ -3,8 +3,9 @@ import { loadImage, type PipelineContext } from "./context.ts";
import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts";
import { createAgentIssue } from "../github/issue.ts";
-// Content types already covered by the standard library — never suggest these.
-const STANDARD = new Set([
+// Content types already covered by the standard library — never suggest these,
+// and never dispatch a specialist agent for them (see extraction.ts).
+export const STANDARD: ReadonlySet<string> = new Set([
"paragraph", "heading", "list", "table", "formField", "image", "quote", "caption", "footnote",
]);
diff --git a/src/pipeline/extraction.ts b/src/pipeline/extraction.ts
index 54067f8..887c8d9 100644
--- a/src/pipeline/extraction.ts
+++ b/src/pipeline/extraction.ts
@@ -5,6 +5,7 @@ import { loadAgent, type AgentSpec } from "../agents/loader.ts";
import { feedbackPreamble, loadImage, type InputImage, type PipelineContext } from "./context.ts";
import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts";
import { verifyAgentOutput } from "./feedback.ts";
+import { STANDARD as STANDARD_AGENTS } from "./contribute.ts";
import type { Fragment } from "./fragment.ts";
const PAGE_AGENT = "page";
@@ -131,11 +132,109 @@ async function correctPage(
return corrected || null;
}
+// System prompt for the merge step: instructs the model to splice a specialist
+// fragment into the page output, replacing only the page's own representation
+// of that content, and to answer with a single JSON object.
+// NOTE(review): in this patch text the wrapper list was garbled (it read
+// "no / / wrapper" split over two lines); reconstructed here as html/head/body
+// — confirm against the original file. The empty "html" value in the JSON
+// example may likewise have lost a placeholder — confirm.
+const MERGE_SYSTEM = `You merge a higher-fidelity HTML fragment, produced by a specialist agent, into an
+existing accessible HTML page. Replace the page's weaker representation of that SAME content
+with the specialist fragment and change nothing else — keep all other content, order,
+headings, and structure exactly, and never leave both representations (no duplication).
+Output body content only (no html/head/body wrapper).
+Respond with ONLY this JSON: { "html": "" }`;
+
+// Run a library specialist agent against the whole page image, asking it to
+// extract only the content its contract covers.
+//
+// ctx   — pipeline context; supplies the model router and the run log.
+// agent — specialist spec; its `content` plus the shared accessibility
+//         requirements form the system prompt.
+// img   — page image; it is attached to the request whichever capability
+//         ("vision" or "text") is selected.
+//
+// Returns the trimmed HTML fragment, or null when no JSON could be extracted
+// from the reply, the reply sets `no_content`, or `html` is missing/blank.
+// Errors thrown by the router are not caught here.
+async function runSpecialist(ctx: PipelineContext, agent: AgentSpec, img: InputImage): Promise<string | null> {
+  const system = `${agent.content}\n\n${ACCESSIBILITY_REQUIREMENTS}`;
+  const user =
+    `Extract ONLY the content your contract covers from this page image (filename: ${img.name}). ` +
+    `If none is present, return {"no_content": true}. Otherwise respond with ONLY this JSON: ` +
+    `{ "no_content": false, "html": "" }`;
+  // Prefer the vision capability when the agent declares it; otherwise fall back to text.
+  const capability = agent.capabilities.includes("vision") ? "vision" : "text";
+  const res = await ctx.router.complete(
+    agent.name,
+    capability,
+    [
+      { role: "system", content: system },
+      { role: "user", content: user },
+    ],
+    { images: [loadImage(img)] },
+  );
+  // Log the raw reply before parsing so unparseable output is still recorded.
+  ctx.log.agentCall({ agent, phase: "extraction", image: img.name, output: res.text });
+  const parsed = extractJson<{ no_content?: boolean; html?: string }>(res.text);
+  if (!parsed || parsed.no_content || !parsed.html?.trim()) return null;
+  return parsed.html.trim();
+}
+
+// Splice a specialist fragment into the page body, replacing the page's own
+// (weaker) representation of that content.
+//
+// ctx            — pipeline context; supplies the model router and the run log.
+// img            — page image; only its name is used here, for the log entry.
+// pageHtml       — current body HTML of the page.
+// specialistName — name of the specialist, shown in the prompt heading.
+// reason         — why the content was flagged; "flagged" is used when empty.
+// fragment       — the specialist's HTML fragment to splice in.
+//
+// Returns the merged body, or null when no JSON could be extracted from the
+// reply or its `html` is missing/blank (caller keeps the original page output).
+// Errors thrown by the router are not caught here.
+async function mergeSpecialist(
+  ctx: PipelineContext,
+  img: InputImage,
+  pageHtml: string,
+  specialistName: string,
+  reason: string,
+  fragment: string,
+): Promise<string | null> {
+  const user =
+    `## Current page (body HTML)\n\`\`\`html\n${pageHtml}\n\`\`\`\n\n` +
+    `## Specialist (${specialistName}) fragment for the ${reason || "flagged"} content on this page\n` +
+    `\`\`\`html\n${fragment}\n\`\`\`\n\n` +
+    `Replace the page's existing representation of that content with this specialist fragment; ` +
+    `keep everything else unchanged.`;
+  // The merge is a text-only call routed under the page agent's name; no image is sent.
+  const res = await ctx.router.complete(PAGE_AGENT, "text", [
+    { role: "system", content: MERGE_SYSTEM },
+    { role: "user", content: user },
+  ]);
+  // No agent spec is loaded for the merge step, so an inline one is built for the log entry.
+  ctx.log.agentCall({
+    agent: { name: PAGE_AGENT, file: "page.md", content: MERGE_SYSTEM, capabilities: ["text"], sha: null, sessionBuilt: false },
+    phase: "extraction",
+    image: img.name,
+    output: res.text,
+  });
+  const parsed = extractJson<{ html?: string }>(res.text);
+  return parsed?.html?.trim() || null;
+}
+
+// If a page flagged a content type that an EXISTING library agent handles, run
+// that specialist on the page and merge its higher-fidelity fragment into the
+// page output. Non-blocking once the specialist is loaded: any failure while
+// running or merging it leaves the page output unchanged.
+//
+// Returns the (possibly merged) body HTML plus `dispatched`:
+//   false — the name is a standard content type, or no such agent was loaded.
+//   true  — a library specialist was loaded and attempted, even if it found
+//           nothing, the merge produced nothing, or the attempt threw; the
+//           suggestion is already covered and should not be re-filed as a
+//           new-agent issue.
+async function dispatchSpecialist(
+  ctx: PipelineContext,
+  img: InputImage,
+  pageHtml: string,
+  suggestion: { name: string; reason: string },
+): Promise<{ html: string; dispatched: boolean }> {
+  // Strip a trailing ".md" so the lookup uses the logical agent name.
+  const logical = suggestion.name.replace(/\.md$/, "");
+  if (STANDARD_AGENTS.has(logical)) return { html: pageHtml, dispatched: false };
+  const specialist = loadAgent(logical, {
+    agentsDir: ctx.paths.agentsDir,
+    tmpAgentsDir: ctx.paths.tmpAgentsDir(ctx.sessionId),
+  });
+  if (!specialist) return { html: pageHtml, dispatched: false };
+  try {
+    const fragment = await runSpecialist(ctx, specialist, img);
+    if (!fragment) {
+      ctx.log.event("specialist_no_content", { agent: specialist.file, image: img.name });
+      return { html: pageHtml, dispatched: true };
+    }
+    const merged = await mergeSpecialist(ctx, img, pageHtml, specialist.name, suggestion.reason, fragment);
+    ctx.log.event("specialist_dispatched", { agent: specialist.file, image: img.name, merged: Boolean(merged) });
+    // A null merge result falls back to the untouched page output.
+    return { html: merged ?? pageHtml, dispatched: true };
+  } catch (e: unknown) {
+    // Thrown values are not guaranteed to be Error instances; narrow before
+    // reading .message so the log never records `undefined`.
+    const error = e instanceof Error ? e.message : String(e);
+    ctx.log.event("specialist_dispatch_failed", { agent: specialist.file, image: img.name, error });
+    return { html: pageHtml, dispatched: true };
+  }
+}
+
// One fragment per page, in submitted order. Each page is verified for source
// fidelity at build time (PRD §7.5/§7.12); a page that fails gets one self-
// correction pass. Verification is non-blocking — a run never fails because the
-// Feedback Agent is unavailable or unsure. Pages may also flag a content type that
-// warrants a specialist agent, collected as `suggestions` for the contribution step.
+// Feedback Agent is unavailable or unsure. When a page flags a content type that an
+// existing library agent handles, that specialist is dispatched and merged in;
+// otherwise the suggestion is collected for the contribution step.
export async function runExtraction(ctx: PipelineContext): Promise {
const pageAgent = loadPageAgent(ctx);
const fragments: Fragment[] = [];
@@ -145,6 +244,19 @@ export async function runExtraction(ctx: PipelineContext): Promise