From ff8afe32aa026e17055729e440664252d9b76b6d Mon Sep 17 00:00:00 2001 From: code-engineer Date: Fri, 10 Apr 2026 17:46:55 +0200 Subject: [PATCH 01/12] =?UTF-8?q?chore:=20prep=20for=20import-file=20wirin?= =?UTF-8?q?g=20=E2=80=94=20interface=20split=20+=20markdown=20extractor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3b prep commit. Adds the name-agnostic extraction pipeline restructuring that the import-file route handler will orchestrate in the next commit(s): - packages/core/src/extraction-pipeline.ts: split interface. Converter returns { mdIntermediate: string } only via ConverterOutput. ExtractionOutput { mdIntermediate, triples, provenance } remains as the composite type assembled by the orchestrator (route handler). - packages/core/src/index.ts: export ConverterOutput. - packages/cli/src/extraction/markitdown-converter.ts: return type updated to ConverterOutput (no behavior change, same binary invocation). - packages/cli/src/extraction/markdown-extractor.ts: NEW Phase 2 structural extractor (~331 lines) implementing deterministic node-side extraction from Markdown. Handles YAML frontmatter, wikilinks, tags, Dataview inline fields, heading structure. No LLM, no external deps. - packages/cli/src/extraction/index.ts: exports the new extractor. - packages/cli/test/extraction-markdown.test.ts: NEW 27 unit tests covering structural extraction cases. All pass. - packages/core/test/extraction-pipeline.test.ts: updated for split interface. 7/7 pass. - packages/cli/test/document-processor-e2e.test.ts: updated for split interface. - packages/cli/test/extraction-markitdown.test.ts: updated for split interface. Next commit wires POST /api/assertion/:name/import-file to orchestrate Phase 1 (converter) + Phase 2 (markdown extractor) and write triples to the target assertion. Prep commit ships no new HTTP routes — the existing import-file endpoint in daemon.ts is unchanged until Phase 3b completes wiring. 
Part of OriginTrail/dkgv10-spec#77, #79 gap 3, and #80. --- packages/cli/src/extraction/index.ts | 5 + .../cli/src/extraction/markdown-extractor.ts | 331 +++++++++++++++ .../src/extraction/markitdown-converter.ts | 14 +- .../cli/test/document-processor-e2e.test.ts | 24 +- packages/cli/test/extraction-markdown.test.ts | 385 ++++++++++++++++++ .../cli/test/extraction-markitdown.test.ts | 8 +- packages/core/src/extraction-pipeline.ts | 45 +- packages/core/src/index.ts | 1 + .../core/test/extraction-pipeline.test.ts | 22 +- 9 files changed, 784 insertions(+), 51 deletions(-) create mode 100644 packages/cli/src/extraction/markdown-extractor.ts create mode 100644 packages/cli/test/extraction-markdown.test.ts diff --git a/packages/cli/src/extraction/index.ts b/packages/cli/src/extraction/index.ts index a4b72e041..f139cb436 100644 --- a/packages/cli/src/extraction/index.ts +++ b/packages/cli/src/extraction/index.ts @@ -1 +1,6 @@ export { MarkItDownConverter, isMarkItDownAvailable, MARKITDOWN_CONTENT_TYPES } from './markitdown-converter.js'; +export { + extractFromMarkdown, + type MarkdownExtractInput, + type MarkdownExtractOutput, +} from './markdown-extractor.js'; diff --git a/packages/cli/src/extraction/markdown-extractor.ts b/packages/cli/src/extraction/markdown-extractor.ts new file mode 100644 index 000000000..e83965e37 --- /dev/null +++ b/packages/cli/src/extraction/markdown-extractor.ts @@ -0,0 +1,331 @@ +/** + * Phase 2 of document ingestion: deterministic structural extraction + * from a Markdown intermediate to RDF triples + provenance. 
+ * + * This is the "Layer 1 structural" extraction defined by + * `19_MARKDOWN_CONTENT_TYPE.md` — it runs without an LLM and produces + * triples from explicit Markdown/YAML structure only: + * + * - YAML frontmatter keys → subject properties + * - `type` frontmatter key → rdf:type + * - Wikilinks `[[Target]]` → schema:mentions + * - Hashtags `#keyword` → schema:keywords + * - Dataview `key:: value` inline fields → properties + * - Heading hierarchy → dkg:hasSection + * + * Every extracted triple gets a provenance record pointing to a + * `dkg:ExtractionProvenance` blank identifier so downstream consumers + * can distinguish structurally-derived triples from user-asserted ones. + * + * Spec: 05_PROTOCOL_EXTENSIONS.md §6.5.2, 19_MARKDOWN_CONTENT_TYPE.md + */ + +import { load as loadYaml } from 'js-yaml'; +import type { ExtractionQuad as Quad } from '@origintrail-official/dkg-core'; + +const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; +const SCHEMA_NAME = 'http://schema.org/name'; +const SCHEMA_DESCRIPTION = 'http://schema.org/description'; +const SCHEMA_MENTIONS = 'http://schema.org/mentions'; +const SCHEMA_KEYWORDS = 'http://schema.org/keywords'; +const DKG_HAS_SECTION = 'http://dkg.io/ontology/hasSection'; +const DKG_EXTRACTION_PROVENANCE = 'http://dkg.io/ontology/ExtractionProvenance'; +const DKG_DERIVED_FROM = 'http://dkg.io/ontology/derivedFrom'; +const DKG_EXTRACTED_BY = 'http://dkg.io/ontology/extractedBy'; +const DKG_EXTRACTION_RULE = 'http://dkg.io/ontology/extractionRule'; +const DKG_EXTRACTED_AT = 'http://dkg.io/ontology/extractedAt'; +const PROV_WAS_GENERATED_BY = 'http://www.w3.org/ns/prov#wasGeneratedBy'; +const XSD_DATE_TIME = 'http://www.w3.org/2001/XMLSchema#dateTime'; + +export interface MarkdownExtractInput { + /** Markdown source text (the Phase 1 mdIntermediate). */ + markdown: string; + /** DID of the extracting agent, recorded in provenance. 
*/ + agentDid: string; + /** Optional ontology URI (not yet used by Layer 1 — reserved for Layer 2). */ + ontologyRef?: string; + /** + * Optional stable subject IRI for the document. When omitted, the extractor + * derives a subject from frontmatter `id` or the first H1 heading. + */ + documentIri?: string; + /** Optional timestamp for provenance (defaults to now). */ + now?: Date; +} + +export interface MarkdownExtractOutput { + /** Extracted RDF triples. */ + triples: Quad[]; + /** dkg:ExtractionProvenance quads for the extraction run. */ + provenance: Quad[]; + /** The subject IRI used for the document (useful to the caller for indexing). */ + subjectIri: string; +} + +/** + * Parse YAML frontmatter if present. Returns the parsed object and the + * remaining markdown body with frontmatter stripped. + */ +function splitFrontmatter(markdown: string): { frontmatter: Record | null; body: string } { + if (!markdown.startsWith('---')) { + return { frontmatter: null, body: markdown }; + } + // Match the opening --- and find the closing --- + const lines = markdown.split(/\r?\n/); + if (lines[0].trim() !== '---') { + return { frontmatter: null, body: markdown }; + } + let endIndex = -1; + for (let i = 1; i < lines.length; i++) { + if (lines[i].trim() === '---') { + endIndex = i; + break; + } + } + if (endIndex === -1) { + return { frontmatter: null, body: markdown }; + } + const yamlText = lines.slice(1, endIndex).join('\n'); + let parsed: unknown; + try { + parsed = loadYaml(yamlText); + } catch { + return { frontmatter: null, body: markdown }; + } + if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) { + return { frontmatter: null, body: markdown }; + } + const body = lines.slice(endIndex + 1).join('\n'); + return { frontmatter: parsed as Record, body }; +} + +/** Extract the text of the first level-1 heading, if any. */ +function findFirstH1(body: string): string | null { + const m = body.match(/^#\s+(.+?)\s*$/m); + return m ? 
m[1].trim() : null; +} + +/** + * Slugify a string for use in an IRI fragment. Keeps alphanumerics and hyphens. + */ +function slugify(input: string): string { + return input + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .slice(0, 80); +} + +/** + * Resolve a stable subject IRI for the document: + * 1. explicit `documentIri` argument, or + * 2. frontmatter `id` (if it looks like an IRI or a slug), or + * 3. slugified first H1 heading with an `urn:dkg:md:` prefix, or + * 4. stable fallback `urn:dkg:md:anonymous-{short-hash}`. + */ +function resolveSubjectIri( + input: MarkdownExtractInput, + frontmatter: Record | null, + body: string, +): string { + if (input.documentIri && input.documentIri.length > 0) return input.documentIri; + + const fmId = frontmatter?.['id']; + if (typeof fmId === 'string' && fmId.length > 0) { + if (/^(https?:|did:|urn:|_:)/.test(fmId)) return fmId; + return `urn:dkg:md:${slugify(fmId)}`; + } + + const h1 = findFirstH1(body); + if (h1) return `urn:dkg:md:${slugify(h1)}`; + + // Stable fallback: hash-like suffix derived from content length and first chars + const snippet = body.slice(0, 32).replace(/\s+/g, '-').replace(/[^a-zA-Z0-9-]/g, ''); + return `urn:dkg:md:anonymous-${snippet.slice(0, 16) || 'empty'}`; +} + +/** Resolve a value from a frontmatter `type` field to a full IRI. */ +function resolveTypeIri(typeValue: unknown): string | null { + if (typeof typeValue !== 'string' || typeValue.length === 0) return null; + if (/^(https?:|did:|urn:)/.test(typeValue)) return typeValue; + // Treat bare identifiers as schema.org classes by convention (Report, Person, etc.) + return `http://schema.org/${typeValue}`; +} + +/** Resolve a frontmatter scalar value to a triple object literal or IRI. 
*/ +function resolveFrontmatterValue(value: unknown): string | null { + if (value === null || value === undefined) return null; + if (typeof value === 'string') { + if (/^(https?:|did:|urn:)/.test(value)) return value; + return JSON.stringify(value); + } + if (typeof value === 'number' || typeof value === 'boolean') { + return JSON.stringify(String(value)); + } + return null; +} + +/** Extract wikilinks `[[Target]]` or `[[Target|Alt]]` → IRIs using the `urn:dkg:md:` namespace. */ +function extractWikilinks(body: string): string[] { + const out = new Set(); + const re = /\[\[([^\]|#]+?)(?:#[^\]|]*)?(?:\|[^\]]*?)?\]\]/g; + let m: RegExpExecArray | null; + while ((m = re.exec(body)) !== null) { + const target = m[1].trim(); + if (target.length === 0) continue; + out.add(`urn:dkg:md:${slugify(target)}`); + } + return [...out]; +} + +/** + * Extract hashtags `#tag` from the body. Excludes markdown headings + * (lines starting with `#` followed by a space) and code fence contents. + */ +function extractHashtags(body: string): string[] { + const out = new Set(); + const noFences = stripCodeFences(body); + const noHeadings = noFences.replace(/^#{1,6}\s+.*$/gm, ''); + // Match `#word` where word is alphanumeric + `_`/`-`/`/`, not preceded by `[` + // (to avoid `[#heading]` anchors) and not followed by more `#`. + const re = /(?:^|[^\w#[/])#([a-zA-Z][\w-/]*)/g; + let m: RegExpExecArray | null; + while ((m = re.exec(noHeadings)) !== null) { + out.add(m[1]); + } + return [...out]; +} + +/** + * Extract Dataview inline fields: `key:: value` at line-start (allowing leading whitespace). + * Returns key-value pairs with raw string values; the caller translates to triples. 
+ */ +function extractDataviewFields(body: string): Array<{ key: string; value: string }> { + const out: Array<{ key: string; value: string }> = []; + const noFences = stripCodeFences(body); + const re = /^[\s>*-]*([a-zA-Z][\w-]*)::\s*(.+?)\s*$/gm; + let m: RegExpExecArray | null; + while ((m = re.exec(noFences)) !== null) { + out.push({ key: m[1], value: m[2] }); + } + return out; +} + +/** Extract section headings (H1..H6) as an ordered list with levels. */ +function extractHeadings(body: string): Array<{ level: number; text: string }> { + const noFences = stripCodeFences(body); + const out: Array<{ level: number; text: string }> = []; + const re = /^(#{1,6})\s+(.+?)\s*#*\s*$/gm; + let m: RegExpExecArray | null; + while ((m = re.exec(noFences)) !== null) { + out.push({ level: m[1].length, text: m[2].trim() }); + } + return out; +} + +/** Strip ``` fenced code blocks (and ~~~ variants) from the markdown. */ +function stripCodeFences(body: string): string { + return body.replace(/^(```|~~~)[\s\S]*?^\1\s*$/gm, ''); +} + +/** + * Run the full Phase 2 structural extraction. Deterministic, no LLM. + * Returns `{ triples, provenance, subjectIri }`. Empty arrays are valid + * — a Markdown document with no frontmatter, no wikilinks, no tags, no + * dataview fields, and no headings produces zero triples. + */ +export function extractFromMarkdown(input: MarkdownExtractInput): MarkdownExtractOutput { + const triples: Quad[] = []; + const now = input.now ?? new Date(); + + const { frontmatter, body } = splitFrontmatter(input.markdown); + const subject = resolveSubjectIri(input, frontmatter, body); + + // ── 1. 
YAML frontmatter → properties ─────────────────────────────── + if (frontmatter) { + for (const [key, value] of Object.entries(frontmatter)) { + if (key === 'id') continue; // already used as subject identifier + if (key === 'type') { + const typeIri = resolveTypeIri(value); + if (typeIri) triples.push({ subject, predicate: RDF_TYPE, object: typeIri }); + continue; + } + // Array values emit one triple per element. + const values = Array.isArray(value) ? value : [value]; + for (const v of values) { + const obj = resolveFrontmatterValue(v); + if (obj === null) continue; + const predicate = frontmatterKeyToPredicate(key); + triples.push({ subject, predicate, object: obj }); + } + } + } + + // Promote first H1 → schema:name if no explicit name triple exists. + const h1 = findFirstH1(body); + if (h1 && !triples.some(q => q.predicate === SCHEMA_NAME)) { + triples.push({ subject, predicate: SCHEMA_NAME, object: JSON.stringify(h1) }); + } + + // ── 2. Wikilinks → schema:mentions ───────────────────────────────── + for (const target of extractWikilinks(body)) { + triples.push({ subject, predicate: SCHEMA_MENTIONS, object: target }); + } + + // ── 3. Hashtags → schema:keywords ────────────────────────────────── + for (const tag of extractHashtags(body)) { + triples.push({ subject, predicate: SCHEMA_KEYWORDS, object: JSON.stringify(tag) }); + } + + // ── 4. Dataview inline fields → properties ───────────────────────── + for (const { key, value } of extractDataviewFields(body)) { + const predicate = frontmatterKeyToPredicate(key); + const obj = /^(https?:|did:|urn:)/.test(value) ? value : JSON.stringify(value); + triples.push({ subject, predicate, object: obj }); + } + + // ── 5. 
Headings → dkg:hasSection ─────────────────────────────────── + for (const heading of extractHeadings(body)) { + if (heading.level === 1) continue; // H1 is the document title, not a section + const sectionIri = `${subject}#section-${slugify(heading.text)}`; + triples.push({ subject, predicate: DKG_HAS_SECTION, object: sectionIri }); + triples.push({ subject: sectionIri, predicate: SCHEMA_NAME, object: JSON.stringify(heading.text) }); + } + + // ── Provenance ───────────────────────────────────────────────────── + const provenance = buildProvenance({ + subject, + agentDid: input.agentDid, + tripleCount: triples.length, + now, + }); + + return { triples, provenance, subjectIri: subject }; +} + +function frontmatterKeyToPredicate(key: string): string { + if (key === 'name' || key === 'title') return SCHEMA_NAME; + if (key === 'description' || key === 'summary') return SCHEMA_DESCRIPTION; + if (key === 'keywords' || key === 'tags') return SCHEMA_KEYWORDS; + // Unknown keys fall back into the schema.org namespace (same convention as `type`). 
+ return `http://schema.org/${key}`; +} + +function buildProvenance(args: { + subject: string; + agentDid: string; + tripleCount: number; + now: Date; +}): Quad[] { + if (args.tripleCount === 0) return []; + const provIri = `urn:dkg:extraction:${slugify(args.subject)}-${args.now.getTime()}`; + const xsdDateTime = `"${args.now.toISOString()}"^^<${XSD_DATE_TIME}>`; + return [ + { subject: provIri, predicate: RDF_TYPE, object: DKG_EXTRACTION_PROVENANCE }, + { subject: provIri, predicate: DKG_EXTRACTED_BY, object: args.agentDid }, + { subject: provIri, predicate: DKG_EXTRACTION_RULE, object: JSON.stringify('markdown-structural-v1') }, + { subject: provIri, predicate: DKG_EXTRACTED_AT, object: xsdDateTime }, + { subject: provIri, predicate: DKG_DERIVED_FROM, object: args.subject }, + { subject: args.subject, predicate: PROV_WAS_GENERATED_BY, object: provIri }, + ]; +} diff --git a/packages/cli/src/extraction/markitdown-converter.ts b/packages/cli/src/extraction/markitdown-converter.ts index 1ccb15616..fa86ad5e8 100644 --- a/packages/cli/src/extraction/markitdown-converter.ts +++ b/packages/cli/src/extraction/markitdown-converter.ts @@ -13,7 +13,7 @@ import { existsSync } from 'node:fs'; import { resolve, join } from 'node:path'; import { platform, arch } from 'node:process'; import { fileURLToPath } from 'node:url'; -import type { ExtractionPipeline, ExtractionInput, ExtractionOutput } from '@origintrail-official/dkg-core'; +import type { ExtractionPipeline, ExtractionInput, ConverterOutput } from '@origintrail-official/dkg-core'; const MAX_OUTPUT_BYTES = 50 * 1024 * 1024; // 50 MB @@ -83,16 +83,8 @@ export const MARKITDOWN_CONTENT_TYPES = [ export class MarkItDownConverter implements ExtractionPipeline { readonly contentTypes = [...MARKITDOWN_CONTENT_TYPES]; - async extract(input: ExtractionInput): Promise { + async extract(input: ExtractionInput): Promise { const markdown = await runMarkItDown(input.filePath); - - // Phase 2 (markdown → triples) is handled by the 
Markdown extraction pipeline - // which runs separately. This converter only does phase 1: file → Markdown. - // Return the intermediate with empty triples; the caller chains the MD pipeline. - return { - mdIntermediate: markdown, - triples: [], - provenance: [], - }; + return { mdIntermediate: markdown }; } } diff --git a/packages/cli/test/document-processor-e2e.test.ts b/packages/cli/test/document-processor-e2e.test.ts index 551c89d35..f1c721866 100644 --- a/packages/cli/test/document-processor-e2e.test.ts +++ b/packages/cli/test/document-processor-e2e.test.ts @@ -13,7 +13,7 @@ import { ExtractionPipelineRegistry, type ExtractionPipeline, type ExtractionInput, - type ExtractionOutput, + type ConverterOutput, } from '@origintrail-official/dkg-core'; import { MarkItDownConverter, isMarkItDownAvailable } from '../src/extraction/index.js'; @@ -59,9 +59,9 @@ describe('ExtractionPipelineRegistry E2E', () => { const customMdPipeline: ExtractionPipeline = { contentTypes: ['text/markdown'], - async extract(input: ExtractionInput): Promise { + async extract(input: ExtractionInput): Promise { const md = await readFile(input.filePath, 'utf-8'); - return { mdIntermediate: md, triples: [], provenance: [] }; + return { mdIntermediate: md }; }, }; @@ -113,8 +113,6 @@ describe.skipIf(!markitdownAvailable)('MarkItDown E2E — real file conversion', expect(result.mdIntermediate).toBeTruthy(); expect(result.mdIntermediate).toContain('Research Paper'); expect(result.mdIntermediate).toContain('decentralized knowledge graphs'); - expect(result.triples).toEqual([]); - expect(result.provenance).toEqual([]); }); it('converts a CSV file to Markdown', async () => { @@ -144,7 +142,6 @@ describe.skipIf(!markitdownAvailable)('MarkItDown E2E — real file conversion', }); expect(typeof result.mdIntermediate).toBe('string'); - expect(result.triples).toEqual([]); }); it('processes file through registry lookup → extract', async () => { @@ -207,7 +204,7 @@ describe('Full extraction pipeline 
simulation', () => { contentTypes: ['text/markdown'], async extract(input) { const md = await readFile(input.filePath, 'utf-8'); - return { mdIntermediate: md, triples: [], provenance: [] }; + return { mdIntermediate: md }; }, }; @@ -277,15 +274,13 @@ describe('Full extraction pipeline simulation', () => { const registry = new ExtractionPipelineRegistry(); - // Register a mock HTML pipeline + // Register a mock HTML pipeline (Phase 1 converter — mdIntermediate only) registry.register({ contentTypes: ['text/html'], async extract(input) { const content = await readFile(input.filePath, 'utf-8'); return { mdIntermediate: content.replace(/<[^>]+>/g, ''), - triples: [{ subject: 'urn:sales:q4', predicate: 'rdf:type', object: 'schema:Report' }], - provenance: [], }; }, }); @@ -299,19 +294,24 @@ describe('Full extraction pipeline simulation', () => { agentDid: 'did:dkg:agent:0xSales', }); + // Phase 2 (simulated): the route handler would run the Markdown extractor + // on `result.mdIntermediate` to produce triples/provenance. + const phase2Triples = [{ subject: 'urn:sales:q4', predicate: 'rdf:type', object: 'schema:Report' }]; + // Build the import-file response as the daemon would const importFileResponse = { assertionUri: 'did:dkg:context-graph:sales/assertion/0xSales/q4-report', fileHash: 'sha256:abc123', detectedContentType: 'text/html', extraction: { - status: result.triples.length > 0 ? 'completed' as const : 'skipped' as const, - tripleCount: result.triples.length, + status: phase2Triples.length > 0 ? 
'completed' as const : 'skipped' as const, + tripleCount: phase2Triples.length, mdIntermediateHash: 'sha256:def456', pipelineUsed: 'text/html', }, }; + expect(result.mdIntermediate).toContain('Q4 Sales'); expect(importFileResponse.extraction.status).toBe('completed'); expect(importFileResponse.extraction.tripleCount).toBe(1); expect(importFileResponse.extraction.pipelineUsed).toBe('text/html'); diff --git a/packages/cli/test/extraction-markdown.test.ts b/packages/cli/test/extraction-markdown.test.ts new file mode 100644 index 000000000..77abc3b5b --- /dev/null +++ b/packages/cli/test/extraction-markdown.test.ts @@ -0,0 +1,385 @@ +import { describe, it, expect } from 'vitest'; +import { extractFromMarkdown } from '../src/extraction/markdown-extractor.js'; + +const AGENT = 'did:dkg:agent:0xAbC123'; +const FIXED_NOW = new Date('2026-04-10T12:00:00Z'); + +const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; +const SCHEMA_NAME = 'http://schema.org/name'; +const SCHEMA_DESCRIPTION = 'http://schema.org/description'; +const SCHEMA_MENTIONS = 'http://schema.org/mentions'; +const SCHEMA_KEYWORDS = 'http://schema.org/keywords'; +const DKG_HAS_SECTION = 'http://dkg.io/ontology/hasSection'; +const DKG_EXTRACTION_PROVENANCE = 'http://dkg.io/ontology/ExtractionProvenance'; +const PROV_WAS_GENERATED_BY = 'http://www.w3.org/ns/prov#wasGeneratedBy'; + +describe('extractFromMarkdown — frontmatter', () => { + it('extracts rdf:type from frontmatter `type` key (schema.org convention)', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `---\nid: climate-report-2026\ntype: Report\n---\n\n# Climate Report\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri).toBe('urn:dkg:md:climate-report-2026'); + expect(triples).toContainEqual({ + subject: subjectIri, + predicate: RDF_TYPE, + object: 'http://schema.org/Report', + }); + }); + + it('extracts full IRI `type` without namespacing', () => { + const { triples } = extractFromMarkdown({ + 
markdown: `---\nid: x\ntype: https://example.org/ontology/Thing\n---\n\n# X\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples.some(t => t.predicate === RDF_TYPE && t.object === 'https://example.org/ontology/Thing')).toBe(true); + }); + + it('maps `title` to schema:name and `description` to schema:description', () => { + const { triples } = extractFromMarkdown({ + markdown: `---\nid: doc-1\ntitle: Hello World\ndescription: A short doc\n---\n\nBody.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ subject: 'urn:dkg:md:doc-1', predicate: SCHEMA_NAME, object: '"Hello World"' }); + expect(triples).toContainEqual({ subject: 'urn:dkg:md:doc-1', predicate: SCHEMA_DESCRIPTION, object: '"A short doc"' }); + }); + + it('emits one triple per element for array values in frontmatter', () => { + const { triples } = extractFromMarkdown({ + markdown: `---\nid: doc\nauthors:\n - Alice\n - Bob\n---\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const authors = triples.filter(t => t.predicate === 'http://schema.org/authors'); + expect(authors.map(t => t.object).sort()).toEqual(['"Alice"', '"Bob"']); + }); + + it('handles numeric and boolean scalars', () => { + const { triples } = extractFromMarkdown({ + markdown: `---\nid: doc\npageCount: 42\npublished: true\n---\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ subject: 'urn:dkg:md:doc', predicate: 'http://schema.org/pageCount', object: '"42"' }); + expect(triples).toContainEqual({ subject: 'urn:dkg:md:doc', predicate: 'http://schema.org/published', object: '"true"' }); + }); + + it('ignores frontmatter with invalid YAML (fallthrough to body)', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `---\nid: {broken yaml\n---\n\n# Fallback\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + // Subject should derive from the H1 because frontmatter is rejected + expect(subjectIri).toBe('urn:dkg:md:fallback'); + 
expect(triples).toContainEqual({ subject: subjectIri, predicate: SCHEMA_NAME, object: '"Fallback"' }); + }); +}); + +describe('extractFromMarkdown — wikilinks', () => { + it('extracts bare wikilinks', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# Doc\n\nSee [[Alice]] and [[Bob]] for details.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: SCHEMA_MENTIONS, object: 'urn:dkg:md:alice' }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: SCHEMA_MENTIONS, object: 'urn:dkg:md:bob' }); + }); + + it('extracts piped wikilinks `[[Target|alt]]`', () => { + const { triples } = extractFromMarkdown({ + markdown: `# Doc\n\nSee [[Charlie Chocolate|Charlie]].\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples.some(t => t.predicate === SCHEMA_MENTIONS && t.object === 'urn:dkg:md:charlie-chocolate')).toBe(true); + }); + + it('deduplicates wikilinks', () => { + const { triples } = extractFromMarkdown({ + markdown: `# Doc\n\n[[Alice]] [[Alice]] [[Alice]]\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const mentions = triples.filter(t => t.predicate === SCHEMA_MENTIONS); + expect(mentions).toHaveLength(1); + }); +}); + +describe('extractFromMarkdown — hashtags', () => { + it('extracts hashtags as schema:keywords', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# Doc\n\nSome text #climate #policy and more.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: SCHEMA_KEYWORDS, object: '"climate"' }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: SCHEMA_KEYWORDS, object: '"policy"' }); + }); + + it('does not treat markdown headings as hashtags', () => { + const { triples } = extractFromMarkdown({ + markdown: `# Title\n\n## Section\n\nBody without tags.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const keywords = triples.filter(t => t.predicate 
=== SCHEMA_KEYWORDS); + expect(keywords).toHaveLength(0); + }); + + it('ignores hashtags inside code fences', () => { + const { triples } = extractFromMarkdown({ + markdown: `# Doc\n\n\`\`\`bash\n# a comment #notatag\n\`\`\`\n\nBody #realtag here.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const keywords = triples.filter(t => t.predicate === SCHEMA_KEYWORDS).map(t => t.object); + expect(keywords).toContain('"realtag"'); + expect(keywords).not.toContain('"notatag"'); + expect(keywords).not.toContain('"a"'); + }); +}); + +describe('extractFromMarkdown — Dataview inline fields', () => { + it('extracts `key:: value` lines', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# Doc\n\nauthor:: Alice\nstatus:: draft\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: 'http://schema.org/author', object: '"Alice"' }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: 'http://schema.org/status', object: '"draft"' }); + }); + + it('preserves IRI values as IRIs (not literals)', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# Doc\n\nhomepage:: https://example.org/home\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: 'http://schema.org/homepage', object: 'https://example.org/home' }); + }); + + it('ignores dataview-like syntax inside code fences', () => { + const { triples } = extractFromMarkdown({ + markdown: `# Doc\n\n\`\`\`\nfake:: not a field\n\`\`\`\n\nreal:: value\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const dataview = triples.filter(t => t.predicate.startsWith('http://schema.org/')); + expect(dataview.some(t => t.predicate === 'http://schema.org/real')).toBe(true); + expect(dataview.some(t => t.predicate === 'http://schema.org/fake')).toBe(false); + }); +}); + +describe('extractFromMarkdown — headings', () => { + it('emits dkg:hasSection triples for H2+ 
headings but not H1', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# Title\n\n## Intro\n\n## Methods\n\n### Sub-method\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const sections = triples.filter(t => t.predicate === DKG_HAS_SECTION); + expect(sections).toHaveLength(3); + expect(sections.map(t => t.object)).toEqual([ + `${subjectIri}#section-intro`, + `${subjectIri}#section-methods`, + `${subjectIri}#section-sub-method`, + ]); + // Each section should have a schema:name + for (const section of sections) { + expect(triples.some(t => t.subject === section.object && t.predicate === SCHEMA_NAME)).toBe(true); + } + }); + + it('H1 promotes to schema:name on the document subject', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# My Document\n\nBody.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: SCHEMA_NAME, object: '"My Document"' }); + }); + + it('H1 does not overwrite an explicit frontmatter title', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `---\nid: x\ntitle: Explicit Title\n---\n\n# Different H1\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const names = triples.filter(t => t.subject === subjectIri && t.predicate === SCHEMA_NAME); + expect(names).toHaveLength(1); + expect(names[0].object).toBe('"Explicit Title"'); + }); +}); + +describe('extractFromMarkdown — subject IRI resolution', () => { + it('prefers explicit documentIri input', () => { + const { subjectIri } = extractFromMarkdown({ + markdown: `---\nid: ignored\n---\n\n# H1 Also Ignored\n`, + agentDid: AGENT, + documentIri: 'did:dkg:context-graph:foo/assertion/0xabc/mydoc', + now: FIXED_NOW, + }); + expect(subjectIri).toBe('did:dkg:context-graph:foo/assertion/0xabc/mydoc'); + }); + + it('uses frontmatter id as-is when it looks like an IRI', () => { + const { subjectIri } = extractFromMarkdown({ + markdown: `---\nid: 
https://example.org/thing/42\n---\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri).toBe('https://example.org/thing/42'); + }); + + it('slugifies a frontmatter id that is not an IRI', () => { + const { subjectIri } = extractFromMarkdown({ + markdown: `---\nid: My Great Document!\n---\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri).toBe('urn:dkg:md:my-great-document'); + }); + + it('falls back to slugified H1 when no id is present', () => { + const { subjectIri } = extractFromMarkdown({ + markdown: `# A Title of Things\n\nBody.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri).toBe('urn:dkg:md:a-title-of-things'); + }); + + it('produces a stable anonymous fallback when there is no title', () => { + const { subjectIri } = extractFromMarkdown({ + markdown: `Just a body. No headings, no frontmatter.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri.startsWith('urn:dkg:md:anonymous-')).toBe(true); + }); +}); + +describe('extractFromMarkdown — provenance', () => { + it('emits a single provenance block when triples are produced', () => { + const { triples, provenance } = extractFromMarkdown({ + markdown: `# Doc\n\n#tag1\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples.length).toBeGreaterThan(0); + expect(provenance.length).toBeGreaterThan(0); + expect(provenance).toContainEqual(expect.objectContaining({ + predicate: RDF_TYPE, + object: DKG_EXTRACTION_PROVENANCE, + })); + // Back-link from subject to provenance + expect(provenance.some(q => q.predicate === PROV_WAS_GENERATED_BY)).toBe(true); + }); + + it('emits no provenance when no triples are extracted', () => { + const { triples, provenance } = extractFromMarkdown({ + markdown: ``, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toHaveLength(0); + expect(provenance).toHaveLength(0); + }); + + it('records the extracting agent DID in provenance', () => { + const { provenance } = extractFromMarkdown({ + markdown: `# 
Doc\n\n#tag\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(provenance.some(q => q.object === AGENT)).toBe(true); + }); +}); + +describe('extractFromMarkdown — end-to-end', () => { + it('handles a full document with frontmatter, H1, tags, wikilinks, dataview, and sections', () => { + const markdown = `--- +id: research-note +type: ScholarlyArticle +title: On Decentralized Knowledge Graphs +description: Exploring DKG fundamentals +authors: + - Alice + - Bob +--- + +# On Decentralized Knowledge Graphs + +status:: draft +topic:: knowledge graphs + +This note discusses [[Decentralized Identifiers]] and [[RDF]] concepts. + +It covers #knowledge-graphs and #dkg topics in depth. + +## Background + +Some background. + +## Methods + +Our method relies on [[SPARQL]] queries. +`; + const { triples, provenance, subjectIri } = extractFromMarkdown({ + markdown, + agentDid: AGENT, + now: FIXED_NOW, + }); + + expect(subjectIri).toBe('urn:dkg:md:research-note'); + + // Type + expect(triples).toContainEqual({ + subject: subjectIri, + predicate: RDF_TYPE, + object: 'http://schema.org/ScholarlyArticle', + }); + + // Name from frontmatter title (NOT from H1 since title is set) + expect(triples.filter(t => t.predicate === SCHEMA_NAME && t.subject === subjectIri)).toEqual([ + { subject: subjectIri, predicate: SCHEMA_NAME, object: '"On Decentralized Knowledge Graphs"' }, + ]); + + // Authors + const authors = triples.filter(t => t.predicate === 'http://schema.org/authors').map(t => t.object); + expect(authors).toContain('"Alice"'); + expect(authors).toContain('"Bob"'); + + // Dataview fields + expect(triples).toContainEqual({ subject: subjectIri, predicate: 'http://schema.org/status', object: '"draft"' }); + expect(triples).toContainEqual({ subject: subjectIri, predicate: 'http://schema.org/topic', object: '"knowledge graphs"' }); + + // Wikilinks + const mentions = triples.filter(t => t.predicate === SCHEMA_MENTIONS).map(t => t.object); + 
expect(mentions).toContain('urn:dkg:md:decentralized-identifiers'); + expect(mentions).toContain('urn:dkg:md:rdf'); + expect(mentions).toContain('urn:dkg:md:sparql'); + + // Tags + const tags = triples.filter(t => t.predicate === SCHEMA_KEYWORDS).map(t => t.object); + expect(tags).toContain('"knowledge-graphs"'); + expect(tags).toContain('"dkg"'); + + // Sections + const sections = triples.filter(t => t.predicate === DKG_HAS_SECTION).map(t => t.object); + expect(sections).toEqual([ + `${subjectIri}#section-background`, + `${subjectIri}#section-methods`, + ]); + + // Provenance present + expect(provenance.length).toBeGreaterThan(0); + expect(provenance.some(q => q.object === AGENT)).toBe(true); + }); +}); diff --git a/packages/cli/test/extraction-markitdown.test.ts b/packages/cli/test/extraction-markitdown.test.ts index 5da39770d..26b749992 100644 --- a/packages/cli/test/extraction-markitdown.test.ts +++ b/packages/cli/test/extraction-markitdown.test.ts @@ -60,7 +60,7 @@ describe('MarkItDownConverter', () => { expect(converter.contentTypes.length).toBeGreaterThanOrEqual(6); }); - it('extract returns mdIntermediate with empty triples (phase 1 only)', async () => { + it('extract returns ConverterOutput with mdIntermediate only (phase 1)', async () => { const converter = new MarkItDownConverter(); // If markitdown is not available, the extract call should throw @@ -89,9 +89,9 @@ describe('MarkItDownConverter', () => { expect(typeof result.mdIntermediate).toBe('string'); expect(result.mdIntermediate.length).toBeGreaterThan(0); - // Phase 1 only — triples are produced by the Markdown extraction pipeline - expect(result.triples).toEqual([]); - expect(result.provenance).toEqual([]); + // Phase 1 only — converter returns ConverterOutput, no triples/provenance. 
+ expect((result as { triples?: unknown }).triples).toBeUndefined(); + expect((result as { provenance?: unknown }).provenance).toBeUndefined(); } finally { await rm(tmpDir, { recursive: true, force: true }); } diff --git a/packages/core/src/extraction-pipeline.ts b/packages/core/src/extraction-pipeline.ts index 99459f217..fd28ad03f 100644 --- a/packages/core/src/extraction-pipeline.ts +++ b/packages/core/src/extraction-pipeline.ts @@ -1,6 +1,16 @@ /** - * Pluggable extraction pipeline interface for converting non-RDF files - * (PDF, DOCX, etc.) into Markdown intermediates and RDF triples. + * Pluggable extraction pipeline interfaces for the document ingestion flow. + * + * Two phases: + * - Phase 1 (converter): source file → Markdown intermediate. + * Implemented by ExtractionPipeline (e.g. MarkItDownConverter). + * - Phase 2 (structural extraction): Markdown intermediate → RDF triples. + * Runs directly in the import-file route handler — not through a + * pluggable registry. See 19_MARKDOWN_CONTENT_TYPE.md. + * + * The route handler orchestrates both phases and returns an + * ExtractionOutput that composes Phase 1's mdIntermediate with + * Phase 2's triples and provenance. * * Spec: 05_PROTOCOL_EXTENSIONS.md §6.5 */ @@ -23,26 +33,39 @@ export interface ExtractionInput { agentDid: string; } +/** + * Phase 1 converter output. A converter is responsible ONLY for turning + * a source file into a Markdown intermediate. It does not produce triples. + */ +export interface ConverterOutput { + /** Markdown intermediate, stored alongside the original file and inspectable. */ + mdIntermediate: string; +} + +/** + * Composite Phase 1 + Phase 2 result produced by the import-file route + * handler. `mdIntermediate` is byte-for-byte what the converter returned; + * `triples` and `provenance` come from the Phase 2 Markdown extractor. + */ export interface ExtractionOutput { - /** Markdown intermediate (stored alongside original, inspectable). 
*/ mdIntermediate: string; - /** Extracted RDF triples. */ triples: Quad[]; - /** dkg:ExtractionProvenance quads for semantically extracted triples. */ provenance: Quad[]; } export interface ExtractionPipeline { - /** MIME content types this pipeline handles. */ + /** MIME content types this converter handles. */ readonly contentTypes: string[]; - /** Convert a file to Markdown intermediate + RDF triples. */ - extract(input: ExtractionInput): Promise; + /** Convert a source file into a Markdown intermediate. Phase 1 only. */ + extract(input: ExtractionInput): Promise; } /** - * Registry that maps content types to extraction pipelines. - * Nodes register pipelines at startup; the import-file endpoint - * looks up the pipeline for the detected content type. + * Registry that maps content types to converter pipelines. + * Nodes register pipelines at startup; the import-file route handler + * looks up the pipeline for the detected content type and calls its + * Phase 1 `extract()`. Phase 2 is not registered — the handler runs + * it directly on the Markdown intermediate. 
*/ export class ExtractionPipelineRegistry { private readonly pipelines = new Map(); diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index e8cf11798..9880bc37e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -47,6 +47,7 @@ export { export { type Quad as ExtractionQuad, type ExtractionInput, + type ConverterOutput, type ExtractionOutput, type ExtractionPipeline, ExtractionPipelineRegistry, diff --git a/packages/core/test/extraction-pipeline.test.ts b/packages/core/test/extraction-pipeline.test.ts index 0d99aee6b..b78a7d919 100644 --- a/packages/core/test/extraction-pipeline.test.ts +++ b/packages/core/test/extraction-pipeline.test.ts @@ -3,17 +3,15 @@ import { ExtractionPipelineRegistry, type ExtractionPipeline, type ExtractionInput, - type ExtractionOutput, + type ConverterOutput, } from '../src/extraction-pipeline.js'; -function makePipeline(contentTypes: string[], output?: Partial): ExtractionPipeline { +function makePipeline(contentTypes: string[], output?: Partial): ExtractionPipeline { return { contentTypes, - async extract(_input: ExtractionInput): Promise { + async extract(_input: ExtractionInput): Promise { return { mdIntermediate: output?.mdIntermediate ?? '# Test', - triples: output?.triples ?? [], - provenance: output?.provenance ?? 
[], }; }, }; @@ -73,12 +71,10 @@ describe('ExtractionPipelineRegistry', () => { }); }); -describe('ExtractionPipeline interface', () => { - it('extract returns mdIntermediate, triples, and provenance', async () => { +describe('ExtractionPipeline interface (Phase 1 converter)', () => { + it('extract returns ConverterOutput with mdIntermediate only', async () => { const pipeline = makePipeline(['text/markdown'], { mdIntermediate: '# Hello\n\nWorld', - triples: [{ subject: 'urn:test:1', predicate: 'rdf:type', object: 'schema:Thing' }], - provenance: [{ subject: 'urn:prov:1', predicate: 'dkg:extractedBy', object: 'did:dkg:agent:0x123' }], }); const result = await pipeline.extract({ @@ -88,9 +84,9 @@ describe('ExtractionPipeline interface', () => { }); expect(result.mdIntermediate).toBe('# Hello\n\nWorld'); - expect(result.triples).toHaveLength(1); - expect(result.triples[0].subject).toBe('urn:test:1'); - expect(result.provenance).toHaveLength(1); + // Converter output must not carry triples/provenance — those come from Phase 2. + expect((result as { triples?: unknown }).triples).toBeUndefined(); + expect((result as { provenance?: unknown }).provenance).toBeUndefined(); }); it('extract passes through ontologyRef when provided', async () => { @@ -99,7 +95,7 @@ describe('ExtractionPipeline interface', () => { contentTypes: ['application/pdf'], async extract(input) { capturedInput = input; - return { mdIntermediate: '', triples: [], provenance: [] }; + return { mdIntermediate: '' }; }, }; From d5b3755db1d8a2c5a4441a0d4203d3e22126fac9 Mon Sep 17 00:00:00 2001 From: code-engineer Date: Fri, 10 Apr 2026 18:05:58 +0200 Subject: [PATCH 02/12] feat(cli): file store + multipart parser for import-file wiring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Infrastructure commit for Phase 3b document ingestion. 
Adds two building blocks the import-file route handler will consume in the next commit: - packages/cli/src/file-store.ts: content-addressed disk store for uploaded files and markdown intermediates. sha256-keyed with a two-level sharded directory layout (ab/cdef...). put/get/has APIs return `sha256:` prefixed hashes which the route handler surfaces as fileHash and mdIntermediateHash in ImportFileResponse. Idempotent: re-putting the same bytes yields the same hash and overwrites with identical content. - packages/cli/src/http/multipart.ts: minimal RFC-7578 multipart/ form-data parser. Handles the exact subset the import-file endpoint needs: one file part with filename + content-type headers, plus any number of text parts. No nested multipart, no base64 transfer-encoding, no streaming (parses a buffered Buffer). Zero new npm dependencies. Throws MultipartParseError on malformed input so the caller can return a clean 400. Tests: - packages/cli/test/file-store.test.ts: 12 unit tests covering put/ get/has/hashToPath, idempotency, binary content, empty input, malformed-hash handling, bare-hex vs sha256:-prefixed forms. - packages/cli/test/multipart.test.ts: 19 unit tests covering parseBoundary (standard, quoted, case-insensitive, missing), and parseMultipart (text fields, file fields, mixed bodies, binary content with 0x00/0xff bytes, malformed input error paths). All 31/31 tests pass. CLI build clean. No route handler changes yet — the next commit wires POST /api/assertion/:name/import-file to use these primitives. Part of OriginTrail/dkgv10-spec#77 and #80. 
--- packages/cli/src/file-store.ts | 103 ++++++++++++++++ packages/cli/src/http/multipart.ts | 150 ++++++++++++++++++++++++ packages/cli/test/file-store.test.ts | 143 +++++++++++++++++++++++ packages/cli/test/multipart.test.ts | 169 +++++++++++++++++++++++++++ 4 files changed, 565 insertions(+) create mode 100644 packages/cli/src/file-store.ts create mode 100644 packages/cli/src/http/multipart.ts create mode 100644 packages/cli/test/file-store.test.ts create mode 100644 packages/cli/test/multipart.test.ts diff --git a/packages/cli/src/file-store.ts b/packages/cli/src/file-store.ts new file mode 100644 index 000000000..be577ead1 --- /dev/null +++ b/packages/cli/src/file-store.ts @@ -0,0 +1,103 @@ +/** + * Content-addressed file store for uploaded files. + * + * Files are stored on disk keyed by their sha256 hash. Two-level sharded + * directory layout (`ab/cdef...`) keeps any single directory at a reasonable + * size even after many uploads. + * + * Used by the import-file route handler to persist originals and Markdown + * intermediates produced by converters. File identity is the content hash + * returned by `put()`, which callers surface as `fileHash` and + * `mdIntermediateHash` in the import-file response. + * + * Spec: 05_PROTOCOL_EXTENSIONS.md §6.5 + */ + +import { createHash } from 'node:crypto'; +import { mkdir, readFile, stat, writeFile } from 'node:fs/promises'; +import { existsSync } from 'node:fs'; +import { join, resolve } from 'node:path'; + +export interface FileStoreEntry { + /** sha256 hash of the file contents, formatted as `sha256:`. */ + hash: string; + /** Absolute path to the stored file on disk. */ + path: string; + /** Size of the file in bytes. */ + size: number; + /** MIME content type recorded at put() time. 
*/ + contentType: string; +} + +export class FileStore { + private readonly rootDir: string; + + constructor(rootDir: string) { + this.rootDir = resolve(rootDir); + } + + /** + * Persist `bytes` to the store and return the resulting entry. Idempotent: + * re-putting the same bytes returns the same hash and overwrites the + * existing file with identical content. The `contentType` metadata is + * attached to the return value but not persisted to disk — callers that + * need durable content-type metadata should store it separately (e.g. in + * an `_meta` triple keyed by hash). + */ + async put(bytes: Buffer, contentType: string): Promise { + const hex = createHash('sha256').update(bytes).digest('hex'); + const hash = `sha256:${hex}`; + const path = this.resolvePath(hex); + await mkdir(join(this.rootDir, hex.slice(0, 2)), { recursive: true }); + await writeFile(path, bytes); + return { hash, path, size: bytes.length, contentType }; + } + + /** Retrieve the raw bytes for a previously-stored hash, or null if absent. */ + async get(hash: string): Promise { + const path = this.hashToPath(hash); + if (!path) return null; + if (!existsSync(path)) return null; + return readFile(path); + } + + /** Check whether a hash is present in the store. */ + async has(hash: string): Promise { + const path = this.hashToPath(hash); + if (!path) return false; + try { + await stat(path); + return true; + } catch { + return false; + } + } + + /** Resolve a hash to its on-disk path, or null for malformed hashes. */ + hashToPath(hash: string): string | null { + const hex = normalizeHash(hash); + if (!hex) return null; + return this.resolvePath(hex); + } + + /** Root directory the store writes into. */ + get directory(): string { + return this.rootDir; + } + + private resolvePath(hex: string): string { + return join(this.rootDir, hex.slice(0, 2), hex.slice(2)); + } +} + +/** + * Normalize a hash string to its 64-char hex form. 
Accepts either the + * prefixed (`sha256:abcd...`) or bare (`abcd...`) variants. Returns null for + * anything that isn't a valid sha256 hex. + */ +function normalizeHash(hash: string): string | null { + if (typeof hash !== 'string') return null; + const hex = hash.startsWith('sha256:') ? hash.slice('sha256:'.length) : hash; + if (!/^[0-9a-f]{64}$/i.test(hex)) return null; + return hex.toLowerCase(); +} diff --git a/packages/cli/src/http/multipart.ts b/packages/cli/src/http/multipart.ts new file mode 100644 index 000000000..f9af534ad --- /dev/null +++ b/packages/cli/src/http/multipart.ts @@ -0,0 +1,150 @@ +/** + * Minimal `multipart/form-data` parser (RFC 7578 / RFC 2046). + * + * Handles the subset needed by the import-file upload endpoint: + * - A single file part with `Content-Disposition: form-data; name="file"; filename="..."` + * and an optional `Content-Type` header. The part body is captured as raw bytes. + * - Zero or more text parts with `Content-Disposition: form-data; name="..."` and a + * utf-8 string body. + * + * Deliberate non-features (out of scope for V10.0): + * - Nested multipart bodies (`multipart/mixed` inside a part) + * - `Content-Transfer-Encoding: base64` / `quoted-printable` (browsers don't send these) + * - Streaming — we parse a fully-buffered `Buffer`, which is the shape daemon.ts + * already has from `readBody` + * - Charset negotiation on text parts — everything non-file is treated as utf-8 + * + * Throws `MultipartParseError` on malformed input so the route handler can + * return a clean 400 to the caller. + */ + +export class MultipartParseError extends Error { + constructor(message: string) { + super(message); + this.name = 'MultipartParseError'; + } +} + +export interface MultipartField { + /** `name` attribute from the `Content-Disposition` header. */ + name: string; + /** `filename` attribute, if the part is a file upload. Undefined for text parts. 
*/ + filename?: string; + /** `Content-Type` header of the part, or undefined if not provided. */ + contentType?: string; + /** Raw part body as bytes. For text parts, caller can decode via `.toString('utf-8')`. */ + content: Buffer; +} + +/** + * Extract the boundary token from a `Content-Type: multipart/form-data; boundary=...` header. + * Returns null if the header is missing, malformed, or not multipart/form-data. + */ +export function parseBoundary(contentTypeHeader: string | undefined): string | null { + if (!contentTypeHeader) return null; + const lower = contentTypeHeader.toLowerCase(); + if (!lower.startsWith('multipart/form-data')) return null; + const match = contentTypeHeader.match(/boundary\s*=\s*(?:"([^"]+)"|([^\s;]+))/i); + if (!match) return null; + return match[1] ?? match[2] ?? null; +} + +/** + * Parse a fully-buffered `multipart/form-data` body into its constituent fields. + * `boundary` is the boundary token (without the leading `--`). + */ +export function parseMultipart(body: Buffer, boundary: string): MultipartField[] { + if (!boundary || boundary.length === 0) { + throw new MultipartParseError('Empty boundary'); + } + const delimiter = Buffer.from(`--${boundary}`); + const crlf = Buffer.from('\r\n'); + const doubleCrlf = Buffer.from('\r\n\r\n'); + + // Find first delimiter. Spec allows CRLF or just the delimiter at the start. 
+ let cursor = body.indexOf(delimiter); + if (cursor < 0) { + throw new MultipartParseError('Missing opening boundary'); + } + + const fields: MultipartField[] = []; + const maxIterations = 1000; + let iterations = 0; + + while (cursor < body.length) { + if (++iterations > maxIterations) { + throw new MultipartParseError('Too many parts (>1000)'); + } + // Move past the boundary delimiter + cursor += delimiter.length; + // Check for closing `--` (final boundary) + if (cursor + 2 <= body.length && body[cursor] === 0x2d && body[cursor + 1] === 0x2d) { + return fields; + } + // Skip trailing CRLF after delimiter + if (cursor + 2 <= body.length && body[cursor] === 0x0d && body[cursor + 1] === 0x0a) { + cursor += 2; + } else { + throw new MultipartParseError('Malformed boundary: expected CRLF after delimiter'); + } + // Find end-of-headers (\r\n\r\n) + const headerEnd = body.indexOf(doubleCrlf, cursor); + if (headerEnd < 0) { + throw new MultipartParseError('Malformed part: no header terminator'); + } + const headerBytes = body.subarray(cursor, headerEnd); + const headers = parseHeaders(headerBytes); + const contentStart = headerEnd + doubleCrlf.length; + + // Find next boundary — part body runs from contentStart to (next delimiter - CRLF) + const nextDelimiter = body.indexOf(delimiter, contentStart); + if (nextDelimiter < 0) { + throw new MultipartParseError('Malformed part: no closing boundary'); + } + // Strip the CRLF that precedes the next delimiter (part body ends at the CRLF). 
+ let contentEnd = nextDelimiter; + if (contentEnd >= 2 && body[contentEnd - 2] === 0x0d && body[contentEnd - 1] === 0x0a) { + contentEnd -= 2; + } + const content = body.subarray(contentStart, contentEnd); + + const disposition = headers.get('content-disposition'); + if (!disposition) { + throw new MultipartParseError('Malformed part: missing Content-Disposition'); + } + const nameMatch = disposition.match(/name\s*=\s*(?:"([^"]*)"|([^;]+))/i); + if (!nameMatch) { + throw new MultipartParseError('Malformed part: Content-Disposition without name'); + } + const filenameMatch = disposition.match(/filename\s*=\s*(?:"([^"]*)"|([^;]+))/i); + fields.push({ + name: (nameMatch[1] ?? nameMatch[2] ?? '').trim(), + filename: filenameMatch ? (filenameMatch[1] ?? filenameMatch[2] ?? '').trim() : undefined, + contentType: headers.get('content-type'), + content: Buffer.from(content), + }); + + cursor = nextDelimiter; + } + + throw new MultipartParseError('Unexpected end of body'); +} + +/** + * Parse a raw header block (CRLF-delimited) into a lower-cased key → value map. + * Multi-line folded headers are not supported (RFC 7578 §5.3 says field names + * in multipart/form-data must use the simpler RFC 2183 header format). 
+ */ +function parseHeaders(block: Buffer): Map { + const headers = new Map(); + const text = block.toString('utf-8'); + for (const line of text.split(/\r?\n/)) { + if (line.length === 0) continue; + const colonIdx = line.indexOf(':'); + if (colonIdx < 0) continue; + const name = line.slice(0, colonIdx).trim().toLowerCase(); + const value = line.slice(colonIdx + 1).trim(); + headers.set(name, value); + } + return headers; +} diff --git a/packages/cli/test/file-store.test.ts b/packages/cli/test/file-store.test.ts new file mode 100644 index 000000000..4a9c58bc4 --- /dev/null +++ b/packages/cli/test/file-store.test.ts @@ -0,0 +1,143 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtemp, rm, readFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { createHash } from 'node:crypto'; +import { FileStore } from '../src/file-store.js'; + +let rootDir: string; + +beforeEach(async () => { + rootDir = await mkdtemp(join(tmpdir(), 'dkg-filestore-test-')); +}); + +afterEach(async () => { + await rm(rootDir, { recursive: true, force: true }); +}); + +describe('FileStore.put', () => { + it('stores bytes and returns a sha256 hash with the sha256: prefix', async () => { + const store = new FileStore(rootDir); + const bytes = Buffer.from('hello world', 'utf-8'); + const expectedHex = createHash('sha256').update(bytes).digest('hex'); + + const entry = await store.put(bytes, 'text/plain'); + + expect(entry.hash).toBe(`sha256:${expectedHex}`); + expect(entry.size).toBe(11); + expect(entry.contentType).toBe('text/plain'); + }); + + it('writes content to a two-level sharded path (ab/cdef...)', async () => { + const store = new FileStore(rootDir); + const bytes = Buffer.from('some content', 'utf-8'); + const expectedHex = createHash('sha256').update(bytes).digest('hex'); + + const entry = await store.put(bytes, 'text/plain'); + + const expectedPath = join(rootDir, expectedHex.slice(0, 2), 
expectedHex.slice(2)); + expect(entry.path).toBe(expectedPath); + const onDisk = await readFile(expectedPath); + expect(onDisk.equals(bytes)).toBe(true); + }); + + it('is idempotent — putting the same bytes twice yields the same hash', async () => { + const store = new FileStore(rootDir); + const bytes = Buffer.from('idempotent', 'utf-8'); + + const first = await store.put(bytes, 'text/plain'); + const second = await store.put(bytes, 'application/octet-stream'); + + expect(first.hash).toBe(second.hash); + expect(first.path).toBe(second.path); + // contentType on the returned entry reflects the caller, not persisted metadata + expect(first.contentType).toBe('text/plain'); + expect(second.contentType).toBe('application/octet-stream'); + }); + + it('handles empty input', async () => { + const store = new FileStore(rootDir); + const entry = await store.put(Buffer.alloc(0), 'application/octet-stream'); + expect(entry.size).toBe(0); + // sha256 of empty string is well-known + expect(entry.hash).toBe('sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'); + }); + + it('handles binary content with arbitrary bytes', async () => { + const store = new FileStore(rootDir); + const bytes = Buffer.from([0x00, 0xff, 0x7f, 0x80, 0x0a, 0x0d]); + const entry = await store.put(bytes, 'application/octet-stream'); + const onDisk = await readFile(entry.path); + expect(onDisk.equals(bytes)).toBe(true); + }); +}); + +describe('FileStore.get', () => { + it('returns the bytes for a stored hash', async () => { + const store = new FileStore(rootDir); + const bytes = Buffer.from('retrievable', 'utf-8'); + const { hash } = await store.put(bytes, 'text/plain'); + + const retrieved = await store.get(hash); + expect(retrieved).not.toBeNull(); + expect(retrieved!.equals(bytes)).toBe(true); + }); + + it('returns null for a hash that was never stored', async () => { + const store = new FileStore(rootDir); + const bogusHex = 'a'.repeat(64); + const retrieved = await 
store.get(`sha256:${bogusHex}`); + expect(retrieved).toBeNull(); + }); + + it('accepts bare hex or sha256:-prefixed hashes', async () => { + const store = new FileStore(rootDir); + const bytes = Buffer.from('both forms', 'utf-8'); + const { hash } = await store.put(bytes, 'text/plain'); + const bareHex = hash.slice('sha256:'.length); + + const viaPrefixed = await store.get(hash); + const viaBare = await store.get(bareHex); + + expect(viaPrefixed).not.toBeNull(); + expect(viaBare).not.toBeNull(); + expect(viaPrefixed!.equals(viaBare!)).toBe(true); + }); + + it('returns null for malformed hash strings', async () => { + const store = new FileStore(rootDir); + expect(await store.get('not-a-hash')).toBeNull(); + expect(await store.get('sha256:tooshort')).toBeNull(); + expect(await store.get('sha256:' + 'z'.repeat(64))).toBeNull(); // non-hex chars + expect(await store.get('')).toBeNull(); + }); +}); + +describe('FileStore.has', () => { + it('returns true for stored hashes and false otherwise', async () => { + const store = new FileStore(rootDir); + const bytes = Buffer.from('presence check', 'utf-8'); + const { hash } = await store.put(bytes, 'text/plain'); + + expect(await store.has(hash)).toBe(true); + expect(await store.has('sha256:' + 'b'.repeat(64))).toBe(false); + expect(await store.has('bad-hash')).toBe(false); + }); +}); + +describe('FileStore.hashToPath', () => { + it('resolves a hash to an absolute sharded path without touching disk', () => { + const store = new FileStore(rootDir); + const hex = '1234567890abcdef'.repeat(4); + expect(hex.length).toBe(64); + + const path = store.hashToPath(`sha256:${hex}`); + expect(path).toBe(join(rootDir, hex.slice(0, 2), hex.slice(2))); + }); + + it('returns null for malformed hashes', () => { + const store = new FileStore(rootDir); + expect(store.hashToPath('not-a-hash')).toBeNull(); + expect(store.hashToPath('sha256:short')).toBeNull(); + }); +}); diff --git a/packages/cli/test/multipart.test.ts 
b/packages/cli/test/multipart.test.ts
new file mode 100644
index 000000000..ba3a47e96
--- /dev/null
+++ b/packages/cli/test/multipart.test.ts
@@ -0,0 +1,169 @@
+import { describe, it, expect } from 'vitest';
+import { parseBoundary, parseMultipart, MultipartParseError } from '../src/http/multipart.js';
+
+const BOUNDARY = '----dkgtestboundary';
+const CRLF = '\r\n';
+
+function buildBody(...parts: Buffer[]): Buffer {
+  const segments: Buffer[] = [];
+  for (const part of parts) {
+    segments.push(Buffer.from(`--${BOUNDARY}${CRLF}`));
+    segments.push(part);
+    segments.push(Buffer.from(CRLF));
+  }
+  segments.push(Buffer.from(`--${BOUNDARY}--${CRLF}`));
+  return Buffer.concat(segments);
+}
+
+function textPart(name: string, value: string): Buffer {
+  return Buffer.from(
+    `Content-Disposition: form-data; name="${name}"${CRLF}${CRLF}${value}`,
+  );
+}
+
+function filePart(name: string, filename: string, contentType: string, content: Buffer): Buffer {
+  const header = Buffer.from(
+    `Content-Disposition: form-data; name="${name}"; filename="${filename}"${CRLF}` +
+    `Content-Type: ${contentType}${CRLF}${CRLF}`,
+  );
+  return Buffer.concat([header, content]);
+}
+
+describe('parseBoundary', () => {
+  it('extracts boundary from a standard header', () => {
+    expect(parseBoundary('multipart/form-data; boundary=abc123')).toBe('abc123');
+  });
+
+  it('extracts quoted boundaries', () => {
+    expect(parseBoundary('multipart/form-data; boundary="abc 123"')).toBe('abc 123');
+  });
+
+  it('is case-insensitive on the media type', () => {
+    expect(parseBoundary('Multipart/Form-Data; boundary=xyz')).toBe('xyz');
+  });
+
+  it('handles boundaries with dashes and punctuation', () => {
+    expect(parseBoundary('multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW')).toBe('----WebKitFormBoundary7MA4YWxkTrZu0gW');
+  });
+
+  it('returns null for missing header', () => {
+    expect(parseBoundary(undefined)).toBeNull();
+  });
+
+  it('returns null for non-multipart content type', () => 
{ + expect(parseBoundary('application/json')).toBeNull(); + }); + + it('returns null when boundary parameter is missing', () => { + expect(parseBoundary('multipart/form-data')).toBeNull(); + }); +}); + +describe('parseMultipart — text fields', () => { + it('extracts a single text field', () => { + const body = buildBody(textPart('greeting', 'hello')); + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(1); + expect(fields[0].name).toBe('greeting'); + expect(fields[0].filename).toBeUndefined(); + expect(fields[0].contentType).toBeUndefined(); + expect(fields[0].content.toString('utf-8')).toBe('hello'); + }); + + it('extracts multiple text fields in order', () => { + const body = buildBody( + textPart('first', 'one'), + textPart('second', 'two'), + textPart('third', 'three'), + ); + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(3); + expect(fields.map(f => f.name)).toEqual(['first', 'second', 'third']); + expect(fields.map(f => f.content.toString('utf-8'))).toEqual(['one', 'two', 'three']); + }); + + it('handles empty text field values', () => { + const body = buildBody(textPart('empty', '')); + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(1); + expect(fields[0].content.length).toBe(0); + }); + + it('preserves CRLF-free text values', () => { + const body = buildBody(textPart('iri', 'did:dkg:context-graph:my-cg')); + const fields = parseMultipart(body, BOUNDARY); + expect(fields[0].content.toString('utf-8')).toBe('did:dkg:context-graph:my-cg'); + }); +}); + +describe('parseMultipart — file fields', () => { + it('extracts a file part with filename and content-type', () => { + const fileContent = Buffer.from('# Markdown Document\n\nBody text.\n', 'utf-8'); + const body = buildBody(filePart('file', 'doc.md', 'text/markdown', fileContent)); + + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(1); + expect(fields[0].name).toBe('file'); + 
expect(fields[0].filename).toBe('doc.md'); + expect(fields[0].contentType).toBe('text/markdown'); + expect(fields[0].content.equals(fileContent)).toBe(true); + }); + + it('extracts binary file content without corruption', () => { + const binary = Buffer.from([0x00, 0xff, 0x7f, 0x80, 0x0a, 0x0d, 0x2d, 0x2d]); + const body = buildBody(filePart('file', 'binary.bin', 'application/octet-stream', binary)); + + const fields = parseMultipart(body, BOUNDARY); + expect(fields[0].content.equals(binary)).toBe(true); + }); + + it('extracts mixed text and file parts in a single body', () => { + const fileContent = Buffer.from('file body', 'utf-8'); + const body = buildBody( + textPart('contextGraphId', 'my-cg'), + filePart('file', 'doc.pdf', 'application/pdf', fileContent), + textPart('ontologyRef', 'did:dkg:context-graph:my-cg/_ontology'), + ); + + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(3); + expect(fields[0].name).toBe('contextGraphId'); + expect(fields[0].content.toString('utf-8')).toBe('my-cg'); + expect(fields[1].name).toBe('file'); + expect(fields[1].filename).toBe('doc.pdf'); + expect(fields[1].contentType).toBe('application/pdf'); + expect(fields[1].content.equals(fileContent)).toBe(true); + expect(fields[2].name).toBe('ontologyRef'); + expect(fields[2].content.toString('utf-8')).toBe('did:dkg:context-graph:my-cg/_ontology'); + }); +}); + +describe('parseMultipart — error handling', () => { + it('throws on empty boundary', () => { + expect(() => parseMultipart(Buffer.alloc(0), '')).toThrow(MultipartParseError); + }); + + it('throws when no opening boundary is present', () => { + expect(() => parseMultipart(Buffer.from('random bytes'), BOUNDARY)).toThrow(/Missing opening boundary/); + }); + + it('throws on missing Content-Disposition header', () => { + const badPart = Buffer.from(`Content-Type: text/plain${CRLF}${CRLF}orphaned`); + const body = buildBody(badPart); + expect(() => parseMultipart(body, BOUNDARY)).toThrow(/missing 
Content-Disposition/); + }); + + it('throws on missing header terminator', () => { + const delim = `--${BOUNDARY}${CRLF}`; + const body = Buffer.concat([ + Buffer.from(delim), + Buffer.from(`Content-Disposition: form-data; name="x"`), // no CRLF CRLF + ]); + expect(() => parseMultipart(body, BOUNDARY)).toThrow(MultipartParseError); + }); + + it('throws when a part has no closing boundary', () => { + const body = Buffer.from(`--${BOUNDARY}${CRLF}Content-Disposition: form-data; name="x"${CRLF}${CRLF}orphaned`); + expect(() => parseMultipart(body, BOUNDARY)).toThrow(MultipartParseError); + }); +}); From add808ba541c55162cb4354bd6466d2ea69242bd Mon Sep 17 00:00:00 2001 From: code-engineer Date: Fri, 10 Apr 2026 18:11:53 +0200 Subject: [PATCH 03/12] feat(cli): wire POST /api/assertion/:name/import-file + extraction-status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the import-file document ingestion endpoint and its companion extraction-status polling endpoint on the daemon. Wires Phase 1 (converter) → Phase 2 (markdown structural extractor) → write triples to the assertion graph, matching the orchestration described in 05_PROTOCOL_EXTENSIONS.md §6.5. New endpoints: - POST /api/assertion/:name/import-file (multipart/form-data) Fields: file (required) — the uploaded document bytes contextGraphId (required) — target context graph contentType (optional) — override the file part's Content-Type ontologyRef (optional) — CG _ontology URI for Phase 2 guided extraction subGraphName (optional) — target sub-graph inside the CG Orchestration: 1. Parse multipart body, store original file in FileStore → fileHash 2. Resolve detectedContentType (explicit field > multipart Content-Type) 3. 
Phase 1: - text/markdown → skip converter, use raw bytes as mdIntermediate - registered converter → run converter.extract(), store MD result in FileStore → mdIntermediateHash - no registered converter → graceful degrade: return status="skipped", no triples written, file blob retained for later manual extraction 4. Phase 2 → extractFromMarkdown({ markdown, agentDid, ontologyRef, documentIri: assertionUri }) → triples + provenance 5. Ensure assertion graph exists (idempotent), write triples + provenance via agent.assertion.write 6. Record in in-memory ExtractionStatusRecord map, return ImportFileResponse Error paths return typed extraction.status = "failed" with the error message. Sub-graph registration errors propagate from assertionCreate/Write (finding 4 of issue #81). - GET /api/assertion/:name/extraction-status?contextGraphId=...&subGraphName=... Returns the current extraction job state for an assertion by looking up the in-memory record. Synchronous extractions populate this on the import-file response; this endpoint lets agents re-query without holding the original response and provides the hook for async extraction workflows in V10.x. Supporting changes: - packages/cli/src/daemon.ts: - Import contextGraphAssertionUri, extractFromMarkdown, FileStore, parseBoundary, parseMultipart, MultipartParseError - New constant MAX_UPLOAD_BYTES = 50 MB for document uploads - New interface ExtractionStatusRecord - New readBodyBuffer() helper — Buffer variant of readBody for binary multipart payloads - Instantiate FileStore at {dataDir}/files and extraction-status Map at daemon start; thread both into handleRequest via two new parameters - Log message for missing MarkItDown updated to clarify markdown uploads still work - packages/cli/test/skill-endpoint.test.ts: - Regex tolerance for CRLF line endings in the YAML frontmatter check (/^---\r?\n/ instead of /^---\n/). Pre-existing test was Windows-hostile because Git's core.autocrlf normalizes LF → CRLF on checkout. 
Linux CI was fine; Windows was failing. Tolerant regex fixes both. Tests: - All existing cli tests pass unchanged: multipart 19/19, file-store 12/12, extraction-markdown 27/27, extraction-markitdown 8/8, document-processor-e2e 13/13 (4 expected skips), skill-endpoint 11/11, extraction-pipeline 7/7. - Integration tests for the new route handlers land in the next commit. CLI build clean (TypeScript). Part of OriginTrail/dkgv10-spec#77, #79 gap 3, and #80. --- packages/cli/src/daemon.ts | 371 ++++++++++++++++++++++- packages/cli/test/skill-endpoint.test.ts | 4 +- 2 files changed, 370 insertions(+), 5 deletions(-) diff --git a/packages/cli/src/daemon.ts b/packages/cli/src/daemon.ts index 709cb1a18..ee380c8ac 100644 --- a/packages/cli/src/daemon.ts +++ b/packages/cli/src/daemon.ts @@ -12,7 +12,7 @@ import { fileURLToPath } from 'node:url'; import { stat } from 'node:fs/promises'; import { ethers } from 'ethers'; import { DKGAgent, loadOpWallets } from '@origintrail-official/dkg-agent'; -import { computeNetworkId, createOperationContext, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, contextGraphSharedMemoryUri } from '@origintrail-official/dkg-core'; +import { computeNetworkId, createOperationContext, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, contextGraphSharedMemoryUri, contextGraphAssertionUri } from '@origintrail-official/dkg-core'; import { DashboardDB, MetricsCollector, @@ -54,7 +54,9 @@ import { import { startPublisherRuntimeIfEnabled, type PublisherRuntime } from './publisher-runner.js'; import { loadTokens, httpAuthGuard, extractBearerToken } from './auth.js'; import { ExtractionPipelineRegistry } from '@origintrail-official/dkg-core'; -import { MarkItDownConverter, isMarkItDownAvailable } from './extraction/index.js'; +import { MarkItDownConverter, isMarkItDownAvailable, extractFromMarkdown 
} from './extraction/index.js'; +import { FileStore } from './file-store.js'; +import { parseBoundary, parseMultipart, MultipartParseError } from './http/multipart.js'; import { handleCapture, EpcisValidationError, handleEventsQuery, EpcisQueryError, type Publisher as EpcisPublisher } from '@origintrail-official/dkg-epcis'; import { readFileSync } from 'node:fs'; @@ -812,9 +814,18 @@ async function runDaemonInner(foreground: boolean, config: Awaited(); + // --- HTTP API --- const rateLimiter = new HttpRateLimiter( @@ -923,6 +934,8 @@ async function runDaemonInner(foreground: boolean, config: Awaited, ): Promise { const url = new URL(req.url ?? '/', `http://${req.headers.host}`); const path = url.pathname; @@ -2196,6 +2211,309 @@ async function handleRequest( } } + // POST /api/assertion/:name/import-file (multipart/form-data) + // file (required): the uploaded document bytes + // contextGraphId (required): target context graph + // contentType (optional): override the file part's Content-Type + // ontologyRef (optional): CG _ontology URI for guided Phase 2 extraction + // subGraphName (optional): target sub-graph inside the CG + // + // Orchestration: + // 1. Parse multipart, store original file in file store → fileHash + // 2. Resolve detectedContentType (explicit field > multipart content-type) + // 3. If content type is text/markdown: skip Phase 1, use raw bytes as mdIntermediate + // Else if a converter is registered: run Phase 1, store mdIntermediate → mdIntermediateHash + // Else: graceful degrade — return extraction.status="skipped", no triples written + // 4. Run Phase 2 markdown extractor on the mdIntermediate → triples + provenance + // 5. Write triples + provenance to the assertion graph via agent.assertion.write + // 6. 
Record the extraction status in the in-memory Map, return ImportFileResponse + if (req.method === 'POST' && path.startsWith('/api/assertion/') && path.endsWith('/import-file')) { + const assertionName = safeDecodeURIComponent(path.slice('/api/assertion/'.length, -'/import-file'.length), res); + if (assertionName === null) return; + const nameVal = validateAssertionName(assertionName); + if (!nameVal.valid) return jsonResponse(res, 400, { error: `Invalid assertion name: ${nameVal.reason}` }); + + const boundary = parseBoundary(req.headers['content-type']); + if (!boundary) { + return jsonResponse(res, 400, { error: 'Request must be multipart/form-data with a boundary' }); + } + + let body: Buffer; + try { + body = await readBodyBuffer(req, MAX_UPLOAD_BYTES); + } catch (err: any) { + if (err instanceof PayloadTooLargeError) throw err; + return jsonResponse(res, 400, { error: `Failed to read request body: ${err.message}` }); + } + + let fields; + try { + fields = parseMultipart(body, boundary); + } catch (err: any) { + if (err instanceof MultipartParseError) { + return jsonResponse(res, 400, { error: `Malformed multipart body: ${err.message}` }); + } + throw err; + } + + const filePart = fields.find(f => f.name === 'file' && f.filename !== undefined); + if (!filePart) { + return jsonResponse(res, 400, { error: 'Missing required "file" field in multipart body' }); + } + const textField = (name: string): string | undefined => { + const f = fields.find(x => x.name === name && x.filename === undefined); + return f ? f.content.toString('utf-8') : undefined; + }; + const contextGraphId = textField('contextGraphId'); + const contentTypeOverride = textField('contentType'); + const ontologyRef = textField('ontologyRef'); + const subGraphName = textField('subGraphName'); + + if (!validateRequiredContextGraphId(contextGraphId, res)) return; + if (!validateOptionalSubGraphName(subGraphName, res)) return; + + const detectedContentType = contentTypeOverride ?? 
filePart.contentType ?? 'application/octet-stream'; + + // Persist the original upload to the file store. + let fileStoreEntry; + try { + fileStoreEntry = await fileStore.put(filePart.content, detectedContentType); + } catch (err: any) { + return jsonResponse(res, 500, { error: `Failed to store uploaded file: ${err.message}` }); + } + + const assertionUri = contextGraphAssertionUri( + contextGraphId!, + agent.peerId, + assertionName, + subGraphName, + ); + const startedAt = new Date().toISOString(); + + // ── Phase 1: converter lookup + MD intermediate resolution ── + // text/markdown is deliberately NOT a registered converter content type. + // The raw uploaded bytes ARE the Markdown intermediate, so Phase 1 is skipped. + // For any other content type, look up a converter; if none is registered, + // gracefully degrade (store the file, skip extraction, return status=skipped). + let mdIntermediate: string | null = null; + let pipelineUsed: string | null = null; + let mdIntermediateHash: string | undefined; + + if (detectedContentType === 'text/markdown') { + mdIntermediate = filePart.content.toString('utf-8'); + pipelineUsed = 'text/markdown'; + } else { + const converter = extractionRegistry.get(detectedContentType); + if (converter) { + try { + const { mdIntermediate: md } = await converter.extract({ + filePath: fileStoreEntry.path, + contentType: detectedContentType, + ontologyRef, + agentDid: `did:dkg:agent:${agent.peerId}`, + }); + mdIntermediate = md; + pipelineUsed = detectedContentType; + const mdEntry = await fileStore.put(Buffer.from(md, 'utf-8'), 'text/markdown'); + mdIntermediateHash = mdEntry.hash; + } catch (err: any) { + // Phase 1 failure: record in status map, return error response + const failedRecord: ExtractionStatusRecord = { + status: 'failed', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed: detectedContentType, + tripleCount: 0, + error: `Phase 1 converter failed: ${err.message}`, + startedAt, + completedAt: new 
Date().toISOString(), + }; + extractionStatus.set(assertionUri, failedRecord); + return jsonResponse(res, 500, { + assertionUri, + fileHash: fileStoreEntry.hash, + detectedContentType, + extraction: { + status: 'failed' as const, + tripleCount: 0, + pipelineUsed: detectedContentType, + error: `Phase 1 converter failed: ${err.message}`, + }, + }); + } + } + } + + // ── Graceful degrade: no converter registered and not text/markdown ── + // Store the file blob, return status=skipped, no triples written. + if (mdIntermediate === null) { + const skippedRecord: ExtractionStatusRecord = { + status: 'skipped', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed: null, + tripleCount: 0, + startedAt, + completedAt: new Date().toISOString(), + }; + extractionStatus.set(assertionUri, skippedRecord); + return jsonResponse(res, 200, { + assertionUri, + fileHash: fileStoreEntry.hash, + detectedContentType, + extraction: { + status: 'skipped' as const, + tripleCount: 0, + pipelineUsed: null, + }, + }); + } + + // ── Phase 2: markdown → triples + provenance ── + let triples; + let provenance; + try { + const result = extractFromMarkdown({ + markdown: mdIntermediate, + agentDid: `did:dkg:agent:${agent.peerId}`, + ontologyRef, + documentIri: assertionUri, + }); + triples = result.triples; + provenance = result.provenance; + } catch (err: any) { + const failedRecord: ExtractionStatusRecord = { + status: 'failed', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed, + tripleCount: 0, + mdIntermediateHash, + error: `Phase 2 extraction failed: ${err.message}`, + startedAt, + completedAt: new Date().toISOString(), + }; + extractionStatus.set(assertionUri, failedRecord); + return jsonResponse(res, 500, { + assertionUri, + fileHash: fileStoreEntry.hash, + detectedContentType, + extraction: { + status: 'failed' as const, + tripleCount: 0, + pipelineUsed, + mdIntermediateHash, + error: `Phase 2 extraction failed: ${err.message}`, + }, + }); + } + + // ── 
Write triples + provenance to the assertion graph ── + // The sub-graph registration check in assertionCreate/Write (finding 4 of #81) + // will throw if subGraphName is provided but unregistered — that's intentional. + const allTriples = [...triples, ...provenance]; + if (allTriples.length > 0) { + try { + // Ensure the assertion graph exists (idempotent — re-running import-file on + // the same assertion name simply adds new triples to the existing graph). + try { + await agent.assertion.create( + contextGraphId!, + assertionName, + subGraphName ? { subGraphName } : undefined, + ); + } catch (err: any) { + // create() on an existing graph is idempotent in oxigraph, but if the + // error is about the sub-graph not being registered, propagate it. + if (err.message?.includes('has not been registered')) { + return jsonResponse(res, 400, { error: err.message }); + } + // Other errors from create() can be ignored if the graph already exists. + } + await agent.assertion.write( + contextGraphId!, + assertionName, + allTriples.map(t => ({ subject: t.subject, predicate: t.predicate, object: t.object })), + subGraphName ? 
{ subGraphName } : undefined, + ); + } catch (err: any) { + if (err.message?.includes('has not been registered')) { + return jsonResponse(res, 400, { error: err.message }); + } + if (err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { + return jsonResponse(res, 400, { error: err.message }); + } + throw err; + } + } + + const completedRecord: ExtractionStatusRecord = { + status: 'completed', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed, + tripleCount: triples.length, + mdIntermediateHash, + startedAt, + completedAt: new Date().toISOString(), + }; + extractionStatus.set(assertionUri, completedRecord); + + return jsonResponse(res, 200, { + assertionUri, + fileHash: fileStoreEntry.hash, + detectedContentType, + extraction: { + status: 'completed' as const, + tripleCount: triples.length, + pipelineUsed, + ...(mdIntermediateHash ? { mdIntermediateHash } : {}), + }, + }); + } + + // GET /api/assertion/:name/extraction-status?contextGraphId=...&subGraphName=... + // Returns the current extraction job state for the given assertion. + // Synchronous extractions (V10.0 default) return status="completed" immediately + // on the import-file response; this endpoint lets agents re-query the status + // later without having to hold the import-file response, and provides the hook + // for async extraction workflows in V10.x. + if (req.method === 'GET' && path.startsWith('/api/assertion/') && path.endsWith('/extraction-status')) { + const assertionName = safeDecodeURIComponent(path.slice('/api/assertion/'.length, -'/extraction-status'.length), res); + if (assertionName === null) return; + const nameVal = validateAssertionName(assertionName); + if (!nameVal.valid) return jsonResponse(res, 400, { error: `Invalid assertion name: ${nameVal.reason}` }); + const contextGraphId = url.searchParams.get('contextGraphId') ?? 
url.searchParams.get('paranetId'); + if (!validateRequiredContextGraphId(contextGraphId, res)) return; + const subGraphName = url.searchParams.get('subGraphName') ?? undefined; + if (!validateOptionalSubGraphName(subGraphName, res)) return; + + const assertionUri = contextGraphAssertionUri( + contextGraphId!, + agent.peerId, + assertionName, + subGraphName, + ); + const record = extractionStatus.get(assertionUri); + if (!record) { + return jsonResponse(res, 404, { + error: `No extraction record found for assertion "${assertionName}" in context graph "${contextGraphId}"`, + }); + } + return jsonResponse(res, 200, { + assertionUri, + status: record.status, + fileHash: record.fileHash, + detectedContentType: record.detectedContentType, + pipelineUsed: record.pipelineUsed, + tripleCount: record.tripleCount, + ...(record.mdIntermediateHash ? { mdIntermediateHash: record.mdIntermediateHash } : {}), + ...(record.error ? { error: record.error } : {}), + startedAt: record.startedAt, + ...(record.completedAt ? { completedAt: record.completedAt } : {}), + }); + } + // POST /api/shared-memory/conditional-write { contextGraphId, quads, conditions, subGraphName? } if (req.method === 'POST' && path === '/api/shared-memory/conditional-write') { const body = await readBody(req); @@ -2952,6 +3270,25 @@ function validateConditions(conditions: unknown, res: ServerResponse): boolean { const MAX_BODY_BYTES = 10 * 1024 * 1024; // 10 MB — default for data-heavy endpoints (publish, update) const SMALL_BODY_BYTES = 256 * 1024; // 256 KB — for settings, connect, chat, and other small payloads +const MAX_UPLOAD_BYTES = 50 * 1024 * 1024; // 50 MB — for import-file document uploads (PDFs, DOCX, etc.) + +/** + * In-memory extraction job tracking record. Populated at import-file time + * and queried by the extraction-status endpoint. Keyed by the target + * assertion URI (which is unique per agent × contextGraph × assertionName + * × subGraphName). 
+ */ +interface ExtractionStatusRecord { + status: 'in_progress' | 'completed' | 'skipped' | 'failed'; + fileHash: string; + detectedContentType: string; + pipelineUsed: string | null; + tripleCount: number; + mdIntermediateHash?: string; + error?: string; + startedAt: string; + completedAt?: string; +} function readBody(req: IncomingMessage, maxBytes = MAX_BODY_BYTES): Promise { @@ -2978,6 +3315,34 @@ function readBody(req: IncomingMessage, maxBytes = MAX_BODY_BYTES): Promise { + return new Promise((resolve, reject) => { + const chunks: Buffer[] = []; + let total = 0; + let rejected = false; + const onData = (c: Buffer) => { + if (rejected) return; + total += c.length; + if (total > maxBytes) { + rejected = true; + req.removeListener('data', onData); + req.resume(); + setTimeout(() => req.destroy(), 5_000); + reject(new PayloadTooLargeError(maxBytes)); + return; + } + chunks.push(c); + }; + req.on('data', onData); + req.on('end', () => { if (!rejected) resolve(Buffer.concat(chunks)); }); + req.on('error', (err) => { if (!rejected) reject(err); }); + }); +} + // ─── CORS / rate-limit / validation helpers ─────────────────────────── type CorsAllowlist = '*' | string[]; diff --git a/packages/cli/test/skill-endpoint.test.ts b/packages/cli/test/skill-endpoint.test.ts index 9833aa331..893642ae1 100644 --- a/packages/cli/test/skill-endpoint.test.ts +++ b/packages/cli/test/skill-endpoint.test.ts @@ -54,10 +54,10 @@ describe('SKILL.md file', () => { }); it('starts with Agent Skills YAML frontmatter', () => { - expect(skillContent).toMatch(/^---\n/); + expect(skillContent).toMatch(/^---\r?\n/); expect(skillContent).toContain('name: dkg-node'); expect(skillContent).toContain('description:'); - expect(skillContent).toMatch(/---\n\n/); + expect(skillContent).toMatch(/---\r?\n\r?\n/); }); it('contains the required DKG V10 sections', () => { From d9f3221144df8a0e6b32a44d30d901c6e063562e Mon Sep 17 00:00:00 2001 From: code-engineer Date: Fri, 10 Apr 2026 18:17:02 +0200 Subject: 
[PATCH 04/12] docs(cli): SKILL.md import-file workflow + integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes Phase 3b by documenting the shipped assertion API surface in SKILL.md and adding integration tests for the import-file orchestration. SKILL.md updates: - §5 Memory Model "Working Memory (WM)" section: removed the "🚧 Planned" marker on the assertion API (create/write/query/promote/ discard ship as of PR #108; import-file and extraction-status ship in this PR). Listed the full shipped API surface with body shapes, added the import-file and extraction-status endpoints, and noted the sub-graph registration check from issue #81 finding 4 so agents know to createSubGraph() before targeting one. - §7 File Ingestion: replaced the "🚧 Planned" section with complete documentation of the shipped POST /api/assertion/{name}/import-file endpoint: - Two-phase pipeline overview (Phase 1 converter, Phase 2 structural extractor) with explicit text/markdown skip-Phase-1 note - Request table listing all form fields (file, contextGraphId, contentType, ontologyRef, subGraphName) - End-to-end curl example - Response shape with all fields populated - Extraction status semantics (completed / skipped / failed) - GET /api/assertion/{name}/extraction-status usage for polling Integration tests (packages/cli/test/import-file-integration.test.ts): NEW 12-test suite that exercises the full Phase 1 → Phase 2 → assertion.write orchestration without requiring a full DKGAgent (which needs libp2p + chain). Uses real FileStore (temp dir), real ExtractionPipelineRegistry, real extractFromMarkdown, real parseMultipart, and a mock agent that captures assertion.create/write calls for verification. This drives the exact call sequence the daemon route handler does, so it covers the orchestration end-to-end. 
Happy paths (5 tests): - text/markdown upload skips Phase 1, runs Phase 2, writes triples covering every extractor feature (rdf:type, schema:name from frontmatter title, schema:mentions from wikilink, schema:keywords from hashtag, Dataview status field, dkg:hasSection headings) - text/markdown detection from filePart Content-Type header when no explicit contentType field is provided - contentType text field overrides the file part Content-Type - Registered PDF converter runs Phase 1, stores MD intermediate via FileStore with a separate mdIntermediateHash distinct from fileHash, runs Phase 2 on the converter's output - ontologyRef threaded through to the converter - subGraphName threaded through to assertion.create and assertion.write Graceful degrade (2 tests): - Unregistered content type (image/png): file stored with correct magic bytes preserved, status="skipped", pipelineUsed=null, no triples written, no assertion.create/write called - File part with no Content-Type header defaults to application/octet- stream and also degrades gracefully Extraction-status semantics (2 tests): - startedAt and completedAt timestamps populated on success - Multiple imports to different assertions get separate status records keyed by assertionUri Boundary parsing (2 tests, via parseBoundary wrapper): - Extracts boundary from daemon-style header - Rejects non-multipart requests skill-endpoint.test.ts updates: - Replaced the stale "marks planned endpoints clearly" test (which asserted /api/assertion/create was planned — no longer true) with two tests: one that confirms the *(planned)* marker still exists (for context graph sub-resources and agent profile), and a new test "documents the now-shipped assertion API surface" that verifies all 7 shipped assertion routes (create/write/query/promote/discard/ import-file/extraction-status) appear in SKILL.md. 
Test results: - multipart: 19/19 pass - file-store: 12/12 pass - extraction-markdown: 27/27 pass - extraction-markitdown: 8/8 pass - skill-endpoint: 12/12 pass (was 11; +1 new assertion-API-surface test) - import-file-integration: 12/12 pass (NEW) - document-processor-e2e: 13/13 pass (4 expected skips, markitdown-unavailable) - Total: 99/99 pass + 4 expected skips - Full cli build clean. Closes OriginTrail/dkgv10-spec#77 (import-file wiring), OriginTrail/dkgv10-spec#79 gap 3 (extraction-status endpoint), OriginTrail/dkgv10-spec#80 (ExtractionPipeline interface split — via the ff8afe3 prep commit). --- packages/cli/skills/dkg-node/SKILL.md | 107 ++- .../cli/test/import-file-integration.test.ts | 646 ++++++++++++++++++ packages/cli/test/skill-endpoint.test.ts | 14 +- 3 files changed, 749 insertions(+), 18 deletions(-) create mode 100644 packages/cli/test/import-file-integration.test.ts diff --git a/packages/cli/skills/dkg-node/SKILL.md b/packages/cli/skills/dkg-node/SKILL.md index c8542c769..10a87061b 100644 --- a/packages/cli/skills/dkg-node/SKILL.md +++ b/packages/cli/skills/dkg-node/SKILL.md @@ -121,17 +121,29 @@ The token is configured in the node's config file or provided at startup. - **Note:** `subGraphName` is supported for legacy routing only and cannot be combined with `view` - `POST /api/query-remote` — query a remote peer via P2P -### Working Memory (WM) — Private assertions (🚧 Planned) +### Working Memory (WM) — Private assertions -> The following WM assertion endpoints are planned for a future release: +WM assertions are your agent-local drafts — private to you, readable and +writable only by your peer ID, never gossiped. Use them to stage knowledge +before sharing it to SWM (team) or promoting it to VM (chain-anchored). 
- `POST /api/assertion/create` — create a named private assertion -- `PUT /api/assertion/{name}` — write triples to an assertion -- `POST /api/assertion/{name}/import` — import N-Triples/Turtle/JSON-LD -- `POST /api/assertion/{name}/import-file` — import PDF/DOCX/Markdown (multipart) -- `GET /api/assertion/{name}` — read assertion contents -- `DELETE /api/assertion/{name}` — delete assertion -- `POST /api/assertion/{name}/promote` — promote assertion to SWM + Body: `{ "contextGraphId": "...", "name": "...", "subGraphName"?: "..." }` +- `POST /api/assertion/{name}/write` — write triples to an assertion + Body: `{ "contextGraphId": "...", "quads": [...], "subGraphName"?: "..." }` +- `POST /api/assertion/{name}/query` — read assertion contents as quads + Body: `{ "contextGraphId": "...", "subGraphName"?: "..." }` +- `POST /api/assertion/{name}/promote` — promote assertion triples to SWM + Body: `{ "contextGraphId": "...", "entities"?: [...] | "all", "subGraphName"?: "..." }` +- `POST /api/assertion/{name}/discard` — drop the assertion graph + Body: `{ "contextGraphId": "...", "subGraphName"?: "..." }` +- `POST /api/assertion/{name}/import-file` — import a document (multipart/form-data) — see §7 +- `GET /api/assertion/{name}/extraction-status?contextGraphId=...` — poll the status of an import-file extraction job + +> If `subGraphName` is provided but the sub-graph is not registered in the CG's +> `_meta` graph, all assertion operations throw +> `Sub-graph "{name}" has not been registered in context graph "{id}". Call createSubGraph() first.` +> Create the sub-graph before targeting it. ## 6. Context Graphs @@ -145,22 +157,83 @@ Context Graphs are scoped knowledge domains with configurable access and governa - 🚧 `POST /api/context-graph/{id}/ontology` — add ontology *(planned)* - 🚧 `GET /api/context-graph/{id}/ontology` — list ontologies *(planned)* -## 7. File Ingestion (🚧 Planned) +## 7. 
File Ingestion -> File ingestion via `import-file` depends on the Working Memory assertion API (§5) -> and will be available when those endpoints ship. The extraction pipeline -> infrastructure (MarkItDown converter) is already in place on the node. +Upload a document (PDF, DOCX, HTML, CSV, Markdown, etc.) and let the node +extract RDF triples into a WM assertion. The node runs a deterministic +two-phase pipeline: -Supported formats depend on available extraction pipelines (see Node Info §1). -When available, usage will be: +1. **Phase 1 (optional converter):** non-Markdown formats go through a + registered converter (e.g. MarkItDown for PDF/DOCX/HTML) which produces + a Markdown intermediate. `text/markdown` uploads skip Phase 1 — the raw + file IS the intermediate. +2. **Phase 2 (structural extractor):** the Markdown intermediate is parsed + for YAML frontmatter, wikilinks (`[[Target]]`), hashtags (`#keyword`), + Dataview inline fields (`key:: value`), and heading structure. No LLM — + deterministic, node-side, no external calls. + +The extracted triples are written to the target assertion graph via the +same path as `POST /api/assertion/{name}/write`. Agents can then query, +promote, or publish them like any other assertion content. + +**Supported formats:** see Node Info §1 for the list of registered +extraction pipelines on your specific node. `text/markdown` is always +supported (no converter needed). 
+ +### Request + +`POST /api/assertion/{name}/import-file` with `Content-Type: multipart/form-data`: + +| Field | Required | Description | +|-----------------|----------|-----------------------------------------------------------------------------| +| `file` | yes | The document bytes | +| `contextGraphId`| yes | Target context graph | +| `contentType` | no | Override the file part's Content-Type header | +| `ontologyRef` | no | CG `_ontology` URI for guided Phase 2 extraction | +| `subGraphName` | no | Target sub-graph inside the CG (must be registered via `createSubGraph`) | + +### Example ```bash -curl -X POST $BASE_URL/api/assertion/my-assertion/import-file \ +curl -X POST $BASE_URL/api/assertion/climate-report/import-file \ -H "Authorization: Bearer $TOKEN" \ - -F "file=@paper.pdf" \ - -F "contextGraph=my-context-graph" + -F "file=@climate-2026.md;type=text/markdown" \ + -F "contextGraphId=research" +``` + +### Response + +```json +{ + "assertionUri": "did:dkg:context-graph:research/assertion/0xAgentAddr/climate-report", + "fileHash": "sha256:a1b2c3...", + "detectedContentType": "text/markdown", + "extraction": { + "status": "completed", + "tripleCount": 14, + "pipelineUsed": "text/markdown", + "mdIntermediateHash": "sha256:a1b2c3..." + } +} ``` +### Extraction statuses + +- `completed` — Phase 1 (if needed) and Phase 2 both ran; triples were written to the assertion graph +- `skipped` — no converter is registered for the file's content type; the file is stored in the file store but no triples were written. Agents can still reference the file via its `fileHash` +- `failed` — one of the phases threw an error; check the `error` field in the response. The file is still stored; no triples written. + +For synchronous extractions (the V10.0 default) the response carries the +final status immediately. 
To re-query later without holding the original +response, use: + +```bash +curl $BASE_URL/api/assertion/climate-report/extraction-status?contextGraphId=research \ + -H "Authorization: Bearer $TOKEN" +``` + +Returns the same `{ status, fileHash, pipelineUsed, tripleCount, ... }` shape from the in-memory extraction status tracker, or 404 if no import-file has been run for that assertion. + ## 8. Node Administration - `GET /api/status` (PUBLIC) — node status, peer ID, version, connections diff --git a/packages/cli/test/import-file-integration.test.ts b/packages/cli/test/import-file-integration.test.ts new file mode 100644 index 000000000..d3a773101 --- /dev/null +++ b/packages/cli/test/import-file-integration.test.ts @@ -0,0 +1,646 @@ +/** + * Integration tests for the POST /api/assertion/:name/import-file orchestration. + * + * These tests exercise the full Phase 1 → Phase 2 → assertion.write pipeline + * without spinning up a full DKGAgent (which needs libp2p + chain). Instead + * we drive the exact sequence of operations the route handler does: + * + * 1. parseMultipart(body, boundary) + * 2. fileStore.put(filePart.content, detectedContentType) + * 3. branch on detectedContentType: + * - text/markdown → raw bytes as mdIntermediate + * - registered converter → converter.extract(...) + * - neither → graceful degrade, status="skipped" + * 4. extractFromMarkdown({ markdown, agentDid, ontologyRef, documentIri }) + * 5. mockAgent.assertion.write(contextGraphId, name, triples) + * 6. record in extractionStatus Map + * + * The mock agent captures the assertion.write call arguments for verification. + * The real FileStore (on a temp dir), real extractionRegistry, real + * extractFromMarkdown, real parseMultipart are all used. + * + * This covers the same behaviors the daemon route handler implements, minus the + * HTTP parsing/validation shell (which is tested indirectly via the multipart + * unit tests plus the bits the daemon compiles against). 
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtemp, rm, readFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { + ExtractionPipelineRegistry, + type ExtractionPipeline, + type ExtractionInput, + type ConverterOutput, + contextGraphAssertionUri, +} from '@origintrail-official/dkg-core'; +import { FileStore } from '../src/file-store.js'; +import { parseBoundary, parseMultipart } from '../src/http/multipart.js'; +import { extractFromMarkdown } from '../src/extraction/markdown-extractor.js'; + +// ── Test fixture types (mirroring the ExtractionStatusRecord in daemon.ts) ── + +interface ExtractionStatusRecord { + status: 'in_progress' | 'completed' | 'skipped' | 'failed'; + fileHash: string; + detectedContentType: string; + pipelineUsed: string | null; + tripleCount: number; + mdIntermediateHash?: string; + error?: string; + startedAt: string; + completedAt?: string; +} + +interface CapturedAssertionWrite { + contextGraphId: string; + name: string; + triples: Array<{ subject: string; predicate: string; object: string }>; + subGraphName?: string; +} + +interface MockAgent { + peerId: string; + assertion: { + create: ( + contextGraphId: string, + name: string, + opts?: { subGraphName?: string }, + ) => Promise; + write: ( + contextGraphId: string, + name: string, + triples: Array<{ subject: string; predicate: string; object: string }>, + opts?: { subGraphName?: string }, + ) => Promise; + }; + capturedWrites: CapturedAssertionWrite[]; + createdAssertions: Array<{ contextGraphId: string; name: string; subGraphName?: string }>; +} + +function makeMockAgent(peerId = '0xMockAgentPeerId'): MockAgent { + const capturedWrites: CapturedAssertionWrite[] = []; + const createdAssertions: Array<{ contextGraphId: string; name: string; subGraphName?: string }> = []; + return { + peerId, + capturedWrites, + createdAssertions, + assertion: { + async create(contextGraphId: string, 
name: string, opts?: { subGraphName?: string }): Promise { + createdAssertions.push({ contextGraphId, name, subGraphName: opts?.subGraphName }); + return contextGraphAssertionUri(contextGraphId, peerId, name, opts?.subGraphName); + }, + async write( + contextGraphId: string, + name: string, + triples: Array<{ subject: string; predicate: string; object: string }>, + opts?: { subGraphName?: string }, + ): Promise { + capturedWrites.push({ contextGraphId, name, triples, subGraphName: opts?.subGraphName }); + }, + }, + }; +} + +// ── The orchestration under test (matches daemon.ts import-file handler) ── + +interface ImportFileResult { + assertionUri: string; + fileHash: string; + detectedContentType: string; + extraction: { + status: 'completed' | 'skipped' | 'failed'; + tripleCount: number; + pipelineUsed: string | null; + mdIntermediateHash?: string; + error?: string; + }; +} + +async function runImportFileOrchestration(params: { + agent: MockAgent; + fileStore: FileStore; + extractionRegistry: ExtractionPipelineRegistry; + extractionStatus: Map; + multipartBody: Buffer; + boundary: string; + assertionName: string; +}): Promise { + const { agent, fileStore, extractionRegistry, extractionStatus, multipartBody, boundary, assertionName } = params; + + const fields = parseMultipart(multipartBody, boundary); + const filePart = fields.find(f => f.name === 'file' && f.filename !== undefined)!; + const textField = (name: string): string | undefined => { + const f = fields.find(x => x.name === name && x.filename === undefined); + return f ? f.content.toString('utf-8') : undefined; + }; + const contextGraphId = textField('contextGraphId')!; + const contentTypeOverride = textField('contentType'); + const ontologyRef = textField('ontologyRef'); + const subGraphName = textField('subGraphName'); + const detectedContentType = contentTypeOverride ?? filePart.contentType ?? 
'application/octet-stream'; + + const fileStoreEntry = await fileStore.put(filePart.content, detectedContentType); + const assertionUri = contextGraphAssertionUri(contextGraphId, agent.peerId, assertionName, subGraphName); + const startedAt = new Date().toISOString(); + + let mdIntermediate: string | null = null; + let pipelineUsed: string | null = null; + let mdIntermediateHash: string | undefined; + + if (detectedContentType === 'text/markdown') { + mdIntermediate = filePart.content.toString('utf-8'); + pipelineUsed = 'text/markdown'; + } else { + const converter = extractionRegistry.get(detectedContentType); + if (converter) { + const { mdIntermediate: md } = await converter.extract({ + filePath: fileStoreEntry.path, + contentType: detectedContentType, + ontologyRef, + agentDid: `did:dkg:agent:${agent.peerId}`, + }); + mdIntermediate = md; + pipelineUsed = detectedContentType; + const mdEntry = await fileStore.put(Buffer.from(md, 'utf-8'), 'text/markdown'); + mdIntermediateHash = mdEntry.hash; + } + } + + // Graceful degrade + if (mdIntermediate === null) { + const skippedRecord: ExtractionStatusRecord = { + status: 'skipped', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed: null, + tripleCount: 0, + startedAt, + completedAt: new Date().toISOString(), + }; + extractionStatus.set(assertionUri, skippedRecord); + return { + assertionUri, + fileHash: fileStoreEntry.hash, + detectedContentType, + extraction: { status: 'skipped', tripleCount: 0, pipelineUsed: null }, + }; + } + + // Phase 2 + const { triples, provenance } = extractFromMarkdown({ + markdown: mdIntermediate, + agentDid: `did:dkg:agent:${agent.peerId}`, + ontologyRef, + documentIri: assertionUri, + }); + + const allTriples = [...triples, ...provenance]; + if (allTriples.length > 0) { + await agent.assertion.create(contextGraphId, assertionName, subGraphName ? 
{ subGraphName } : undefined); + await agent.assertion.write( + contextGraphId, + assertionName, + allTriples.map(t => ({ subject: t.subject, predicate: t.predicate, object: t.object })), + subGraphName ? { subGraphName } : undefined, + ); + } + + const completedRecord: ExtractionStatusRecord = { + status: 'completed', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed, + tripleCount: triples.length, + mdIntermediateHash, + startedAt, + completedAt: new Date().toISOString(), + }; + extractionStatus.set(assertionUri, completedRecord); + + return { + assertionUri, + fileHash: fileStoreEntry.hash, + detectedContentType, + extraction: { + status: 'completed', + tripleCount: triples.length, + pipelineUsed, + ...(mdIntermediateHash ? { mdIntermediateHash } : {}), + }, + }; +} + +// ── Multipart body builder for tests ── + +const BOUNDARY = '----dkgimporttest'; +const CRLF = '\r\n'; + +function buildMultipart(parts: Array< + | { kind: 'text'; name: string; value: string } + | { kind: 'file'; name: string; filename: string; contentType: string; content: Buffer } +>): Buffer { + const segments: Buffer[] = []; + for (const p of parts) { + segments.push(Buffer.from(`--${BOUNDARY}${CRLF}`)); + if (p.kind === 'text') { + segments.push(Buffer.from(`Content-Disposition: form-data; name="${p.name}"${CRLF}${CRLF}${p.value}`)); + } else { + segments.push(Buffer.from( + `Content-Disposition: form-data; name="${p.name}"; filename="${p.filename}"${CRLF}` + + `Content-Type: ${p.contentType}${CRLF}${CRLF}`, + )); + segments.push(p.content); + } + segments.push(Buffer.from(CRLF)); + } + segments.push(Buffer.from(`--${BOUNDARY}--${CRLF}`)); + return Buffer.concat(segments); +} + +// ── Tests ── + +describe('import-file orchestration — happy paths', () => { + let tmpDir: string; + let fileStore: FileStore; + let registry: ExtractionPipelineRegistry; + let status: Map; + let agent: MockAgent; + + beforeEach(async () => { + tmpDir = await mkdtemp(join(tmpdir(), 
'dkg-importfile-test-')); + fileStore = new FileStore(join(tmpDir, 'files')); + registry = new ExtractionPipelineRegistry(); + status = new Map(); + agent = makeMockAgent(); + }); + + afterEach(async () => { + await rm(tmpDir, { recursive: true, force: true }); + }); + + it('text/markdown upload — skips Phase 1, runs Phase 2, writes triples to assertion', async () => { + const markdown = [ + '---', + 'id: research-note', + 'type: ScholarlyArticle', + 'title: Climate Report 2026', + 'description: A short climate analysis', + '---', + '', + '# Climate Report 2026', + '', + 'Global temperature rose by 1.2°C. See [[Paris Agreement]] and #climate topics.', + '', + '## Background', + '', + 'status:: draft', + '', + '## Methods', + '', + 'Sampled historical records.', + '', + ].join('\n'); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'research-cg' }, + { kind: 'file', name: 'file', filename: 'climate.md', contentType: 'text/markdown', content: Buffer.from(markdown, 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'climate-report', + }); + + // Response shape + expect(result.extraction.status).toBe('completed'); + expect(result.extraction.pipelineUsed).toBe('text/markdown'); + expect(result.extraction.tripleCount).toBeGreaterThan(0); + expect(result.fileHash).toMatch(/^sha256:[0-9a-f]{64}$/); + expect(result.detectedContentType).toBe('text/markdown'); + expect(result.extraction.mdIntermediateHash).toBeUndefined(); // no Phase 1, no MD intermediate stored separately + expect(result.assertionUri).toBe(contextGraphAssertionUri('research-cg', agent.peerId, 'climate-report')); + + // Assertion write happened + expect(agent.createdAssertions).toHaveLength(1); + expect(agent.createdAssertions[0]).toEqual({ contextGraphId: 'research-cg', name: 'climate-report', subGraphName: undefined }); 
+ expect(agent.capturedWrites).toHaveLength(1); + expect(agent.capturedWrites[0].contextGraphId).toBe('research-cg'); + expect(agent.capturedWrites[0].name).toBe('climate-report'); + + // Triples reflect the markdown structure + const writtenTriples = agent.capturedWrites[0].triples; + // rdf:type ScholarlyArticle + expect(writtenTriples.some(t => + t.predicate === 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' && + t.object === 'http://schema.org/ScholarlyArticle', + )).toBe(true); + // schema:name from frontmatter title + expect(writtenTriples.some(t => + t.predicate === 'http://schema.org/name' && + t.object === '"Climate Report 2026"', + )).toBe(true); + // wikilink mention + expect(writtenTriples.some(t => + t.predicate === 'http://schema.org/mentions' && + t.object === 'urn:dkg:md:paris-agreement', + )).toBe(true); + // hashtag as keyword + expect(writtenTriples.some(t => + t.predicate === 'http://schema.org/keywords' && + t.object === '"climate"', + )).toBe(true); + // dataview field + expect(writtenTriples.some(t => + t.predicate === 'http://schema.org/status' && + t.object === '"draft"', + )).toBe(true); + // section headings + expect(writtenTriples.some(t => + t.predicate === 'http://dkg.io/ontology/hasSection', + )).toBe(true); + + // Status map populated + expect(status.size).toBe(1); + const record = status.get(result.assertionUri)!; + expect(record.status).toBe('completed'); + expect(record.fileHash).toBe(result.fileHash); + expect(record.pipelineUsed).toBe('text/markdown'); + expect(record.tripleCount).toBe(result.extraction.tripleCount); + }); + + it('text/markdown upload uses filePart content type when contentType field is not provided', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, 
fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'doc', + }); + + expect(result.extraction.status).toBe('completed'); + expect(result.extraction.pipelineUsed).toBe('text/markdown'); + expect(result.detectedContentType).toBe('text/markdown'); + }); + + it('contentType text field overrides the file part Content-Type header', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'text', name: 'contentType', value: 'text/markdown' }, + // File reports application/octet-stream, but the override tells the handler to treat it as markdown + { kind: 'file', name: 'file', filename: 'doc.bin', contentType: 'application/octet-stream', content: Buffer.from('# Hello\n\nWorld.\n', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'override-test', + }); + + expect(result.detectedContentType).toBe('text/markdown'); + expect(result.extraction.status).toBe('completed'); + expect(result.extraction.pipelineUsed).toBe('text/markdown'); + }); + + it('registered converter path — runs Phase 1, stores MD intermediate, runs Phase 2', async () => { + // Register a stub converter for application/pdf that converts "fake-pdf" bytes to real markdown + const stubConverter: ExtractionPipeline = { + contentTypes: ['application/pdf'], + async extract(_input: ExtractionInput): Promise { + return { + mdIntermediate: [ + '---', + 'id: stub-doc', + 'type: Report', + '---', + '', + '# Stub Document', + '', + 'Body with #tag1 and [[Reference]].', + '', + ].join('\n'), + }; + }, + }; + registry.register(stubConverter); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'research' }, + { kind: 'file', name: 'file', filename: 'paper.pdf', contentType: 'application/pdf', content: 
Buffer.from('fake-pdf-bytes', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'paper', + }); + + expect(result.extraction.status).toBe('completed'); + expect(result.extraction.pipelineUsed).toBe('application/pdf'); + expect(result.extraction.mdIntermediateHash).toBeDefined(); + expect(result.extraction.mdIntermediateHash).toMatch(/^sha256:[0-9a-f]{64}$/); + expect(result.extraction.mdIntermediateHash).not.toBe(result.fileHash); // stored separately + + // MD intermediate is retrievable from the file store + const mdBytes = await fileStore.get(result.extraction.mdIntermediateHash!); + expect(mdBytes).not.toBeNull(); + expect(mdBytes!.toString('utf-8')).toContain('# Stub Document'); + + // Triples reflect the Phase 2 extraction of the stub's MD intermediate + const triples = agent.capturedWrites[0].triples; + expect(triples.some(t => t.object === 'http://schema.org/Report')).toBe(true); + expect(triples.some(t => t.object === '"tag1"')).toBe(true); + expect(triples.some(t => t.object === 'urn:dkg:md:reference')).toBe(true); + }); + + it('passes ontologyRef through to the converter and Phase 2 extractor', async () => { + let capturedOntologyRef: string | undefined; + const stubConverter: ExtractionPipeline = { + contentTypes: ['application/pdf'], + async extract(input: ExtractionInput): Promise { + capturedOntologyRef = input.ontologyRef; + return { mdIntermediate: '# Doc\n\nBody.\n' }; + }, + }; + registry.register(stubConverter); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'research' }, + { kind: 'text', name: 'ontologyRef', value: 'did:dkg:context-graph:research/_ontology' }, + { kind: 'file', name: 'file', filename: 'paper.pdf', contentType: 'application/pdf', content: Buffer.from('pdf', 'utf-8') }, + ]); + + await runImportFileOrchestration({ + agent, fileStore, 
extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'paper', + }); + + expect(capturedOntologyRef).toBe('did:dkg:context-graph:research/_ontology'); + }); + + it('passes subGraphName through to assertion.create and assertion.write', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'text', name: 'subGraphName', value: 'decisions' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'decision-1', + }); + + expect(agent.createdAssertions[0]).toEqual({ contextGraphId: 'cg', name: 'decision-1', subGraphName: 'decisions' }); + expect(agent.capturedWrites[0].subGraphName).toBe('decisions'); + }); +}); + +describe('import-file orchestration — graceful degrade', () => { + let tmpDir: string; + let fileStore: FileStore; + let registry: ExtractionPipelineRegistry; + let status: Map; + let agent: MockAgent; + + beforeEach(async () => { + tmpDir = await mkdtemp(join(tmpdir(), 'dkg-importfile-test-')); + fileStore = new FileStore(join(tmpDir, 'files')); + registry = new ExtractionPipelineRegistry(); + status = new Map(); + agent = makeMockAgent(); + }); + + afterEach(async () => { + await rm(tmpDir, { recursive: true, force: true }); + }); + + it('unregistered content type — stores file, returns status="skipped", writes no triples', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'photo.png', contentType: 'image/png', content: Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]) }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, 
extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'photo', + }); + + expect(result.extraction.status).toBe('skipped'); + expect(result.extraction.tripleCount).toBe(0); + expect(result.extraction.pipelineUsed).toBeNull(); + expect(result.extraction.mdIntermediateHash).toBeUndefined(); + expect(result.detectedContentType).toBe('image/png'); + + // File is still stored (retrievable via fileHash) + const retrieved = await fileStore.get(result.fileHash); + expect(retrieved).not.toBeNull(); + expect(retrieved![0]).toBe(0x89); // PNG magic byte preserved + + // No triples written to the assertion + expect(agent.createdAssertions).toHaveLength(0); + expect(agent.capturedWrites).toHaveLength(0); + + // Status record reflects the skip + const record = status.get(result.assertionUri)!; + expect(record.status).toBe('skipped'); + expect(record.pipelineUsed).toBeNull(); + expect(record.tripleCount).toBe(0); + }); + + it('unregistered content type with no content-type header — defaults to application/octet-stream and skips', async () => { + // File part without a Content-Type header — daemon defaults to application/octet-stream + const fileContent = Buffer.from('opaque', 'utf-8'); + const segments: Buffer[] = []; + segments.push(Buffer.from(`--${BOUNDARY}${CRLF}`)); + segments.push(Buffer.from(`Content-Disposition: form-data; name="contextGraphId"${CRLF}${CRLF}cg`)); + segments.push(Buffer.from(CRLF)); + segments.push(Buffer.from(`--${BOUNDARY}${CRLF}`)); + segments.push(Buffer.from(`Content-Disposition: form-data; name="file"; filename="opaque.bin"${CRLF}${CRLF}`)); + segments.push(fileContent); + segments.push(Buffer.from(CRLF)); + segments.push(Buffer.from(`--${BOUNDARY}--${CRLF}`)); + const body = Buffer.concat(segments); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'opaque-upload', + }); + + 
expect(result.detectedContentType).toBe('application/octet-stream'); + expect(result.extraction.status).toBe('skipped'); + expect(result.extraction.pipelineUsed).toBeNull(); + }); +}); + +describe('import-file orchestration — boundary parsing', () => { + it('parseBoundary extracts boundary from the daemon-style header', () => { + expect(parseBoundary(`multipart/form-data; boundary=${BOUNDARY}`)).toBe(BOUNDARY); + }); + + it('parseBoundary rejects non-multipart requests', () => { + expect(parseBoundary('application/json')).toBeNull(); + }); +}); + +describe('import-file orchestration — extraction-status semantics', () => { + let tmpDir: string; + let fileStore: FileStore; + let registry: ExtractionPipelineRegistry; + let status: Map; + let agent: MockAgent; + + beforeEach(async () => { + tmpDir = await mkdtemp(join(tmpdir(), 'dkg-importfile-test-')); + fileStore = new FileStore(join(tmpDir, 'files')); + registry = new ExtractionPipelineRegistry(); + status = new Map(); + agent = makeMockAgent(); + }); + + afterEach(async () => { + await rm(tmpDir, { recursive: true, force: true }); + }); + + it('populates the status record with startedAt/completedAt timestamps on success', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'doc', + }); + + const record = status.get(result.assertionUri)!; + expect(record.startedAt).toBeTruthy(); + expect(record.completedAt).toBeTruthy(); + expect(new Date(record.startedAt).getTime()).toBeLessThanOrEqual(new Date(record.completedAt!).getTime()); + }); + + it('keyed by assertionUri — separate imports to different assertions get separate records', async () => { 
+ const body1 = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'a.md', contentType: 'text/markdown', content: Buffer.from('# A\n\nBody a.\n', 'utf-8') }, + ]); + const body2 = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'b.md', contentType: 'text/markdown', content: Buffer.from('# B\n\nBody b.\n', 'utf-8') }, + ]); + + await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body1, boundary: BOUNDARY, assertionName: 'doc-a', + }); + await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body2, boundary: BOUNDARY, assertionName: 'doc-b', + }); + + expect(status.size).toBe(2); + const keys = [...status.keys()]; + expect(keys.some(k => k.endsWith('/doc-a'))).toBe(true); + expect(keys.some(k => k.endsWith('/doc-b'))).toBe(true); + }); +}); diff --git a/packages/cli/test/skill-endpoint.test.ts b/packages/cli/test/skill-endpoint.test.ts index 893642ae1..b9ae248fc 100644 --- a/packages/cli/test/skill-endpoint.test.ts +++ b/packages/cli/test/skill-endpoint.test.ts @@ -96,9 +96,21 @@ describe('SKILL.md file', () => { }); it('marks planned endpoints clearly', () => { - expect(skillContent).toContain('🚧 Planned'); + // The Planned/🚧 markers in the skill doc cover context graph sub-resources + // and future agent profile endpoints — NOT the assertion API, which ships + // as of PR #108 (create/write/query/promote/discard) and this PR (import-file, + // extraction-status). 
+ expect(skillContent).toContain('*(planned)*'); + }); + + it('documents the now-shipped assertion API surface', () => { expect(skillContent).toContain('/api/assertion/create'); + expect(skillContent).toContain('/api/assertion/{name}/write'); + expect(skillContent).toContain('/api/assertion/{name}/query'); + expect(skillContent).toContain('/api/assertion/{name}/promote'); + expect(skillContent).toContain('/api/assertion/{name}/discard'); expect(skillContent).toContain('/api/assertion/{name}/import-file'); + expect(skillContent).toContain('/api/assertion/{name}/extraction-status'); }); it('documents error status codes', () => { From 1cd9dae18c64cbb675136f6753d68adcc5d55295 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Fri, 10 Apr 2026 19:22:50 +0200 Subject: [PATCH 05/12] fix(cli): harden import-file extraction routing --- packages/cli/src/daemon.ts | 7 ++- .../cli/src/extraction/markdown-extractor.ts | 48 ++++++++++++++++--- packages/cli/test/extraction-markdown.test.ts | 47 ++++++++++++++++++ .../cli/test/import-file-integration.test.ts | 48 ++++++++++++++++++- packages/core/src/extraction-pipeline.ts | 12 +++-- .../core/test/extraction-pipeline.test.ts | 10 ++++ 6 files changed, 161 insertions(+), 11 deletions(-) diff --git a/packages/cli/src/daemon.ts b/packages/cli/src/daemon.ts index ee380c8ac..c596df298 100644 --- a/packages/cli/src/daemon.ts +++ b/packages/cli/src/daemon.ts @@ -145,6 +145,11 @@ export function parseRequiredSignatures(raw: unknown): { value: number } | { err return { value: raw }; } +function normalizeDetectedContentType(contentType: string | undefined): string { + const normalized = contentType?.split(';', 1)[0]?.trim().toLowerCase(); + return normalized && normalized.length > 0 ? 
normalized : 'application/octet-stream'; +} + const lastUpdateCheck = { upToDate: true, checkedAt: 0, latestCommit: '', latestVersion: '' }; let isUpdating = false; @@ -2272,7 +2277,7 @@ async function handleRequest( if (!validateRequiredContextGraphId(contextGraphId, res)) return; if (!validateOptionalSubGraphName(subGraphName, res)) return; - const detectedContentType = contentTypeOverride ?? filePart.contentType ?? 'application/octet-stream'; + const detectedContentType = normalizeDetectedContentType(contentTypeOverride ?? filePart.contentType); // Persist the original upload to the file store. let fileStoreEntry; diff --git a/packages/cli/src/extraction/markdown-extractor.ts b/packages/cli/src/extraction/markdown-extractor.ts index e83965e37..6c4a4497a 100644 --- a/packages/cli/src/extraction/markdown-extractor.ts +++ b/packages/cli/src/extraction/markdown-extractor.ts @@ -20,6 +20,7 @@ * Spec: 05_PROTOCOL_EXTENSIONS.md §6.5.2, 19_MARKDOWN_CONTENT_TYPE.md */ +import { createHash } from 'node:crypto'; import { load as loadYaml } from 'js-yaml'; import type { ExtractionQuad as Quad } from '@origintrail-official/dkg-core'; @@ -101,7 +102,7 @@ function splitFrontmatter(markdown: string): { frontmatter: Record 0) return slug; + return `hash-${shortHash(input)}`; +} + +function shortHash(input: string): string { + return createHash('sha256').update(input).digest('hex').slice(0, 12); +} + +function normalizeSchemaLocalName(raw: string, kind: 'property' | 'class'): string | null { + const stripped = raw.trim().replace(/\(([^)]*)\)/g, '$1'); + if (stripped.length === 0) return null; + + const asciiTokens = stripped.match(/[A-Za-z0-9]+/g); + if (asciiTokens && asciiTokens.length > 0) { + return asciiTokens + .map((token, index) => { + if (kind === 'property' && index === 0) { + return token[0]!.toLowerCase() + token.slice(1); + } + return token[0]!.toUpperCase() + token.slice(1); + }) + .join(''); + } + + const encoded = encodeURIComponent(stripped); + return 
encoded.length > 0 ? encoded : null; } /** @@ -149,7 +176,8 @@ function resolveTypeIri(typeValue: unknown): string | null { if (typeof typeValue !== 'string' || typeValue.length === 0) return null; if (/^(https?:|did:|urn:)/.test(typeValue)) return typeValue; // Treat bare identifiers as schema.org classes by convention (Report, Person, etc.) - return `http://schema.org/${typeValue}`; + const localName = normalizeSchemaLocalName(typeValue, 'class'); + return localName ? `http://schema.org/${localName}` : null; } /** Resolve a frontmatter scalar value to a triple object literal or IRI. */ @@ -159,6 +187,10 @@ function resolveFrontmatterValue(value: unknown): string | null { if (/^(https?:|did:|urn:)/.test(value)) return value; return JSON.stringify(value); } + if (value instanceof Date) { + if (Number.isNaN(value.getTime())) return null; + return JSON.stringify(value.toISOString()); + } if (typeof value === 'number' || typeof value === 'boolean') { return JSON.stringify(String(value)); } @@ -168,9 +200,10 @@ function resolveFrontmatterValue(value: unknown): string | null { /** Extract wikilinks `[[Target]]` or `[[Target|Alt]]` → IRIs using the `urn:dkg:md:` namespace. 
*/ function extractWikilinks(body: string): string[] { const out = new Set(); + const noFences = stripCodeFences(body); const re = /\[\[([^\]|#]+?)(?:#[^\]|]*)?(?:\|[^\]]*?)?\]\]/g; let m: RegExpExecArray | null; - while ((m = re.exec(body)) !== null) { + while ((m = re.exec(noFences)) !== null) { const target = m[1].trim(); if (target.length === 0) continue; out.add(`urn:dkg:md:${slugify(target)}`); @@ -256,6 +289,7 @@ export function extractFromMarkdown(input: MarkdownExtractInput): MarkdownExtrac const obj = resolveFrontmatterValue(v); if (obj === null) continue; const predicate = frontmatterKeyToPredicate(key); + if (predicate === null) continue; triples.push({ subject, predicate, object: obj }); } } @@ -280,6 +314,7 @@ export function extractFromMarkdown(input: MarkdownExtractInput): MarkdownExtrac // ── 4. Dataview inline fields → properties ───────────────────────── for (const { key, value } of extractDataviewFields(body)) { const predicate = frontmatterKeyToPredicate(key); + if (predicate === null) continue; const obj = /^(https?:|did:|urn:)/.test(value) ? value : JSON.stringify(value); triples.push({ subject, predicate, object: obj }); } @@ -303,12 +338,13 @@ export function extractFromMarkdown(input: MarkdownExtractInput): MarkdownExtrac return { triples, provenance, subjectIri: subject }; } -function frontmatterKeyToPredicate(key: string): string { +function frontmatterKeyToPredicate(key: string): string | null { if (key === 'name' || key === 'title') return SCHEMA_NAME; if (key === 'description' || key === 'summary') return SCHEMA_DESCRIPTION; if (key === 'keywords' || key === 'tags') return SCHEMA_KEYWORDS; // Unknown keys fall back into the schema.org namespace (same convention as `type`). - return `http://schema.org/${key}`; + const localName = normalizeSchemaLocalName(key, 'property'); + return localName ? 
`http://schema.org/${localName}` : null; } function buildProvenance(args: { diff --git a/packages/cli/test/extraction-markdown.test.ts b/packages/cli/test/extraction-markdown.test.ts index 77abc3b5b..46b819b99 100644 --- a/packages/cli/test/extraction-markdown.test.ts +++ b/packages/cli/test/extraction-markdown.test.ts @@ -47,6 +47,29 @@ describe('extractFromMarkdown — frontmatter', () => { expect(triples).toContainEqual({ subject: 'urn:dkg:md:doc-1', predicate: SCHEMA_DESCRIPTION, object: '"A short doc"' }); }); + it('normalizes unsafe frontmatter keys and bare type values into safe schema IRIs', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `---\nid: doc-1\ntype: Research Report\nrelease date: 2026-04-10\nauthor(s): Alice\n---\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ + subject: subjectIri, + predicate: RDF_TYPE, + object: 'http://schema.org/ResearchReport', + }); + expect(triples).toContainEqual({ + subject: subjectIri, + predicate: 'http://schema.org/releaseDate', + object: '"2026-04-10T00:00:00.000Z"', + }); + expect(triples).toContainEqual({ + subject: subjectIri, + predicate: 'http://schema.org/authors', + object: '"Alice"', + }); + }); + it('emits one triple per element for array values in frontmatter', () => { const { triples } = extractFromMarkdown({ markdown: `---\nid: doc\nauthors:\n - Alice\n - Bob\n---\n`, @@ -108,6 +131,17 @@ describe('extractFromMarkdown — wikilinks', () => { const mentions = triples.filter(t => t.predicate === SCHEMA_MENTIONS); expect(mentions).toHaveLength(1); }); + + it('ignores wikilinks inside code fences and derives H1 from visible markdown only', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `\`\`\`md\n# Hidden Title\n[[Hidden Target]]\n\`\`\`\n\n# Visible Title\n\nSee [[Visible Target]].\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri).toBe('urn:dkg:md:visible-title'); + const mentions = triples.filter(t => 
t.predicate === SCHEMA_MENTIONS).map(t => t.object); + expect(mentions).toEqual(['urn:dkg:md:visible-target']); + }); }); describe('extractFromMarkdown — hashtags', () => { @@ -255,6 +289,19 @@ describe('extractFromMarkdown — subject IRI resolution', () => { expect(subjectIri).toBe('urn:dkg:md:a-title-of-things'); }); + it('uses a hash fallback when non-ASCII titles and headings would slugify to empty strings', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# 東京\n\nSee [[大阪]].\n\n## 感想\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri).toMatch(/^urn:dkg:md:hash-[0-9a-f]{12}$/); + const mentions = triples.filter(t => t.predicate === SCHEMA_MENTIONS).map(t => t.object); + expect(mentions).toEqual([expect.stringMatching(/^urn:dkg:md:hash-[0-9a-f]{12}$/)]); + const sections = triples.filter(t => t.predicate === DKG_HAS_SECTION).map(t => t.object); + expect(sections).toEqual([expect.stringMatching(new RegExp(`^${subjectIri.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}#section-hash-[0-9a-f]{12}$`))]); + }); + it('produces a stable anonymous fallback when there is no title', () => { const { subjectIri } = extractFromMarkdown({ markdown: `Just a body. No headings, no frontmatter.\n`, diff --git a/packages/cli/test/import-file-integration.test.ts b/packages/cli/test/import-file-integration.test.ts index d3a773101..20a142864 100644 --- a/packages/cli/test/import-file-integration.test.ts +++ b/packages/cli/test/import-file-integration.test.ts @@ -118,6 +118,11 @@ interface ImportFileResult { }; } +function normalizeDetectedContentType(contentType: string | undefined): string { + const normalized = contentType?.split(';', 1)[0]?.trim().toLowerCase(); + return normalized && normalized.length > 0 ? 
normalized : 'application/octet-stream'; +} + async function runImportFileOrchestration(params: { agent: MockAgent; fileStore: FileStore; @@ -139,7 +144,7 @@ async function runImportFileOrchestration(params: { const contentTypeOverride = textField('contentType'); const ontologyRef = textField('ontologyRef'); const subGraphName = textField('subGraphName'); - const detectedContentType = contentTypeOverride ?? filePart.contentType ?? 'application/octet-stream'; + const detectedContentType = normalizeDetectedContentType(contentTypeOverride ?? filePart.contentType); const fileStoreEntry = await fileStore.put(filePart.content, detectedContentType); const assertionUri = contextGraphAssertionUri(contextGraphId, agent.peerId, assertionName, subGraphName); @@ -386,6 +391,22 @@ describe('import-file orchestration — happy paths', () => { expect(result.detectedContentType).toBe('text/markdown'); }); + it('normalizes markdown media types with parameters and casing before Phase 1 routing', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'Text/Markdown; charset=utf-8', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'doc', + }); + + expect(result.detectedContentType).toBe('text/markdown'); + expect(result.extraction.status).toBe('completed'); + expect(result.extraction.pipelineUsed).toBe('text/markdown'); + }); + it('contentType text field overrides the file part Content-Type header', async () => { const body = buildMultipart([ { kind: 'text', name: 'contextGraphId', value: 'cg' }, @@ -454,6 +475,31 @@ describe('import-file orchestration — happy paths', () => { expect(triples.some(t => t.object === 'urn:dkg:md:reference')).toBe(true); }); + it('normalizes converter 
media types before registry lookup', async () => { + const stubConverter: ExtractionPipeline = { + contentTypes: ['application/pdf'], + async extract(_input: ExtractionInput): Promise { + return { mdIntermediate: '# Converted\n\nBody.\n' }; + }, + }; + registry.register(stubConverter); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'research' }, + { kind: 'file', name: 'file', filename: 'paper.pdf', contentType: 'Application/PDF; charset=binary', content: Buffer.from('fake-pdf-bytes', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'paper-normalized', + }); + + expect(result.detectedContentType).toBe('application/pdf'); + expect(result.extraction.status).toBe('completed'); + expect(result.extraction.pipelineUsed).toBe('application/pdf'); + expect(result.extraction.mdIntermediateHash).toBeDefined(); + }); + it('passes ontologyRef through to the converter and Phase 2 extractor', async () => { let capturedOntologyRef: string | undefined; const stubConverter: ExtractionPipeline = { diff --git a/packages/core/src/extraction-pipeline.ts b/packages/core/src/extraction-pipeline.ts index fd28ad03f..76e97569c 100644 --- a/packages/core/src/extraction-pipeline.ts +++ b/packages/core/src/extraction-pipeline.ts @@ -60,6 +60,10 @@ export interface ExtractionPipeline { extract(input: ExtractionInput): Promise; } +function normalizeContentType(contentType: string): string { + return contentType.split(';', 1)[0]?.trim().toLowerCase() ?? ''; +} + /** * Registry that maps content types to converter pipelines. 
* Nodes register pipelines at startup; the import-file route handler @@ -72,16 +76,18 @@ export class ExtractionPipelineRegistry { register(pipeline: ExtractionPipeline): void { for (const ct of pipeline.contentTypes) { - this.pipelines.set(ct, pipeline); + const normalized = normalizeContentType(ct); + if (normalized.length === 0) continue; + this.pipelines.set(normalized, pipeline); } } get(contentType: string): ExtractionPipeline | undefined { - return this.pipelines.get(contentType); + return this.pipelines.get(normalizeContentType(contentType)); } has(contentType: string): boolean { - return this.pipelines.has(contentType); + return this.pipelines.has(normalizeContentType(contentType)); } availableContentTypes(): string[] { diff --git a/packages/core/test/extraction-pipeline.test.ts b/packages/core/test/extraction-pipeline.test.ts index b78a7d919..6acd86c4e 100644 --- a/packages/core/test/extraction-pipeline.test.ts +++ b/packages/core/test/extraction-pipeline.test.ts @@ -69,6 +69,16 @@ describe('ExtractionPipelineRegistry', () => { expect(registry.get('text/markdown')).toBe(mdPipeline); expect(registry.get('application/pdf')).toBe(pdfPipeline); }); + + it('normalizes casing and media-type parameters on registration and lookup', () => { + const registry = new ExtractionPipelineRegistry(); + const pipeline = makePipeline(['Application/PDF']); + registry.register(pipeline); + + expect(registry.has('application/pdf')).toBe(true); + expect(registry.get('APPLICATION/PDF; charset=utf-8')).toBe(pipeline); + expect(registry.availableContentTypes()).toEqual(['application/pdf']); + }); }); describe('ExtractionPipeline interface (Phase 1 converter)', () => { From 1ccd64b57ca5bf7299ac868a83800f0dfd8f631f Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Fri, 10 Apr 2026 19:36:56 +0200 Subject: [PATCH 06/12] fix(cli): close follow-up import review gaps --- packages/cli/src/daemon.ts | 46 +++++++++---------- .../cli/src/extraction/markdown-extractor.ts | 6 +-- 
packages/cli/src/http/multipart.ts | 17 +++---- packages/cli/test/extraction-markdown.test.ts | 16 +++++++ .../cli/test/import-file-integration.test.ts | 20 +++++++- packages/cli/test/multipart.test.ts | 9 ++++ 6 files changed, 78 insertions(+), 36 deletions(-) diff --git a/packages/cli/src/daemon.ts b/packages/cli/src/daemon.ts index c596df298..86b3586ad 100644 --- a/packages/cli/src/daemon.ts +++ b/packages/cli/src/daemon.ts @@ -2417,39 +2417,39 @@ async function handleRequest( // The sub-graph registration check in assertionCreate/Write (finding 4 of #81) // will throw if subGraphName is provided but unregistered — that's intentional. const allTriples = [...triples, ...provenance]; - if (allTriples.length > 0) { + try { + // Ensure the assertion graph exists even when Phase 2 yields zero triples, + // so a completed import always materializes the reported assertion URI. try { - // Ensure the assertion graph exists (idempotent — re-running import-file on - // the same assertion name simply adds new triples to the existing graph). - try { - await agent.assertion.create( - contextGraphId!, - assertionName, - subGraphName ? { subGraphName } : undefined, - ); - } catch (err: any) { - // create() on an existing graph is idempotent in oxigraph, but if the - // error is about the sub-graph not being registered, propagate it. - if (err.message?.includes('has not been registered')) { - return jsonResponse(res, 400, { error: err.message }); - } - // Other errors from create() can be ignored if the graph already exists. - } - await agent.assertion.write( + await agent.assertion.create( contextGraphId!, assertionName, - allTriples.map(t => ({ subject: t.subject, predicate: t.predicate, object: t.object })), subGraphName ? { subGraphName } : undefined, ); } catch (err: any) { + // create() on an existing graph is idempotent in oxigraph, but if the + // error is about the sub-graph not being registered, propagate it. 
if (err.message?.includes('has not been registered')) { return jsonResponse(res, 400, { error: err.message }); } - if (err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { - return jsonResponse(res, 400, { error: err.message }); - } - throw err; + // Other errors from create() can be ignored if the graph already exists. + } + if (allTriples.length > 0) { + await agent.assertion.write( + contextGraphId!, + assertionName, + allTriples.map(t => ({ subject: t.subject, predicate: t.predicate, object: t.object })), + subGraphName ? { subGraphName } : undefined, + ); } + } catch (err: any) { + if (err.message?.includes('has not been registered')) { + return jsonResponse(res, 400, { error: err.message }); + } + if (err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { + return jsonResponse(res, 400, { error: err.message }); + } + throw err; } const completedRecord: ExtractionStatusRecord = { diff --git a/packages/cli/src/extraction/markdown-extractor.ts b/packages/cli/src/extraction/markdown-extractor.ts index 6c4a4497a..fd4f7732e 100644 --- a/packages/cli/src/extraction/markdown-extractor.ts +++ b/packages/cli/src/extraction/markdown-extractor.ts @@ -148,7 +148,7 @@ function normalizeSchemaLocalName(raw: string, kind: 'property' | 'class'): stri * 1. explicit `documentIri` argument, or * 2. frontmatter `id` (if it looks like an IRI or a slug), or * 3. slugified first H1 heading with an `urn:dkg:md:` prefix, or - * 4. stable fallback `urn:dkg:md:anonymous-{short-hash}`. + * 4. stable fallback `urn:dkg:md:anonymous-{short-hash}` derived from the full body. 
*/ function resolveSubjectIri( input: MarkdownExtractInput, @@ -166,9 +166,7 @@ function resolveSubjectIri( const h1 = findFirstH1(body); if (h1) return `urn:dkg:md:${slugify(h1)}`; - // Stable fallback: hash-like suffix derived from content length and first chars - const snippet = body.slice(0, 32).replace(/\s+/g, '-').replace(/[^a-zA-Z0-9-]/g, ''); - return `urn:dkg:md:anonymous-${snippet.slice(0, 16) || 'empty'}`; + return `urn:dkg:md:anonymous-${shortHash(body)}`; } /** Resolve a value from a frontmatter `type` field to a full IRI. */ diff --git a/packages/cli/src/http/multipart.ts b/packages/cli/src/http/multipart.ts index f9af534ad..f24860df4 100644 --- a/packages/cli/src/http/multipart.ts +++ b/packages/cli/src/http/multipart.ts @@ -58,6 +58,7 @@ export function parseMultipart(body: Buffer, boundary: string): MultipartField[] throw new MultipartParseError('Empty boundary'); } const delimiter = Buffer.from(`--${boundary}`); + const encapsulatedDelimiter = Buffer.from(`\r\n--${boundary}`); const crlf = Buffer.from('\r\n'); const doubleCrlf = Buffer.from('\r\n\r\n'); @@ -96,16 +97,16 @@ export function parseMultipart(body: Buffer, boundary: string): MultipartField[] const headers = parseHeaders(headerBytes); const contentStart = headerEnd + doubleCrlf.length; - // Find next boundary — part body runs from contentStart to (next delimiter - CRLF) - const nextDelimiter = body.indexOf(delimiter, contentStart); - if (nextDelimiter < 0) { + // Find the next real multipart boundary. Per RFC 2046, encapsulated boundaries + // must start on a new line, so raw `--${boundary}` bytes inside the payload do + // not count unless they are preceded by CRLF. + const nextBoundary = body.indexOf(encapsulatedDelimiter, contentStart); + if (nextBoundary < 0) { throw new MultipartParseError('Malformed part: no closing boundary'); } - // Strip the CRLF that precedes the next delimiter (part body ends at the CRLF). 
- let contentEnd = nextDelimiter; - if (contentEnd >= 2 && body[contentEnd - 2] === 0x0d && body[contentEnd - 1] === 0x0a) { - contentEnd -= 2; - } + const nextDelimiter = nextBoundary + crlf.length; + // Part body ends at the CRLF that introduces the next boundary. + const contentEnd = nextBoundary; const content = body.subarray(contentStart, contentEnd); const disposition = headers.get('content-disposition'); diff --git a/packages/cli/test/extraction-markdown.test.ts b/packages/cli/test/extraction-markdown.test.ts index 46b819b99..39b31fa69 100644 --- a/packages/cli/test/extraction-markdown.test.ts +++ b/packages/cli/test/extraction-markdown.test.ts @@ -310,6 +310,22 @@ describe('extractFromMarkdown — subject IRI resolution', () => { }); expect(subjectIri.startsWith('urn:dkg:md:anonymous-')).toBe(true); }); + + it('derives anonymous fallback subjects from the full body instead of a shared prefix', () => { + const first = extractFromMarkdown({ + markdown: `Shared prefix line\nBut a different ending A\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const second = extractFromMarkdown({ + markdown: `Shared prefix line\nBut a different ending B\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(first.subjectIri).not.toBe(second.subjectIri); + expect(first.subjectIri).toMatch(/^urn:dkg:md:anonymous-[0-9a-f]{12}$/); + expect(second.subjectIri).toMatch(/^urn:dkg:md:anonymous-[0-9a-f]{12}$/); + }); }); describe('extractFromMarkdown — provenance', () => { diff --git a/packages/cli/test/import-file-integration.test.ts b/packages/cli/test/import-file-integration.test.ts index 20a142864..979187562 100644 --- a/packages/cli/test/import-file-integration.test.ts +++ b/packages/cli/test/import-file-integration.test.ts @@ -202,8 +202,8 @@ async function runImportFileOrchestration(params: { }); const allTriples = [...triples, ...provenance]; + await agent.assertion.create(contextGraphId, assertionName, subGraphName ? 
{ subGraphName } : undefined); if (allTriples.length > 0) { - await agent.assertion.create(contextGraphId, assertionName, subGraphName ? { subGraphName } : undefined); await agent.assertion.write( contextGraphId, assertionName, @@ -540,6 +540,24 @@ describe('import-file orchestration — happy paths', () => { expect(agent.createdAssertions[0]).toEqual({ contextGraphId: 'cg', name: 'decision-1', subGraphName: 'decisions' }); expect(agent.capturedWrites[0].subGraphName).toBe('decisions'); }); + + it('creates the assertion graph even when Phase 2 extracts zero triples', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'empty.md', contentType: 'text/markdown', content: Buffer.from('', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'empty-doc', + }); + + expect(result.extraction.status).toBe('completed'); + expect(result.extraction.tripleCount).toBe(0); + expect(agent.createdAssertions).toHaveLength(1); + expect(agent.createdAssertions[0]).toEqual({ contextGraphId: 'cg', name: 'empty-doc', subGraphName: undefined }); + expect(agent.capturedWrites).toHaveLength(0); + }); }); describe('import-file orchestration — graceful degrade', () => { diff --git a/packages/cli/test/multipart.test.ts b/packages/cli/test/multipart.test.ts index ba3a47e96..5638fd408 100644 --- a/packages/cli/test/multipart.test.ts +++ b/packages/cli/test/multipart.test.ts @@ -117,6 +117,15 @@ describe('parseMultipart — file fields', () => { expect(fields[0].content.equals(binary)).toBe(true); }); + it('does not treat boundary bytes inside file payload as the next multipart boundary', () => { + const payload = Buffer.from(`prefix--${BOUNDARY}--suffix`, 'utf-8'); + const body = buildBody(filePart('file', 'embedded-boundary.bin', 'application/octet-stream', 
payload)); + + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(1); + expect(fields[0].content.equals(payload)).toBe(true); + }); + it('extracts mixed text and file parts in a single body', () => { const fileContent = Buffer.from('file body', 'utf-8'); const body = buildBody( From e798893b51d56ab3bfbf432268f398971dc9a9e7 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Fri, 10 Apr 2026 19:58:48 +0200 Subject: [PATCH 07/12] fix: address PR 113 follow-up review comments --- packages/cli/src/daemon.ts | 47 ++++---- .../cli/src/extraction/markdown-extractor.ts | 4 +- packages/cli/test/extraction-markdown.test.ts | 25 +++- .../cli/test/import-file-integration.test.ts | 114 +++++++++++++++--- packages/publisher/src/dkg-publisher.ts | 42 ++++--- .../publisher/test/draft-lifecycle.test.ts | 43 ++++++- 6 files changed, 212 insertions(+), 63 deletions(-) diff --git a/packages/cli/src/daemon.ts b/packages/cli/src/daemon.ts index 86b3586ad..ed052e4b7 100644 --- a/packages/cli/src/daemon.ts +++ b/packages/cli/src/daemon.ts @@ -2303,6 +2303,25 @@ async function handleRequest( let mdIntermediate: string | null = null; let pipelineUsed: string | null = null; let mdIntermediateHash: string | undefined; + const recordFailedExtraction = ( + error: string, + tripleCount: number, + failedPipelineUsed: string | null = pipelineUsed, + ): ExtractionStatusRecord => { + const failedRecord: ExtractionStatusRecord = { + status: 'failed', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed: failedPipelineUsed, + tripleCount, + ...(mdIntermediateHash ? 
{ mdIntermediateHash } : {}), + error, + startedAt, + completedAt: new Date().toISOString(), + }; + extractionStatus.set(assertionUri, failedRecord); + return failedRecord; + }; if (detectedContentType === 'text/markdown') { mdIntermediate = filePart.content.toString('utf-8'); @@ -2323,17 +2342,7 @@ async function handleRequest( mdIntermediateHash = mdEntry.hash; } catch (err: any) { // Phase 1 failure: record in status map, return error response - const failedRecord: ExtractionStatusRecord = { - status: 'failed', - fileHash: fileStoreEntry.hash, - detectedContentType, - pipelineUsed: detectedContentType, - tripleCount: 0, - error: `Phase 1 converter failed: ${err.message}`, - startedAt, - completedAt: new Date().toISOString(), - }; - extractionStatus.set(assertionUri, failedRecord); + const failedRecord = recordFailedExtraction(`Phase 1 converter failed: ${err.message}`, 0, detectedContentType); return jsonResponse(res, 500, { assertionUri, fileHash: fileStoreEntry.hash, @@ -2387,18 +2396,7 @@ async function handleRequest( triples = result.triples; provenance = result.provenance; } catch (err: any) { - const failedRecord: ExtractionStatusRecord = { - status: 'failed', - fileHash: fileStoreEntry.hash, - detectedContentType, - pipelineUsed, - tripleCount: 0, - mdIntermediateHash, - error: `Phase 2 extraction failed: ${err.message}`, - startedAt, - completedAt: new Date().toISOString(), - }; - extractionStatus.set(assertionUri, failedRecord); + const failedRecord = recordFailedExtraction(`Phase 2 extraction failed: ${err.message}`, 0); return jsonResponse(res, 500, { assertionUri, fileHash: fileStoreEntry.hash, @@ -2430,6 +2428,7 @@ async function handleRequest( // create() on an existing graph is idempotent in oxigraph, but if the // error is about the sub-graph not being registered, propagate it. 
if (err.message?.includes('has not been registered')) { + recordFailedExtraction(err.message, triples.length); return jsonResponse(res, 400, { error: err.message }); } // Other errors from create() can be ignored if the graph already exists. @@ -2444,9 +2443,11 @@ async function handleRequest( } } catch (err: any) { if (err.message?.includes('has not been registered')) { + recordFailedExtraction(err.message, triples.length); return jsonResponse(res, 400, { error: err.message }); } if (err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { + recordFailedExtraction(err.message, triples.length); return jsonResponse(res, 400, { error: err.message }); } throw err; diff --git a/packages/cli/src/extraction/markdown-extractor.ts b/packages/cli/src/extraction/markdown-extractor.ts index fd4f7732e..f6a6bbdb6 100644 --- a/packages/cli/src/extraction/markdown-extractor.ts +++ b/packages/cli/src/extraction/markdown-extractor.ts @@ -318,9 +318,11 @@ export function extractFromMarkdown(input: MarkdownExtractInput): MarkdownExtrac } // ── 5. 
Headings → dkg:hasSection ─────────────────────────────────── + let sectionIndex = 0; for (const heading of extractHeadings(body)) { if (heading.level === 1) continue; // H1 is the document title, not a section - const sectionIri = `${subject}#section-${slugify(heading.text)}`; + sectionIndex += 1; + const sectionIri = `${subject}#section-${sectionIndex}-${slugify(heading.text)}`; triples.push({ subject, predicate: DKG_HAS_SECTION, object: sectionIri }); triples.push({ subject: sectionIri, predicate: SCHEMA_NAME, object: JSON.stringify(heading.text) }); } diff --git a/packages/cli/test/extraction-markdown.test.ts b/packages/cli/test/extraction-markdown.test.ts index 39b31fa69..9c1166fe9 100644 --- a/packages/cli/test/extraction-markdown.test.ts +++ b/packages/cli/test/extraction-markdown.test.ts @@ -220,9 +220,9 @@ describe('extractFromMarkdown — headings', () => { const sections = triples.filter(t => t.predicate === DKG_HAS_SECTION); expect(sections).toHaveLength(3); expect(sections.map(t => t.object)).toEqual([ - `${subjectIri}#section-intro`, - `${subjectIri}#section-methods`, - `${subjectIri}#section-sub-method`, + `${subjectIri}#section-1-intro`, + `${subjectIri}#section-2-methods`, + `${subjectIri}#section-3-sub-method`, ]); // Each section should have a schema:name for (const section of sections) { @@ -230,6 +230,19 @@ describe('extractFromMarkdown — headings', () => { } }); + it('disambiguates repeated headings by prefixing a stable section index', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# Title\n\n## Overview\n\nText.\n\n## Overview\n\nMore text.\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + const sections = triples.filter(t => t.predicate === DKG_HAS_SECTION).map(t => t.object); + expect(sections).toEqual([ + `${subjectIri}#section-1-overview`, + `${subjectIri}#section-2-overview`, + ]); + }); + it('H1 promotes to schema:name on the document subject', () => { const { triples, subjectIri } = extractFromMarkdown({ 
markdown: `# My Document\n\nBody.\n`, @@ -299,7 +312,7 @@ describe('extractFromMarkdown — subject IRI resolution', () => { const mentions = triples.filter(t => t.predicate === SCHEMA_MENTIONS).map(t => t.object); expect(mentions).toEqual([expect.stringMatching(/^urn:dkg:md:hash-[0-9a-f]{12}$/)]); const sections = triples.filter(t => t.predicate === DKG_HAS_SECTION).map(t => t.object); - expect(sections).toEqual([expect.stringMatching(new RegExp(`^${subjectIri.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}#section-hash-[0-9a-f]{12}$`))]); + expect(sections).toEqual([expect.stringMatching(new RegExp(`^${subjectIri.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}#section-1-hash-[0-9a-f]{12}$`))]); }); it('produces a stable anonymous fallback when there is no title', () => { @@ -437,8 +450,8 @@ Our method relies on [[SPARQL]] queries. // Sections const sections = triples.filter(t => t.predicate === DKG_HAS_SECTION).map(t => t.object); expect(sections).toEqual([ - `${subjectIri}#section-background`, - `${subjectIri}#section-methods`, + `${subjectIri}#section-1-background`, + `${subjectIri}#section-2-methods`, ]); // Provenance present diff --git a/packages/cli/test/import-file-integration.test.ts b/packages/cli/test/import-file-integration.test.ts index 979187562..57bec2e68 100644 --- a/packages/cli/test/import-file-integration.test.ts +++ b/packages/cli/test/import-file-integration.test.ts @@ -79,7 +79,12 @@ interface MockAgent { createdAssertions: Array<{ contextGraphId: string; name: string; subGraphName?: string }>; } -function makeMockAgent(peerId = '0xMockAgentPeerId'): MockAgent { +interface MockAgentOptions { + createError?: Error; + writeError?: Error; +} + +function makeMockAgent(peerId = '0xMockAgentPeerId', options: MockAgentOptions = {}): MockAgent { const capturedWrites: CapturedAssertionWrite[] = []; const createdAssertions: Array<{ contextGraphId: string; name: string; subGraphName?: string }> = []; return { @@ -88,6 +93,7 @@ function makeMockAgent(peerId = 
'0xMockAgentPeerId'): MockAgent { createdAssertions, assertion: { async create(contextGraphId: string, name: string, opts?: { subGraphName?: string }): Promise { + if (options.createError) throw options.createError; createdAssertions.push({ contextGraphId, name, subGraphName: opts?.subGraphName }); return contextGraphAssertionUri(contextGraphId, peerId, name, opts?.subGraphName); }, @@ -97,6 +103,7 @@ function makeMockAgent(peerId = '0xMockAgentPeerId'): MockAgent { triples: Array<{ subject: string; predicate: string; object: string }>, opts?: { subGraphName?: string }, ): Promise { + if (options.writeError) throw options.writeError; capturedWrites.push({ contextGraphId, name, triples, subGraphName: opts?.subGraphName }); }, }, @@ -194,22 +201,52 @@ async function runImportFileOrchestration(params: { } // Phase 2 - const { triples, provenance } = extractFromMarkdown({ - markdown: mdIntermediate, - agentDid: `did:dkg:agent:${agent.peerId}`, - ontologyRef, - documentIri: assertionUri, - }); + const recordFailed = (error: string, tripleCount: number): void => { + extractionStatus.set(assertionUri, { + status: 'failed', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed, + tripleCount, + ...(mdIntermediateHash ? { mdIntermediateHash } : {}), + error, + startedAt, + completedAt: new Date().toISOString(), + }); + }; + + let triples: ReturnType['triples']; + let provenance: ReturnType['provenance']; + try { + const result = extractFromMarkdown({ + markdown: mdIntermediate, + agentDid: `did:dkg:agent:${agent.peerId}`, + ontologyRef, + documentIri: assertionUri, + }); + triples = result.triples; + provenance = result.provenance; + } catch (err: any) { + recordFailed(`Phase 2 extraction failed: ${err.message}`, 0); + throw err; + } const allTriples = [...triples, ...provenance]; - await agent.assertion.create(contextGraphId, assertionName, subGraphName ? 
{ subGraphName } : undefined); - if (allTriples.length > 0) { - await agent.assertion.write( - contextGraphId, - assertionName, - allTriples.map(t => ({ subject: t.subject, predicate: t.predicate, object: t.object })), - subGraphName ? { subGraphName } : undefined, - ); + try { + await agent.assertion.create(contextGraphId, assertionName, subGraphName ? { subGraphName } : undefined); + if (allTriples.length > 0) { + await agent.assertion.write( + contextGraphId, + assertionName, + allTriples.map(t => ({ subject: t.subject, predicate: t.predicate, object: t.object })), + subGraphName ? { subGraphName } : undefined, + ); + } + } catch (err: any) { + if (err.message?.includes('has not been registered') || err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { + recordFailed(err.message, triples.length); + } + throw err; } const completedRecord: ExtractionStatusRecord = { @@ -558,6 +595,53 @@ describe('import-file orchestration — happy paths', () => { expect(agent.createdAssertions[0]).toEqual({ contextGraphId: 'cg', name: 'empty-doc', subGraphName: undefined }); expect(agent.capturedWrites).toHaveLength(0); }); + + it('records failed extraction status when assertion.create rejects an unregistered sub-graph', async () => { + agent = makeMockAgent('0xMockAgentPeerId', { + createError: new Error('Sub-graph "decisions" has not been registered in context graph "cg". 
Call createSubGraph() first.'), + }); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'text', name: 'subGraphName', value: 'decisions' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + await expect(runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'decision-1', + })).rejects.toThrow('has not been registered'); + + const assertionUri = contextGraphAssertionUri('cg', agent.peerId, 'decision-1', 'decisions'); + const record = status.get(assertionUri); + expect(record).toBeDefined(); + expect(record?.status).toBe('failed'); + expect(record?.error).toContain('has not been registered'); + expect(record?.tripleCount).toBeGreaterThan(0); + }); + + it('records failed extraction status when assertion.write rejects invalid triples', async () => { + agent = makeMockAgent('0xMockAgentPeerId', { + writeError: new Error('Invalid triple object'), + }); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + await expect(runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'invalid-write', + })).rejects.toThrow('Invalid triple object'); + + const assertionUri = contextGraphAssertionUri('cg', agent.peerId, 'invalid-write'); + const record = status.get(assertionUri); + expect(record).toBeDefined(); + expect(record?.status).toBe('failed'); + expect(record?.error).toBe('Invalid triple object'); + expect(record?.tripleCount).toBeGreaterThan(0); + }); }); describe('import-file orchestration — graceful degrade', () => { diff --git 
a/packages/publisher/src/dkg-publisher.ts b/packages/publisher/src/dkg-publisher.ts index 17848c60a..26e94eab5 100644 --- a/packages/publisher/src/dkg-publisher.ts +++ b/packages/publisher/src/dkg-publisher.ts @@ -708,20 +708,7 @@ export class DKGPublisher implements Publisher { // AccessHandler.lookupKAMeta() and DKGQueryEngine.resolveKA() can still discover // the KC without knowing which sub-graph holds the data triples. if (options.subGraphName && !options.targetGraphUri) { - const sgValidation = validateSubGraphName(options.subGraphName); - if (!sgValidation.valid) throw new Error(`Invalid sub-graph name: ${sgValidation.reason}`); - - const sgUri = contextGraphSubGraphUri(options.contextGraphId, options.subGraphName); - const registered = await this.store.query( - `ASK { GRAPH { <${assertSafeIri(sgUri)}> ?p ?o } }`, - ); - if (registered.type === 'boolean' && !registered.value) { - throw new Error( - `Sub-graph "${options.subGraphName}" has not been registered in context graph "${options.contextGraphId}". ` + - `Call createSubGraph() first.`, - ); - } - + const sgUri = await this.requireRegisteredSubGraph(options.contextGraphId, options.subGraphName); options = { ...options, targetGraphUri: sgUri, @@ -1469,6 +1456,27 @@ export class DKGPublisher implements Publisher { } } + private async requireRegisteredSubGraph( + contextGraphId: string, + subGraphName: string | undefined, + ): Promise { + DKGPublisher.validateOptionalSubGraph(subGraphName); + if (!subGraphName) return undefined; + + const sgUri = contextGraphSubGraphUri(contextGraphId, subGraphName); + const registered = await this.store.query( + `ASK { GRAPH { <${assertSafeIri(sgUri)}> ?p ?o } }`, + ); + if (registered.type === 'boolean' && !registered.value) { + throw new Error( + `Sub-graph "${subGraphName}" has not been registered in context graph "${contextGraphId}". 
` + + `Call createSubGraph() first.`, + ); + } + + return sgUri; + } + clearSubGraphOwnership(ownershipKey: string): void { this.sharedMemoryOwnedEntities.delete(ownershipKey); this.ownedEntities.delete(ownershipKey); @@ -1476,7 +1484,7 @@ export class DKGPublisher implements Publisher { } async assertionCreate(contextGraphId: string, name: string, agentAddress: string, subGraphName?: string): Promise { - DKGPublisher.validateOptionalSubGraph(subGraphName); + await this.requireRegisteredSubGraph(contextGraphId, subGraphName); const graphUri = contextGraphAssertionUri(contextGraphId, agentAddress, name, subGraphName); await this.store.createGraph(graphUri); return graphUri; @@ -1489,7 +1497,7 @@ export class DKGPublisher implements Publisher { input: Quad[] | Array<{ subject: string; predicate: string; object: string }>, subGraphName?: string, ): Promise { - DKGPublisher.validateOptionalSubGraph(subGraphName); + await this.requireRegisteredSubGraph(contextGraphId, subGraphName); const graphUri = contextGraphAssertionUri(contextGraphId, agentAddress, name, subGraphName); const quads = input.map((t) => ({ subject: t.subject, predicate: t.predicate, object: t.object, graph: graphUri, @@ -1517,7 +1525,7 @@ export class DKGPublisher implements Publisher { agentAddress: string, opts?: { entities?: string[] | 'all'; subGraphName?: string }, ): Promise<{ promotedCount: number }> { - DKGPublisher.validateOptionalSubGraph(opts?.subGraphName); + await this.requireRegisteredSubGraph(contextGraphId, opts?.subGraphName); const graphUri = contextGraphAssertionUri(contextGraphId, agentAddress, name, opts?.subGraphName); const swmGraphUri = this.graphManager.sharedMemoryUri(contextGraphId, opts?.subGraphName); diff --git a/packages/publisher/test/draft-lifecycle.test.ts b/packages/publisher/test/draft-lifecycle.test.ts index 514801d79..f8d7aeeb2 100644 --- a/packages/publisher/test/draft-lifecycle.test.ts +++ b/packages/publisher/test/draft-lifecycle.test.ts @@ -2,7 +2,7 @@ import { 
describe, it, expect, beforeEach } from 'vitest'; import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; import { MockChainAdapter } from '@origintrail-official/dkg-chain'; import { TypedEventBus, generateEd25519Keypair, contextGraphAssertionUri } from '@origintrail-official/dkg-core'; -import { DKGPublisher } from '../src/index.js'; +import { DKGPublisher, generateSubGraphRegistration } from '../src/index.js'; import { ethers } from 'ethers'; const CG_ID = 'test-assertion-cg'; @@ -10,6 +10,7 @@ const SWM_GRAPH = `did:dkg:context-graph:${CG_ID}/_shared_memory`; const AGENT = '0x1234567890abcdef1234567890abcdef12345678'; const AGENT_B = '0xabcdefabcdefabcdefabcdefabcdefabcdefabcd'; const ASSERTION_NAME = 'my-assertion'; +const SUB_GRAPH_NAME = 'code'; const TRIPLES = [ { subject: 'urn:test:entity:alice', predicate: 'http://schema.org/name', object: '"Alice"' }, @@ -21,6 +22,13 @@ describe('Working Memory Assertion Lifecycle', () => { let store: OxigraphStore; let publisher: DKGPublisher; + const subGraphRegistration = () => generateSubGraphRegistration({ + contextGraphId: CG_ID, + subGraphName: SUB_GRAPH_NAME, + createdBy: AGENT, + timestamp: new Date('2026-04-10T00:00:00.000Z'), + }); + beforeEach(async () => { store = new OxigraphStore(); const wallet = ethers.Wallet.createRandom(); @@ -41,6 +49,24 @@ describe('Working Memory Assertion Lifecycle', () => { expect(uri).toBe(contextGraphAssertionUri(CG_ID, AGENT, ASSERTION_NAME)); }); + it('requires registered sub-graphs before creating assertion graphs inside them', async () => { + await expect( + publisher.assertionCreate(CG_ID, ASSERTION_NAME, AGENT, SUB_GRAPH_NAME), + ).rejects.toThrow(`Sub-graph "${SUB_GRAPH_NAME}" has not been registered in context graph "${CG_ID}". 
Call createSubGraph() first.`); + }); + + it('requires registered sub-graphs before writing into sub-graph assertions', async () => { + await expect( + publisher.assertionWrite(CG_ID, ASSERTION_NAME, AGENT, TRIPLES, SUB_GRAPH_NAME), + ).rejects.toThrow(`Sub-graph "${SUB_GRAPH_NAME}" has not been registered in context graph "${CG_ID}". Call createSubGraph() first.`); + }); + + it('requires registered sub-graphs before promoting sub-graph assertions', async () => { + await expect( + publisher.assertionPromote(CG_ID, ASSERTION_NAME, AGENT, { subGraphName: SUB_GRAPH_NAME }), + ).rejects.toThrow(`Sub-graph "${SUB_GRAPH_NAME}" has not been registered in context graph "${CG_ID}". Call createSubGraph() first.`); + }); + it('write inserts triples into the assertion graph', async () => { await publisher.assertionCreate(CG_ID, ASSERTION_NAME, AGENT); await publisher.assertionWrite(CG_ID, ASSERTION_NAME, AGENT, TRIPLES); @@ -140,6 +166,21 @@ describe('Working Memory Assertion Lifecycle', () => { expect(agentBQuads[0].subject).toBe('urn:test:bob'); }); + it('query and discard still work for orphaned sub-graph assertions after deregistration', async () => { + await store.insert(subGraphRegistration()); + await publisher.assertionCreate(CG_ID, ASSERTION_NAME, AGENT, SUB_GRAPH_NAME); + await publisher.assertionWrite(CG_ID, ASSERTION_NAME, AGENT, TRIPLES, SUB_GRAPH_NAME); + + await store.delete(subGraphRegistration()); + + const quads = await publisher.assertionQuery(CG_ID, ASSERTION_NAME, AGENT, SUB_GRAPH_NAME); + expect(quads).toHaveLength(3); + + await publisher.assertionDiscard(CG_ID, ASSERTION_NAME, AGENT, SUB_GRAPH_NAME); + const afterDiscard = await publisher.assertionQuery(CG_ID, ASSERTION_NAME, AGENT, SUB_GRAPH_NAME); + expect(afterDiscard).toHaveLength(0); + }); + it('promote on empty assertion returns 0', async () => { await publisher.assertionCreate(CG_ID, ASSERTION_NAME, AGENT); const result = await publisher.assertionPromote(CG_ID, ASSERTION_NAME, AGENT); From 
51dd7cf4268d5458d85e3e0f03ac7715003513ef Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Fri, 10 Apr 2026 20:48:58 +0200 Subject: [PATCH 08/12] fix: harden import-file extraction flow --- packages/cli/src/daemon.ts | 217 +++++++++++++----- .../cli/src/extraction/markdown-extractor.ts | 37 ++- packages/cli/src/file-store.ts | 24 +- packages/cli/test/extraction-markdown.test.ts | 62 ++++- packages/cli/test/file-store.test.ts | 14 +- .../cli/test/import-file-integration.test.ts | 194 ++++++++++++++-- 6 files changed, 447 insertions(+), 101 deletions(-) diff --git a/packages/cli/src/daemon.ts b/packages/cli/src/daemon.ts index ed052e4b7..fc2862202 100644 --- a/packages/cli/src/daemon.ts +++ b/packages/cli/src/daemon.ts @@ -2279,6 +2279,17 @@ async function handleRequest( const detectedContentType = normalizeDetectedContentType(contentTypeOverride ?? filePart.contentType); + if (subGraphName) { + try { + const registeredSubGraphs: Array<{ name: string }> = await agent.listSubGraphs(contextGraphId!); + if (!registeredSubGraphs.some(subGraph => subGraph.name === subGraphName)) { + return jsonResponse(res, 400, { error: unregisteredSubGraphError(contextGraphId!, subGraphName) }); + } + } catch (err: any) { + return jsonResponse(res, 500, { error: `Failed to verify sub-graph registration: ${err.message}` }); + } + } + // Persist the original upload to the file store. 
let fileStoreEntry; try { @@ -2303,6 +2314,28 @@ async function handleRequest( let mdIntermediate: string | null = null; let pipelineUsed: string | null = null; let mdIntermediateHash: string | undefined; + const respondWithImportFileResponse = (statusCode: number, extraction: ImportFileExtractionPayload) => + jsonResponse( + res, + statusCode, + buildImportFileResponse({ + assertionUri, + fileHash: fileStoreEntry.hash, + detectedContentType, + extraction, + }), + ); + const recordInProgressExtraction = (): void => { + setExtractionStatusRecord(extractionStatus, assertionUri, { + status: 'in_progress', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed, + tripleCount: 0, + ...(mdIntermediateHash ? { mdIntermediateHash } : {}), + startedAt, + }); + }; const recordFailedExtraction = ( error: string, tripleCount: number, @@ -2319,13 +2352,31 @@ async function handleRequest( startedAt, completedAt: new Date().toISOString(), }; - extractionStatus.set(assertionUri, failedRecord); + setExtractionStatusRecord(extractionStatus, assertionUri, failedRecord); return failedRecord; }; + const respondWithFailedExtraction = ( + statusCode: number, + error: string, + tripleCount: number, + failedPipelineUsed: string | null = pipelineUsed, + ) => { + const failedRecord = recordFailedExtraction(error, tripleCount, failedPipelineUsed); + return respondWithImportFileResponse(statusCode, { + status: 'failed', + tripleCount, + pipelineUsed: failedRecord.pipelineUsed, + ...(failedRecord.mdIntermediateHash ? 
{ mdIntermediateHash: failedRecord.mdIntermediateHash } : {}), + error, + }); + }; + + recordInProgressExtraction(); if (detectedContentType === 'text/markdown') { mdIntermediate = filePart.content.toString('utf-8'); pipelineUsed = 'text/markdown'; + recordInProgressExtraction(); } else { const converter = extractionRegistry.get(detectedContentType); if (converter) { @@ -2340,20 +2391,9 @@ async function handleRequest( pipelineUsed = detectedContentType; const mdEntry = await fileStore.put(Buffer.from(md, 'utf-8'), 'text/markdown'); mdIntermediateHash = mdEntry.hash; + recordInProgressExtraction(); } catch (err: any) { - // Phase 1 failure: record in status map, return error response - const failedRecord = recordFailedExtraction(`Phase 1 converter failed: ${err.message}`, 0, detectedContentType); - return jsonResponse(res, 500, { - assertionUri, - fileHash: fileStoreEntry.hash, - detectedContentType, - extraction: { - status: 'failed' as const, - tripleCount: 0, - pipelineUsed: detectedContentType, - error: `Phase 1 converter failed: ${err.message}`, - }, - }); + return respondWithFailedExtraction(500, `Phase 1 converter failed: ${err.message}`, 0, detectedContentType); } } } @@ -2370,16 +2410,11 @@ async function handleRequest( startedAt, completedAt: new Date().toISOString(), }; - extractionStatus.set(assertionUri, skippedRecord); - return jsonResponse(res, 200, { - assertionUri, - fileHash: fileStoreEntry.hash, - detectedContentType, - extraction: { - status: 'skipped' as const, - tripleCount: 0, - pipelineUsed: null, - }, + setExtractionStatusRecord(extractionStatus, assertionUri, skippedRecord); + return respondWithImportFileResponse(200, { + status: 'skipped', + tripleCount: 0, + pipelineUsed: null, }); } @@ -2396,19 +2431,7 @@ async function handleRequest( triples = result.triples; provenance = result.provenance; } catch (err: any) { - const failedRecord = recordFailedExtraction(`Phase 2 extraction failed: ${err.message}`, 0); - return jsonResponse(res, 500, 
{ - assertionUri, - fileHash: fileStoreEntry.hash, - detectedContentType, - extraction: { - status: 'failed' as const, - tripleCount: 0, - pipelineUsed, - mdIntermediateHash, - error: `Phase 2 extraction failed: ${err.message}`, - }, - }); + return respondWithFailedExtraction(500, `Phase 2 extraction failed: ${err.message}`, 0); } // ── Write triples + provenance to the assertion graph ── @@ -2428,8 +2451,7 @@ async function handleRequest( // create() on an existing graph is idempotent in oxigraph, but if the // error is about the sub-graph not being registered, propagate it. if (err.message?.includes('has not been registered')) { - recordFailedExtraction(err.message, triples.length); - return jsonResponse(res, 400, { error: err.message }); + return respondWithFailedExtraction(400, err.message, triples.length); } // Other errors from create() can be ignored if the graph already exists. } @@ -2443,12 +2465,10 @@ async function handleRequest( } } catch (err: any) { if (err.message?.includes('has not been registered')) { - recordFailedExtraction(err.message, triples.length); - return jsonResponse(res, 400, { error: err.message }); + return respondWithFailedExtraction(400, err.message, triples.length); } if (err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { - recordFailedExtraction(err.message, triples.length); - return jsonResponse(res, 400, { error: err.message }); + return respondWithFailedExtraction(400, err.message, triples.length); } throw err; } @@ -2463,18 +2483,13 @@ async function handleRequest( startedAt, completedAt: new Date().toISOString(), }; - extractionStatus.set(assertionUri, completedRecord); + setExtractionStatusRecord(extractionStatus, assertionUri, completedRecord); - return jsonResponse(res, 200, { - assertionUri, - fileHash: fileStoreEntry.hash, - detectedContentType, - extraction: { - status: 'completed' as const, - tripleCount: triples.length, - pipelineUsed, - ...(mdIntermediateHash ? 
{ mdIntermediateHash } : {}), - }, + return respondWithImportFileResponse(200, { + status: 'completed', + tripleCount: triples.length, + pipelineUsed, + ...(mdIntermediateHash ? { mdIntermediateHash } : {}), }); } @@ -2500,7 +2515,7 @@ async function handleRequest( assertionName, subGraphName, ); - const record = extractionStatus.get(assertionUri); + const record = getExtractionStatusRecord(extractionStatus, assertionUri); if (!record) { return jsonResponse(res, 404, { error: `No extraction record found for assertion "${assertionName}" in context graph "${contextGraphId}"`, @@ -3280,9 +3295,9 @@ const MAX_UPLOAD_BYTES = 50 * 1024 * 1024; // 50 MB — for import-file document /** * In-memory extraction job tracking record. Populated at import-file time - * and queried by the extraction-status endpoint. Keyed by the target - * assertion URI (which is unique per agent × contextGraph × assertionName - * × subGraphName). + * and queried by the extraction-status endpoint. Records are kept in a + * bounded, TTL-pruned map keyed by the target assertion URI (which is + * unique per agent × contextGraph × assertionName × subGraphName). 
*/ interface ExtractionStatusRecord { status: 'in_progress' | 'completed' | 'skipped' | 'failed'; @@ -3296,6 +3311,92 @@ interface ExtractionStatusRecord { completedAt?: string; } +interface ImportFileExtractionPayload { + status: 'completed' | 'skipped' | 'failed'; + tripleCount: number; + pipelineUsed: string | null; + mdIntermediateHash?: string; + error?: string; +} + +const EXTRACTION_STATUS_TTL_MS = 24 * 60 * 60 * 1000; +const MAX_EXTRACTION_STATUS_RECORDS = 1000; + +function buildImportFileResponse(args: { + assertionUri: string; + fileHash: string; + detectedContentType: string; + extraction: ImportFileExtractionPayload; +}) { + return { + assertionUri: args.assertionUri, + fileHash: args.fileHash, + detectedContentType: args.detectedContentType, + extraction: { + status: args.extraction.status, + tripleCount: args.extraction.tripleCount, + pipelineUsed: args.extraction.pipelineUsed, + ...(args.extraction.mdIntermediateHash ? { mdIntermediateHash: args.extraction.mdIntermediateHash } : {}), + ...(args.extraction.error ? { error: args.extraction.error } : {}), + }, + }; +} + +function extractionStatusSortKey(record: ExtractionStatusRecord): number { + const completedAtMs = record.completedAt ? Date.parse(record.completedAt) : Number.NaN; + if (Number.isFinite(completedAtMs)) return completedAtMs; + const startedAtMs = Date.parse(record.startedAt); + return Number.isFinite(startedAtMs) ? 
startedAtMs : 0; +} + +function pruneExtractionStatusRecords(extractionStatus: Map, nowMs = Date.now()): void { + for (const [assertionUri, record] of extractionStatus.entries()) { + const ageRefMs = extractionStatusSortKey(record); + if (ageRefMs > 0 && nowMs - ageRefMs > EXTRACTION_STATUS_TTL_MS) { + extractionStatus.delete(assertionUri); + } + } + + if (extractionStatus.size <= MAX_EXTRACTION_STATUS_RECORDS) return; + + const oldestFirst = [...extractionStatus.entries()].sort( + ([, left], [, right]) => extractionStatusSortKey(left) - extractionStatusSortKey(right), + ); + + for (const [assertionUri, record] of oldestFirst) { + if (extractionStatus.size <= MAX_EXTRACTION_STATUS_RECORDS) break; + if (record.status !== 'in_progress') { + extractionStatus.delete(assertionUri); + } + } + + for (const [assertionUri] of oldestFirst) { + if (extractionStatus.size <= MAX_EXTRACTION_STATUS_RECORDS) break; + extractionStatus.delete(assertionUri); + } +} + +function setExtractionStatusRecord( + extractionStatus: Map, + assertionUri: string, + record: ExtractionStatusRecord, +): void { + pruneExtractionStatusRecords(extractionStatus); + extractionStatus.set(assertionUri, record); +} + +function getExtractionStatusRecord( + extractionStatus: Map, + assertionUri: string, +): ExtractionStatusRecord | undefined { + pruneExtractionStatusRecords(extractionStatus); + return extractionStatus.get(assertionUri); +} + +function unregisteredSubGraphError(contextGraphId: string, subGraphName: string): string { + return `Sub-graph "${subGraphName}" has not been registered in context graph "${contextGraphId}". 
Call createSubGraph() first.`; +} + function readBody(req: IncomingMessage, maxBytes = MAX_BODY_BYTES): Promise { return new Promise((resolve, reject) => { diff --git a/packages/cli/src/extraction/markdown-extractor.ts b/packages/cli/src/extraction/markdown-extractor.ts index f6a6bbdb6..a75aca95d 100644 --- a/packages/cli/src/extraction/markdown-extractor.ts +++ b/packages/cli/src/extraction/markdown-extractor.ts @@ -36,7 +36,11 @@ const DKG_EXTRACTED_BY = 'http://dkg.io/ontology/extractedBy'; const DKG_EXTRACTION_RULE = 'http://dkg.io/ontology/extractionRule'; const DKG_EXTRACTED_AT = 'http://dkg.io/ontology/extractedAt'; const PROV_WAS_GENERATED_BY = 'http://www.w3.org/ns/prov#wasGeneratedBy'; +const XSD_BOOLEAN = 'http://www.w3.org/2001/XMLSchema#boolean'; +const XSD_DATE = 'http://www.w3.org/2001/XMLSchema#date'; const XSD_DATE_TIME = 'http://www.w3.org/2001/XMLSchema#dateTime'; +const XSD_DECIMAL = 'http://www.w3.org/2001/XMLSchema#decimal'; +const XSD_INTEGER = 'http://www.w3.org/2001/XMLSchema#integer'; export interface MarkdownExtractInput { /** Markdown source text (the Phase 1 mdIntermediate). */ @@ -123,6 +127,10 @@ function shortHash(input: string): string { return createHash('sha256').update(input).digest('hex').slice(0, 12); } +function typedLiteral(lexicalForm: string, datatypeIri: string): string { + return `${JSON.stringify(lexicalForm)}^^<${datatypeIri}>`; +} + function normalizeSchemaLocalName(raw: string, kind: 'property' | 'class'): string | null { const stripped = raw.trim().replace(/\(([^)]*)\)/g, '$1'); if (stripped.length === 0) return null; @@ -187,10 +195,23 @@ function resolveFrontmatterValue(value: unknown): string | null { } if (value instanceof Date) { if (Number.isNaN(value.getTime())) return null; - return JSON.stringify(value.toISOString()); + const isUtcDateOnly = + value.getUTCHours() === 0 + && value.getUTCMinutes() === 0 + && value.getUTCSeconds() === 0 + && value.getUTCMilliseconds() === 0; + return isUtcDateOnly + ? 
typedLiteral(value.toISOString().slice(0, 10), XSD_DATE) + : typedLiteral(value.toISOString(), XSD_DATE_TIME); } - if (typeof value === 'number' || typeof value === 'boolean') { - return JSON.stringify(String(value)); + if (typeof value === 'number') { + if (!Number.isFinite(value)) return null; + return Number.isInteger(value) + ? typedLiteral(String(value), XSD_INTEGER) + : typedLiteral(String(value), XSD_DECIMAL); + } + if (typeof value === 'boolean') { + return typedLiteral(value ? 'true' : 'false', XSD_BOOLEAN); } return null; } @@ -319,12 +340,20 @@ export function extractFromMarkdown(input: MarkdownExtractInput): MarkdownExtrac // ── 5. Headings → dkg:hasSection ─────────────────────────────────── let sectionIndex = 0; + const sectionStack: Array<{ level: number; iri: string }> = []; for (const heading of extractHeadings(body)) { if (heading.level === 1) continue; // H1 is the document title, not a section sectionIndex += 1; const sectionIri = `${subject}#section-${sectionIndex}-${slugify(heading.text)}`; - triples.push({ subject, predicate: DKG_HAS_SECTION, object: sectionIri }); + while (sectionStack.length > 0 && sectionStack[sectionStack.length - 1]!.level >= heading.level) { + sectionStack.pop(); + } + const parentSection = sectionStack.length > 0 + ? 
sectionStack[sectionStack.length - 1]!.iri + : subject; + triples.push({ subject: parentSection, predicate: DKG_HAS_SECTION, object: sectionIri }); triples.push({ subject: sectionIri, predicate: SCHEMA_NAME, object: JSON.stringify(heading.text) }); + sectionStack.push({ level: heading.level, iri: sectionIri }); } // ── Provenance ───────────────────────────────────────────────────── diff --git a/packages/cli/src/file-store.ts b/packages/cli/src/file-store.ts index be577ead1..ee70b0689 100644 --- a/packages/cli/src/file-store.ts +++ b/packages/cli/src/file-store.ts @@ -14,7 +14,7 @@ */ import { createHash } from 'node:crypto'; -import { mkdir, readFile, stat, writeFile } from 'node:fs/promises'; +import { mkdir, readFile, rename, stat, unlink, writeFile } from 'node:fs/promises'; import { existsSync } from 'node:fs'; import { join, resolve } from 'node:path'; @@ -38,8 +38,8 @@ export class FileStore { /** * Persist `bytes` to the store and return the resulting entry. Idempotent: - * re-putting the same bytes returns the same hash and overwrites the - * existing file with identical content. The `contentType` metadata is + * re-putting the same bytes returns the same hash without rewriting the + * existing blob. The `contentType` metadata is * attached to the return value but not persisted to disk — callers that * need durable content-type metadata should store it separately (e.g. in * an `_meta` triple keyed by hash). 
@@ -49,7 +49,23 @@ export class FileStore { const hash = `sha256:${hex}`; const path = this.resolvePath(hex); await mkdir(join(this.rootDir, hex.slice(0, 2)), { recursive: true }); - await writeFile(path, bytes); + if (!existsSync(path)) { + const tempPath = `${path}.tmp-${process.pid}-${Date.now()}-${Math.random().toString(16).slice(2)}`; + try { + await writeFile(tempPath, bytes, { flag: 'wx' }); + try { + await rename(tempPath, path); + } catch (err: any) { + if (!existsSync(path)) { + throw err; + } + } + } finally { + if (existsSync(tempPath)) { + await unlink(tempPath).catch(() => {}); + } + } + } return { hash, path, size: bytes.length, contentType }; } diff --git a/packages/cli/test/extraction-markdown.test.ts b/packages/cli/test/extraction-markdown.test.ts index 9c1166fe9..44c94e6e2 100644 --- a/packages/cli/test/extraction-markdown.test.ts +++ b/packages/cli/test/extraction-markdown.test.ts @@ -12,6 +12,11 @@ const SCHEMA_KEYWORDS = 'http://schema.org/keywords'; const DKG_HAS_SECTION = 'http://dkg.io/ontology/hasSection'; const DKG_EXTRACTION_PROVENANCE = 'http://dkg.io/ontology/ExtractionProvenance'; const PROV_WAS_GENERATED_BY = 'http://www.w3.org/ns/prov#wasGeneratedBy'; +const XSD_BOOLEAN = 'http://www.w3.org/2001/XMLSchema#boolean'; +const XSD_DATE = 'http://www.w3.org/2001/XMLSchema#date'; +const XSD_DATE_TIME = 'http://www.w3.org/2001/XMLSchema#dateTime'; +const XSD_DECIMAL = 'http://www.w3.org/2001/XMLSchema#decimal'; +const XSD_INTEGER = 'http://www.w3.org/2001/XMLSchema#integer'; describe('extractFromMarkdown — frontmatter', () => { it('extracts rdf:type from frontmatter `type` key (schema.org convention)', () => { @@ -61,7 +66,7 @@ describe('extractFromMarkdown — frontmatter', () => { expect(triples).toContainEqual({ subject: subjectIri, predicate: 'http://schema.org/releaseDate', - object: '"2026-04-10T00:00:00.000Z"', + object: `"2026-04-10"^^<${XSD_DATE}>`, }); expect(triples).toContainEqual({ subject: subjectIri, @@ -80,14 +85,40 @@ 
describe('extractFromMarkdown — frontmatter', () => { expect(authors.map(t => t.object).sort()).toEqual(['"Alice"', '"Bob"']); }); - it('handles numeric and boolean scalars', () => { + it('emits typed literals for numeric and boolean YAML scalars', () => { const { triples } = extractFromMarkdown({ - markdown: `---\nid: doc\npageCount: 42\npublished: true\n---\n`, + markdown: `---\nid: doc\npageCount: 42\nscore: 3.14\npublished: true\n---\n`, agentDid: AGENT, now: FIXED_NOW, }); - expect(triples).toContainEqual({ subject: 'urn:dkg:md:doc', predicate: 'http://schema.org/pageCount', object: '"42"' }); - expect(triples).toContainEqual({ subject: 'urn:dkg:md:doc', predicate: 'http://schema.org/published', object: '"true"' }); + expect(triples).toContainEqual({ + subject: 'urn:dkg:md:doc', + predicate: 'http://schema.org/pageCount', + object: `"42"^^<${XSD_INTEGER}>`, + }); + expect(triples).toContainEqual({ + subject: 'urn:dkg:md:doc', + predicate: 'http://schema.org/score', + object: `"3.14"^^<${XSD_DECIMAL}>`, + }); + expect(triples).toContainEqual({ + subject: 'urn:dkg:md:doc', + predicate: 'http://schema.org/published', + object: `"true"^^<${XSD_BOOLEAN}>`, + }); + }); + + it('emits xsd:dateTime for YAML timestamps with a time component', () => { + const { triples } = extractFromMarkdown({ + markdown: `---\nid: doc\nupdatedAt: 2026-04-10T15:45:30Z\n---\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ + subject: 'urn:dkg:md:doc', + predicate: 'http://schema.org/updatedAt', + object: `"2026-04-10T15:45:30.000Z"^^<${XSD_DATE_TIME}>`, + }); }); it('ignores frontmatter with invalid YAML (fallthrough to body)', () => { @@ -211,21 +242,28 @@ describe('extractFromMarkdown — Dataview inline fields', () => { }); describe('extractFromMarkdown — headings', () => { - it('emits dkg:hasSection triples for H2+ headings but not H1', () => { + it('preserves heading nesting by attaching deeper headings to their nearest parent section', () => { const { 
triples, subjectIri } = extractFromMarkdown({ markdown: `# Title\n\n## Intro\n\n## Methods\n\n### Sub-method\n`, agentDid: AGENT, now: FIXED_NOW, }); - const sections = triples.filter(t => t.predicate === DKG_HAS_SECTION); - expect(sections).toHaveLength(3); - expect(sections.map(t => t.object)).toEqual([ + const rootSections = triples.filter(t => t.subject === subjectIri && t.predicate === DKG_HAS_SECTION); + expect(rootSections).toHaveLength(2); + expect(rootSections.map(t => t.object)).toEqual([ `${subjectIri}#section-1-intro`, `${subjectIri}#section-2-methods`, - `${subjectIri}#section-3-sub-method`, ]); - // Each section should have a schema:name - for (const section of sections) { + expect(triples).toContainEqual({ + subject: `${subjectIri}#section-2-methods`, + predicate: DKG_HAS_SECTION, + object: `${subjectIri}#section-3-sub-method`, + }); + for (const section of [...rootSections, { + subject: `${subjectIri}#section-2-methods`, + predicate: DKG_HAS_SECTION, + object: `${subjectIri}#section-3-sub-method`, + }]) { expect(triples.some(t => t.subject === section.object && t.predicate === SCHEMA_NAME)).toBe(true); } }); diff --git a/packages/cli/test/file-store.test.ts b/packages/cli/test/file-store.test.ts index 4a9c58bc4..d7b399c1a 100644 --- a/packages/cli/test/file-store.test.ts +++ b/packages/cli/test/file-store.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'; -import { mkdtemp, rm, readFile } from 'node:fs/promises'; +import { mkdtemp, readdir, rm, readFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { createHash } from 'node:crypto'; @@ -55,6 +55,18 @@ describe('FileStore.put', () => { expect(second.contentType).toBe('application/octet-stream'); }); + it('leaves only the final blob after repeated puts of the same content', async () => { + const store = new FileStore(rootDir); + const bytes = Buffer.from('atomic-write', 'utf-8'); + + const first = await 
store.put(bytes, 'text/plain'); + const second = await store.put(bytes, 'text/plain'); + + expect(second.path).toBe(first.path); + const shardEntries = await readdir(join(rootDir, first.hash.slice('sha256:'.length, 'sha256:'.length + 2))); + expect(shardEntries).toEqual([first.hash.slice('sha256:'.length + 2)]); + }); + it('handles empty input', async () => { const store = new FileStore(rootDir); const entry = await store.put(Buffer.alloc(0), 'application/octet-stream'); diff --git a/packages/cli/test/import-file-integration.test.ts b/packages/cli/test/import-file-integration.test.ts index 57bec2e68..a5c8bfb1c 100644 --- a/packages/cli/test/import-file-integration.test.ts +++ b/packages/cli/test/import-file-integration.test.ts @@ -28,6 +28,7 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'; import { mkdtemp, rm, readFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; +import { existsSync } from 'node:fs'; import { ExtractionPipelineRegistry, type ExtractionPipeline, @@ -62,6 +63,7 @@ interface CapturedAssertionWrite { interface MockAgent { peerId: string; + listSubGraphs: (contextGraphId: string) => Promise>; assertion: { create: ( contextGraphId: string, @@ -82,6 +84,7 @@ interface MockAgent { interface MockAgentOptions { createError?: Error; writeError?: Error; + registeredSubGraphs?: string[]; } function makeMockAgent(peerId = '0xMockAgentPeerId', options: MockAgentOptions = {}): MockAgent { @@ -91,6 +94,9 @@ function makeMockAgent(peerId = '0xMockAgentPeerId', options: MockAgentOptions = peerId, capturedWrites, createdAssertions, + async listSubGraphs(): Promise> { + return (options.registeredSubGraphs ?? 
[]).map(name => ({ name })); + }, assertion: { async create(contextGraphId: string, name: string, opts?: { subGraphName?: string }): Promise { if (options.createError) throw options.createError; @@ -125,6 +131,37 @@ interface ImportFileResult { }; } +class ImportFileRouteError extends Error { + readonly statusCode: number; + readonly body: ImportFileResult; + + constructor(statusCode: number, body: ImportFileResult) { + super(body.extraction.error ?? `Import-file request failed with status ${statusCode}`); + this.statusCode = statusCode; + this.body = body; + } +} + +function buildImportFileResponse(args: { + assertionUri: string; + fileHash: string; + detectedContentType: string; + extraction: ImportFileResult['extraction']; +}): ImportFileResult { + return { + assertionUri: args.assertionUri, + fileHash: args.fileHash, + detectedContentType: args.detectedContentType, + extraction: { + status: args.extraction.status, + tripleCount: args.extraction.tripleCount, + pipelineUsed: args.extraction.pipelineUsed, + ...(args.extraction.mdIntermediateHash ? { mdIntermediateHash: args.extraction.mdIntermediateHash } : {}), + ...(args.extraction.error ? { error: args.extraction.error } : {}), + }, + }; +} + function normalizeDetectedContentType(contentType: string | undefined): string { const normalized = contentType?.split(';', 1)[0]?.trim().toLowerCase(); return normalized && normalized.length > 0 ? 
normalized : 'application/octet-stream'; @@ -138,8 +175,9 @@ async function runImportFileOrchestration(params: { multipartBody: Buffer; boundary: string; assertionName: string; + onInProgress?: (assertionUri: string, record: ExtractionStatusRecord) => void | Promise; }): Promise { - const { agent, fileStore, extractionRegistry, extractionStatus, multipartBody, boundary, assertionName } = params; + const { agent, fileStore, extractionRegistry, extractionStatus, multipartBody, boundary, assertionName, onInProgress } = params; const fields = parseMultipart(multipartBody, boundary); const filePart = fields.find(f => f.name === 'file' && f.filename !== undefined)!; @@ -152,6 +190,12 @@ async function runImportFileOrchestration(params: { const ontologyRef = textField('ontologyRef'); const subGraphName = textField('subGraphName'); const detectedContentType = normalizeDetectedContentType(contentTypeOverride ?? filePart.contentType); + if (subGraphName) { + const registeredSubGraphs = await agent.listSubGraphs(contextGraphId); + if (!registeredSubGraphs.some(subGraph => subGraph.name === subGraphName)) { + throw new Error(`Sub-graph "${subGraphName}" has not been registered in context graph "${contextGraphId}". Call createSubGraph() first.`); + } + } const fileStoreEntry = await fileStore.put(filePart.content, detectedContentType); const assertionUri = contextGraphAssertionUri(contextGraphId, agent.peerId, assertionName, subGraphName); @@ -160,10 +204,56 @@ async function runImportFileOrchestration(params: { let mdIntermediate: string | null = null; let pipelineUsed: string | null = null; let mdIntermediateHash: string | undefined; + const recordInProgress = async (): Promise => { + const record: ExtractionStatusRecord = { + status: 'in_progress', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed, + tripleCount: 0, + ...(mdIntermediateHash ? 
{ mdIntermediateHash } : {}), + startedAt, + }; + extractionStatus.set(assertionUri, record); + if (onInProgress) { + await onInProgress(assertionUri, record); + } + }; + const recordFailed = (error: string, tripleCount: number, failedPipelineUsed: string | null = pipelineUsed): void => { + extractionStatus.set(assertionUri, { + status: 'failed', + fileHash: fileStoreEntry.hash, + detectedContentType, + pipelineUsed: failedPipelineUsed, + tripleCount, + ...(mdIntermediateHash ? { mdIntermediateHash } : {}), + error, + startedAt, + completedAt: new Date().toISOString(), + }); + }; + const fail = (statusCode: number, error: string, tripleCount: number, failedPipelineUsed: string | null = pipelineUsed): never => { + recordFailed(error, tripleCount, failedPipelineUsed); + throw new ImportFileRouteError(statusCode, buildImportFileResponse({ + assertionUri, + fileHash: fileStoreEntry.hash, + detectedContentType, + extraction: { + status: 'failed', + tripleCount, + pipelineUsed: failedPipelineUsed, + ...(mdIntermediateHash ? 
{ mdIntermediateHash } : {}), + error, + }, + })); + }; + + await recordInProgress(); if (detectedContentType === 'text/markdown') { mdIntermediate = filePart.content.toString('utf-8'); pipelineUsed = 'text/markdown'; + await recordInProgress(); } else { const converter = extractionRegistry.get(detectedContentType); if (converter) { @@ -177,6 +267,7 @@ async function runImportFileOrchestration(params: { pipelineUsed = detectedContentType; const mdEntry = await fileStore.put(Buffer.from(md, 'utf-8'), 'text/markdown'); mdIntermediateHash = mdEntry.hash; + await recordInProgress(); } } @@ -192,29 +283,15 @@ async function runImportFileOrchestration(params: { completedAt: new Date().toISOString(), }; extractionStatus.set(assertionUri, skippedRecord); - return { + return buildImportFileResponse({ assertionUri, fileHash: fileStoreEntry.hash, detectedContentType, extraction: { status: 'skipped', tripleCount: 0, pipelineUsed: null }, - }; + }); } // Phase 2 - const recordFailed = (error: string, tripleCount: number): void => { - extractionStatus.set(assertionUri, { - status: 'failed', - fileHash: fileStoreEntry.hash, - detectedContentType, - pipelineUsed, - tripleCount, - ...(mdIntermediateHash ? 
{ mdIntermediateHash } : {}), - error, - startedAt, - completedAt: new Date().toISOString(), - }); - }; - let triples: ReturnType['triples']; let provenance: ReturnType['provenance']; try { @@ -227,8 +304,7 @@ async function runImportFileOrchestration(params: { triples = result.triples; provenance = result.provenance; } catch (err: any) { - recordFailed(`Phase 2 extraction failed: ${err.message}`, 0); - throw err; + fail(500, `Phase 2 extraction failed: ${err.message}`, 0); } const allTriples = [...triples, ...provenance]; @@ -244,7 +320,7 @@ async function runImportFileOrchestration(params: { } } catch (err: any) { if (err.message?.includes('has not been registered') || err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { - recordFailed(err.message, triples.length); + fail(400, err.message, triples.length); } throw err; } @@ -261,7 +337,7 @@ async function runImportFileOrchestration(params: { }; extractionStatus.set(assertionUri, completedRecord); - return { + return buildImportFileResponse({ assertionUri, fileHash: fileStoreEntry.hash, detectedContentType, @@ -271,7 +347,7 @@ async function runImportFileOrchestration(params: { pipelineUsed, ...(mdIntermediateHash ? 
{ mdIntermediateHash } : {}), }, - }; + }); } // ── Multipart body builder for tests ── @@ -563,6 +639,10 @@ describe('import-file orchestration — happy paths', () => { }); it('passes subGraphName through to assertion.create and assertion.write', async () => { + agent = makeMockAgent('0xMockAgentPeerId', { + registeredSubGraphs: ['decisions'], + }); + const body = buildMultipart([ { kind: 'text', name: 'contextGraphId', value: 'cg' }, { kind: 'text', name: 'subGraphName', value: 'decisions' }, @@ -578,6 +658,29 @@ describe('import-file orchestration — happy paths', () => { expect(agent.capturedWrites[0].subGraphName).toBe('decisions'); }); + it('seeds an in-progress extraction status before the terminal record is written', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + let observedInProgress = false; + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'in-progress-doc', + async onInProgress(assertionUri, record) { + observedInProgress = true; + expect(assertionUri).toBe(contextGraphAssertionUri('cg', agent.peerId, 'in-progress-doc')); + expect(record.status).toBe('in_progress'); + expect(record.completedAt).toBeUndefined(); + expect(status.get(assertionUri)?.status).toBe('in_progress'); + }, + }); + + expect(observedInProgress).toBe(true); + expect(status.get(result.assertionUri)?.status).toBe('completed'); + }); + it('creates the assertion graph even when Phase 2 extracts zero triples', async () => { const body = buildMultipart([ { kind: 'text', name: 'contextGraphId', value: 'cg' }, @@ -598,6 +701,7 @@ describe('import-file orchestration — happy paths', () => { it('records failed extraction status when assertion.create rejects an 
unregistered sub-graph', async () => { agent = makeMockAgent('0xMockAgentPeerId', { + registeredSubGraphs: ['decisions'], createError: new Error('Sub-graph "decisions" has not been registered in context graph "cg". Call createSubGraph() first.'), }); @@ -620,6 +724,21 @@ describe('import-file orchestration — happy paths', () => { expect(record?.tripleCount).toBeGreaterThan(0); }); + it('rejects an unregistered sub-graph before storing the upload blob', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'text', name: 'subGraphName', value: 'decisions' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + await expect(runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'unregistered-preflight', + })).rejects.toThrow('has not been registered'); + + expect(existsSync(fileStore.directory)).toBe(false); + }); + it('records failed extraction status when assertion.write rejects invalid triples', async () => { agent = makeMockAgent('0xMockAgentPeerId', { writeError: new Error('Invalid triple object'), @@ -642,6 +761,37 @@ describe('import-file orchestration — happy paths', () => { expect(record?.error).toBe('Invalid triple object'); expect(record?.tripleCount).toBeGreaterThan(0); }); + + it('returns the full import-file envelope for write-stage validation failures', async () => { + agent = makeMockAgent('0xMockAgentPeerId', { + writeError: new Error('Invalid triple object'), + }); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + let caught: unknown; + try { + await runImportFileOrchestration({ + agent, fileStore, 
extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'invalid-write-envelope', + }); + } catch (err) { + caught = err; + } + + expect(caught).toBeInstanceOf(ImportFileRouteError); + const routeError = caught as ImportFileRouteError; + expect(routeError.statusCode).toBe(400); + expect(routeError.body.assertionUri).toBe(contextGraphAssertionUri('cg', agent.peerId, 'invalid-write-envelope')); + expect(routeError.body.fileHash).toMatch(/^sha256:[0-9a-f]{64}$/); + expect(routeError.body.detectedContentType).toBe('text/markdown'); + expect(routeError.body.extraction.status).toBe('failed'); + expect(routeError.body.extraction.error).toBe('Invalid triple object'); + expect(routeError.body.extraction.tripleCount).toBeGreaterThan(0); + }); }); describe('import-file orchestration — graceful degrade', () => { From b6db100bee7dcd121f6b326c40ede6b2434fd902 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Fri, 10 Apr 2026 23:37:24 +0200 Subject: [PATCH 09/12] fix: tighten import-file extraction parsing --- .../cli/src/extraction/markdown-extractor.ts | 32 ++++++++++++++++++- packages/cli/src/http/multipart.ts | 19 ++++++++++- packages/cli/test/extraction-markdown.test.ts | 25 +++++++++++++++ packages/cli/test/multipart.test.ts | 9 ++++++ 4 files changed, 83 insertions(+), 2 deletions(-) diff --git a/packages/cli/src/extraction/markdown-extractor.ts b/packages/cli/src/extraction/markdown-extractor.ts index a75aca95d..91f2a81c1 100644 --- a/packages/cli/src/extraction/markdown-extractor.ts +++ b/packages/cli/src/extraction/markdown-extractor.ts @@ -277,7 +277,37 @@ function extractHeadings(body: string): Array<{ level: number; text: string }> { /** Strip ``` fenced code blocks (and ~~~ variants) from the markdown. 
*/ function stripCodeFences(body: string): string { - return body.replace(/^(```|~~~)[\s\S]*?^\1\s*$/gm, ''); + const lines = body.split(/\r?\n/); + const keptLines: string[] = []; + let activeFence: { char: '`' | '~'; length: number } | null = null; + + for (const line of lines) { + const trimmed = line.trimEnd(); + const fenceMarker = trimmed.match(/^([`~])\1{2,}/)?.[0] ?? null; + + if (!activeFence) { + if (fenceMarker) { + activeFence = { + char: fenceMarker[0] as '`' | '~', + length: fenceMarker.length, + }; + continue; + } + keptLines.push(line); + continue; + } + + if ( + fenceMarker + && fenceMarker[0] === activeFence.char + && fenceMarker.length >= activeFence.length + && trimmed.slice(fenceMarker.length).trim().length === 0 + ) { + activeFence = null; + } + } + + return keptLines.join('\n'); } /** diff --git a/packages/cli/src/http/multipart.ts b/packages/cli/src/http/multipart.ts index f24860df4..22702510b 100644 --- a/packages/cli/src/http/multipart.ts +++ b/packages/cli/src/http/multipart.ts @@ -100,7 +100,7 @@ export function parseMultipart(body: Buffer, boundary: string): MultipartField[] // Find the next real multipart boundary. Per RFC 2046, encapsulated boundaries // must start on a new line, so raw `--${boundary}` bytes inside the payload do // not count unless they are preceded by CRLF. 
- const nextBoundary = body.indexOf(encapsulatedDelimiter, contentStart); + const nextBoundary = findNextBoundary(body, encapsulatedDelimiter, contentStart); if (nextBoundary < 0) { throw new MultipartParseError('Malformed part: no closing boundary'); } @@ -131,6 +131,23 @@ export function parseMultipart(body: Buffer, boundary: string): MultipartField[] throw new MultipartParseError('Unexpected end of body'); } +function findNextBoundary(body: Buffer, encapsulatedDelimiter: Buffer, start: number): number { + let candidate = body.indexOf(encapsulatedDelimiter, start); + while (candidate >= 0) { + const boundaryEnd = candidate + encapsulatedDelimiter.length; + const nextFirstByte = body[boundaryEnd]; + const nextSecondByte = body[boundaryEnd + 1]; + const isBoundaryTerminator = + (nextFirstByte === 0x0d && nextSecondByte === 0x0a) + || (nextFirstByte === 0x2d && nextSecondByte === 0x2d); + if (isBoundaryTerminator) { + return candidate; + } + candidate = body.indexOf(encapsulatedDelimiter, candidate + 1); + } + return -1; +} + /** * Parse a raw header block (CRLF-delimited) into a lower-cased key → value map. 
* Multi-line folded headers are not supported (RFC 7578 §5.3 says field names diff --git a/packages/cli/test/extraction-markdown.test.ts b/packages/cli/test/extraction-markdown.test.ts index 44c94e6e2..53c4f6315 100644 --- a/packages/cli/test/extraction-markdown.test.ts +++ b/packages/cli/test/extraction-markdown.test.ts @@ -173,6 +173,31 @@ describe('extractFromMarkdown — wikilinks', () => { const mentions = triples.filter(t => t.predicate === SCHEMA_MENTIONS).map(t => t.object); expect(mentions).toEqual(['urn:dkg:md:visible-target']); }); + + it('ignores variable-length info-string fences across structural extraction passes', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `\`\`\`\`md\n# Hidden Title\n[[Hidden Target]]\n#hidden\nfield:: hidden\n\`\`\`\`\n\n# Visible Title\n\n[[Visible Target]] #visible\nfield:: shown\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri).toBe('urn:dkg:md:visible-title'); + expect(triples.filter(t => t.predicate === SCHEMA_MENTIONS).map(t => t.object)).toEqual([ + 'urn:dkg:md:visible-target', + ]); + expect(triples.filter(t => t.predicate === SCHEMA_KEYWORDS).map(t => t.object)).toEqual([ + '"visible"', + ]); + expect(triples).toContainEqual({ + subject: subjectIri, + predicate: 'http://schema.org/field', + object: '"shown"', + }); + expect(triples).not.toContainEqual({ + subject: subjectIri, + predicate: 'http://schema.org/field', + object: '"hidden"', + }); + }); }); describe('extractFromMarkdown — hashtags', () => { diff --git a/packages/cli/test/multipart.test.ts b/packages/cli/test/multipart.test.ts index 5638fd408..e2a87f3f4 100644 --- a/packages/cli/test/multipart.test.ts +++ b/packages/cli/test/multipart.test.ts @@ -126,6 +126,15 @@ describe('parseMultipart — file fields', () => { expect(fields[0].content.equals(payload)).toBe(true); }); + it('does not treat CRLF-prefixed boundary-like payload bytes as a real boundary unless followed by CRLF or --', () => { + const payload = 
Buffer.from(`prefix${CRLF}--${BOUNDARY}junk${CRLF}suffix`, 'utf-8'); + const body = buildBody(filePart('file', 'embedded-delimiter.bin', 'application/octet-stream', payload)); + + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(1); + expect(fields[0].content.equals(payload)).toBe(true); + }); + it('extracts mixed text and file parts in a single body', () => { const fileContent = Buffer.from('file body', 'utf-8'); const body = buildBody( From 7d50a986cbd477076386f81f14b77a8d7e4939a0 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Sat, 11 Apr 2026 00:07:07 +0200 Subject: [PATCH 10/12] fix: close import-file review gaps --- packages/cli/src/daemon.ts | 83 +++---------------- packages/cli/src/extraction-status.ts | 63 ++++++++++++++ .../cli/src/extraction/markdown-extractor.ts | 30 +++---- packages/cli/test/extraction-markdown.test.ts | 38 +++++++++ packages/cli/test/extraction-status.test.ts | 53 ++++++++++++ .../cli/test/import-file-integration.test.ts | 78 ++++++++++++++--- 6 files changed, 247 insertions(+), 98 deletions(-) create mode 100644 packages/cli/src/extraction-status.ts create mode 100644 packages/cli/test/extraction-status.test.ts diff --git a/packages/cli/src/daemon.ts b/packages/cli/src/daemon.ts index fc2862202..153d7b25f 100644 --- a/packages/cli/src/daemon.ts +++ b/packages/cli/src/daemon.ts @@ -55,6 +55,7 @@ import { startPublisherRuntimeIfEnabled, type PublisherRuntime } from './publish import { loadTokens, httpAuthGuard, extractBearerToken } from './auth.js'; import { ExtractionPipelineRegistry } from '@origintrail-official/dkg-core'; import { MarkItDownConverter, isMarkItDownAvailable, extractFromMarkdown } from './extraction/index.js'; +import { type ExtractionStatusRecord, getExtractionStatusRecord, setExtractionStatusRecord } from './extraction-status.js'; import { FileStore } from './file-store.js'; import { parseBoundary, parseMultipart, MultipartParseError } from './http/multipart.js'; import { 
handleCapture, EpcisValidationError, handleEventsQuery, EpcisQueryError, type Publisher as EpcisPublisher } from '@origintrail-official/dkg-epcis'; @@ -2448,12 +2449,18 @@ async function handleRequest( subGraphName ? { subGraphName } : undefined, ); } catch (err: any) { - // create() on an existing graph is idempotent in oxigraph, but if the - // error is about the sub-graph not being registered, propagate it. - if (err.message?.includes('has not been registered')) { - return respondWithFailedExtraction(400, err.message, triples.length); + const message = err?.message ?? String(err); + if (message.includes('already exists') || message.includes('duplicate') || message.includes('conflict')) { + // create() is idempotent when the graph already exists. + } else if ( + message.includes('has not been registered') + || message.includes('Invalid') + || message.includes('Unsafe') + ) { + return respondWithFailedExtraction(400, message, triples.length); + } else { + return respondWithFailedExtraction(500, message, triples.length); } - // Other errors from create() can be ignored if the graph already exists. } if (allTriples.length > 0) { await agent.assertion.write( @@ -3299,18 +3306,6 @@ const MAX_UPLOAD_BYTES = 50 * 1024 * 1024; // 50 MB — for import-file document * bounded, TTL-pruned map keyed by the target assertion URI (which is * unique per agent × contextGraph × assertionName × subGraphName). 
*/ -interface ExtractionStatusRecord { - status: 'in_progress' | 'completed' | 'skipped' | 'failed'; - fileHash: string; - detectedContentType: string; - pipelineUsed: string | null; - tripleCount: number; - mdIntermediateHash?: string; - error?: string; - startedAt: string; - completedAt?: string; -} - interface ImportFileExtractionPayload { status: 'completed' | 'skipped' | 'failed'; tripleCount: number; @@ -3319,9 +3314,6 @@ interface ImportFileExtractionPayload { error?: string; } -const EXTRACTION_STATUS_TTL_MS = 24 * 60 * 60 * 1000; -const MAX_EXTRACTION_STATUS_RECORDS = 1000; - function buildImportFileResponse(args: { assertionUri: string; fileHash: string; @@ -3342,57 +3334,6 @@ function buildImportFileResponse(args: { }; } -function extractionStatusSortKey(record: ExtractionStatusRecord): number { - const completedAtMs = record.completedAt ? Date.parse(record.completedAt) : Number.NaN; - if (Number.isFinite(completedAtMs)) return completedAtMs; - const startedAtMs = Date.parse(record.startedAt); - return Number.isFinite(startedAtMs) ? 
startedAtMs : 0; -} - -function pruneExtractionStatusRecords(extractionStatus: Map, nowMs = Date.now()): void { - for (const [assertionUri, record] of extractionStatus.entries()) { - const ageRefMs = extractionStatusSortKey(record); - if (ageRefMs > 0 && nowMs - ageRefMs > EXTRACTION_STATUS_TTL_MS) { - extractionStatus.delete(assertionUri); - } - } - - if (extractionStatus.size <= MAX_EXTRACTION_STATUS_RECORDS) return; - - const oldestFirst = [...extractionStatus.entries()].sort( - ([, left], [, right]) => extractionStatusSortKey(left) - extractionStatusSortKey(right), - ); - - for (const [assertionUri, record] of oldestFirst) { - if (extractionStatus.size <= MAX_EXTRACTION_STATUS_RECORDS) break; - if (record.status !== 'in_progress') { - extractionStatus.delete(assertionUri); - } - } - - for (const [assertionUri] of oldestFirst) { - if (extractionStatus.size <= MAX_EXTRACTION_STATUS_RECORDS) break; - extractionStatus.delete(assertionUri); - } -} - -function setExtractionStatusRecord( - extractionStatus: Map, - assertionUri: string, - record: ExtractionStatusRecord, -): void { - pruneExtractionStatusRecords(extractionStatus); - extractionStatus.set(assertionUri, record); -} - -function getExtractionStatusRecord( - extractionStatus: Map, - assertionUri: string, -): ExtractionStatusRecord | undefined { - pruneExtractionStatusRecords(extractionStatus); - return extractionStatus.get(assertionUri); -} - function unregisteredSubGraphError(contextGraphId: string, subGraphName: string): string { return `Sub-graph "${subGraphName}" has not been registered in context graph "${contextGraphId}". 
Call createSubGraph() first.`; } diff --git a/packages/cli/src/extraction-status.ts b/packages/cli/src/extraction-status.ts new file mode 100644 index 000000000..9f716432d --- /dev/null +++ b/packages/cli/src/extraction-status.ts @@ -0,0 +1,63 @@ +export interface ExtractionStatusRecord { + status: 'in_progress' | 'completed' | 'skipped' | 'failed'; + fileHash: string; + detectedContentType: string; + pipelineUsed: string | null; + tripleCount: number; + mdIntermediateHash?: string; + error?: string; + startedAt: string; + completedAt?: string; +} + +export const EXTRACTION_STATUS_TTL_MS = 24 * 60 * 60 * 1000; +export const MAX_EXTRACTION_STATUS_RECORDS = 1000; + +function extractionStatusSortKey(record: ExtractionStatusRecord): number { + const completedAtMs = record.completedAt ? Date.parse(record.completedAt) : Number.NaN; + if (Number.isFinite(completedAtMs)) return completedAtMs; + const startedAtMs = Date.parse(record.startedAt); + return Number.isFinite(startedAtMs) ? startedAtMs : 0; +} + +export function pruneExtractionStatusRecords( + extractionStatus: Map, + nowMs = Date.now(), +): void { + for (const [assertionUri, record] of extractionStatus.entries()) { + const ageRefMs = extractionStatusSortKey(record); + if (ageRefMs > 0 && nowMs - ageRefMs > EXTRACTION_STATUS_TTL_MS) { + extractionStatus.delete(assertionUri); + } + } + + if (extractionStatus.size <= MAX_EXTRACTION_STATUS_RECORDS) return; + + const oldestFirst = [...extractionStatus.entries()].sort( + ([, left], [, right]) => extractionStatusSortKey(left) - extractionStatusSortKey(right), + ); + + for (const [assertionUri, record] of oldestFirst) { + if (extractionStatus.size <= MAX_EXTRACTION_STATUS_RECORDS) break; + if (record.status !== 'in_progress') { + extractionStatus.delete(assertionUri); + } + } +} + +export function setExtractionStatusRecord( + extractionStatus: Map, + assertionUri: string, + record: ExtractionStatusRecord, +): void { + extractionStatus.set(assertionUri, record); + 
pruneExtractionStatusRecords(extractionStatus); +} + +export function getExtractionStatusRecord( + extractionStatus: Map, + assertionUri: string, +): ExtractionStatusRecord | undefined { + pruneExtractionStatusRecords(extractionStatus); + return extractionStatus.get(assertionUri); +} diff --git a/packages/cli/src/extraction/markdown-extractor.ts b/packages/cli/src/extraction/markdown-extractor.ts index 91f2a81c1..953ed3fe7 100644 --- a/packages/cli/src/extraction/markdown-extractor.ts +++ b/packages/cli/src/extraction/markdown-extractor.ts @@ -249,16 +249,18 @@ function extractHashtags(body: string): string[] { } /** - * Extract Dataview inline fields: `key:: value` at line-start (allowing leading whitespace). + * Extract Dataview inline fields: `key:: value` anywhere in a visible line. * Returns key-value pairs with raw string values; the caller translates to triples. */ function extractDataviewFields(body: string): Array<{ key: string; value: string }> { const out: Array<{ key: string; value: string }> = []; const noFences = stripCodeFences(body); - const re = /^[\s>*-]*([a-zA-Z][\w-]*)::\s*(.+?)\s*$/gm; - let m: RegExpExecArray | null; - while ((m = re.exec(noFences)) !== null) { - out.push({ key: m[1], value: m[2] }); + for (const line of noFences.split(/\r?\n/)) { + const re = /(?:^|[^\w])([a-zA-Z][\w-]*)::\s*(.+?)(?=(?:\s+[a-zA-Z][\w-]*::)|$)/g; + let m: RegExpExecArray | null; + while ((m = re.exec(line)) !== null) { + out.push({ key: m[1], value: m[2].trim() }); + } } return out; } @@ -282,14 +284,14 @@ function stripCodeFences(body: string): string { let activeFence: { char: '`' | '~'; length: number } | null = null; for (const line of lines) { - const trimmed = line.trimEnd(); - const fenceMarker = trimmed.match(/^([`~])\1{2,}/)?.[0] ?? 
null; + const trimmedEnd = line.trimEnd(); + const fenceMatch = trimmedEnd.match(/^ {0,3}(([`~])\2{2,})(.*)$/); if (!activeFence) { - if (fenceMarker) { + if (fenceMatch) { activeFence = { - char: fenceMarker[0] as '`' | '~', - length: fenceMarker.length, + char: fenceMatch[2] as '`' | '~', + length: fenceMatch[1].length, }; continue; } @@ -298,10 +300,10 @@ function stripCodeFences(body: string): string { } if ( - fenceMarker - && fenceMarker[0] === activeFence.char - && fenceMarker.length >= activeFence.length - && trimmed.slice(fenceMarker.length).trim().length === 0 + fenceMatch + && fenceMatch[2] === activeFence.char + && fenceMatch[1].length >= activeFence.length + && fenceMatch[3].trim().length === 0 ) { activeFence = null; } diff --git a/packages/cli/test/extraction-markdown.test.ts b/packages/cli/test/extraction-markdown.test.ts index 53c4f6315..863c67792 100644 --- a/packages/cli/test/extraction-markdown.test.ts +++ b/packages/cli/test/extraction-markdown.test.ts @@ -198,6 +198,31 @@ describe('extractFromMarkdown — wikilinks', () => { object: '"hidden"', }); }); + + it('ignores fences indented by up to three spaces', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: ` \`\`\`md\n # Hidden Title\n [[Hidden Target]]\n #hidden\n field:: hidden\n \`\`\`\n\n# Visible Title\n\n[[Visible Target]] #visible\nfield:: shown\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(subjectIri).toBe('urn:dkg:md:visible-title'); + expect(triples.filter(t => t.predicate === SCHEMA_MENTIONS).map(t => t.object)).toEqual([ + 'urn:dkg:md:visible-target', + ]); + expect(triples.filter(t => t.predicate === SCHEMA_KEYWORDS).map(t => t.object)).toEqual([ + '"visible"', + ]); + expect(triples).toContainEqual({ + subject: subjectIri, + predicate: 'http://schema.org/field', + object: '"shown"', + }); + expect(triples).not.toContainEqual({ + subject: subjectIri, + predicate: 'http://schema.org/field', + object: '"hidden"', + }); + }); }); 
describe('extractFromMarkdown — hashtags', () => { @@ -245,6 +270,19 @@ describe('extractFromMarkdown — Dataview inline fields', () => { expect(triples).toContainEqual({ subject: subjectIri, predicate: 'http://schema.org/status', object: '"draft"' }); }); + it('extracts inline `key:: value` fields embedded in prose', () => { + const { triples, subjectIri } = extractFromMarkdown({ + markdown: `# Doc\n\nSentence with status:: draft\n`, + agentDid: AGENT, + now: FIXED_NOW, + }); + expect(triples).toContainEqual({ + subject: subjectIri, + predicate: 'http://schema.org/status', + object: '"draft"', + }); + }); + it('preserves IRI values as IRIs (not literals)', () => { const { triples, subjectIri } = extractFromMarkdown({ markdown: `# Doc\n\nhomepage:: https://example.org/home\n`, diff --git a/packages/cli/test/extraction-status.test.ts b/packages/cli/test/extraction-status.test.ts new file mode 100644 index 000000000..de274b674 --- /dev/null +++ b/packages/cli/test/extraction-status.test.ts @@ -0,0 +1,53 @@ +import { describe, expect, it } from 'vitest'; +import { + MAX_EXTRACTION_STATUS_RECORDS, + pruneExtractionStatusRecords, + type ExtractionStatusRecord, +} from '../src/extraction-status.js'; + +const BASE_MS = Date.UTC(2026, 3, 10, 12, 0, 0); + +function makeRecord(status: ExtractionStatusRecord['status'], index: number): ExtractionStatusRecord { + const startedAt = new Date(BASE_MS + (index * 1000)).toISOString(); + return { + status, + fileHash: `sha256:${index.toString(16).padStart(64, '0')}`, + detectedContentType: 'text/markdown', + pipelineUsed: status === 'skipped' ? null : 'text/markdown', + tripleCount: 0, + startedAt, + ...(status === 'in_progress' ? 
{} : { completedAt: new Date(BASE_MS + (index * 1000) + 500).toISOString() }), + }; +} + +describe('extraction-status pruning', () => { + it('does not evict in-progress records when only active jobs remain above capacity', () => { + const status = new Map(); + for (let i = 0; i < MAX_EXTRACTION_STATUS_RECORDS + 1; i += 1) { + status.set(`assertion-${i}`, makeRecord('in_progress', i)); + } + + pruneExtractionStatusRecords(status, BASE_MS + ((MAX_EXTRACTION_STATUS_RECORDS + 2) * 1000)); + + expect(status.size).toBe(MAX_EXTRACTION_STATUS_RECORDS + 1); + expect(status.has('assertion-0')).toBe(true); + expect([...status.values()].every(record => record.status === 'in_progress')).toBe(true); + }); + + it('evicts completed records before active ones when capacity is exceeded', () => { + const status = new Map(); + for (let i = 0; i < 5; i += 1) { + status.set(`active-${i}`, makeRecord('in_progress', i)); + } + for (let i = 0; i < MAX_EXTRACTION_STATUS_RECORDS; i += 1) { + status.set(`completed-${i}`, makeRecord('completed', i + 10)); + } + + pruneExtractionStatusRecords(status, BASE_MS + ((MAX_EXTRACTION_STATUS_RECORDS + 20) * 1000)); + + expect(status.size).toBe(MAX_EXTRACTION_STATUS_RECORDS); + for (let i = 0; i < 5; i += 1) { + expect(status.has(`active-${i}`)).toBe(true); + } + }); +}); diff --git a/packages/cli/test/import-file-integration.test.ts b/packages/cli/test/import-file-integration.test.ts index a5c8bfb1c..938f68174 100644 --- a/packages/cli/test/import-file-integration.test.ts +++ b/packages/cli/test/import-file-integration.test.ts @@ -37,23 +37,12 @@ import { contextGraphAssertionUri, } from '@origintrail-official/dkg-core'; import { FileStore } from '../src/file-store.js'; +import type { ExtractionStatusRecord } from '../src/extraction-status.js'; import { parseBoundary, parseMultipart } from '../src/http/multipart.js'; import { extractFromMarkdown } from '../src/extraction/markdown-extractor.js'; // ── Test fixture types (mirroring the 
ExtractionStatusRecord in daemon.ts) ── -interface ExtractionStatusRecord { - status: 'in_progress' | 'completed' | 'skipped' | 'failed'; - fileHash: string; - detectedContentType: string; - pipelineUsed: string | null; - tripleCount: number; - mdIntermediateHash?: string; - error?: string; - startedAt: string; - completedAt?: string; -} - interface CapturedAssertionWrite { contextGraphId: string; name: string; @@ -309,7 +298,17 @@ async function runImportFileOrchestration(params: { const allTriples = [...triples, ...provenance]; try { - await agent.assertion.create(contextGraphId, assertionName, subGraphName ? { subGraphName } : undefined); + try { + await agent.assertion.create(contextGraphId, assertionName, subGraphName ? { subGraphName } : undefined); + } catch (err: any) { + const message = err?.message ?? String(err); + if (!(message.includes('already exists') || message.includes('duplicate') || message.includes('conflict'))) { + if (message.includes('has not been registered') || message.includes('Invalid') || message.includes('Unsafe')) { + fail(400, message, triples.length); + } + fail(500, message, triples.length); + } + } if (allTriples.length > 0) { await agent.assertion.write( contextGraphId, @@ -724,6 +723,59 @@ describe('import-file orchestration — happy paths', () => { expect(record?.tripleCount).toBeGreaterThan(0); }); + it('surfaces non-idempotent assertion.create failures as failed imports', async () => { + agent = makeMockAgent('0xMockAgentPeerId', { + createError: new Error('Storage backend unavailable'), + }); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'empty.md', contentType: 'text/markdown', content: Buffer.from('', 'utf-8') }, + ]); + + let caught: unknown; + try { + await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 
'create-runtime-failure', + }); + } catch (err) { + caught = err; + } + + expect(caught).toBeInstanceOf(ImportFileRouteError); + const routeError = caught as ImportFileRouteError; + expect(routeError.statusCode).toBe(500); + expect(routeError.body.extraction.status).toBe('failed'); + expect(routeError.body.extraction.error).toBe('Storage backend unavailable'); + + const assertionUri = contextGraphAssertionUri('cg', agent.peerId, 'create-runtime-failure'); + const record = status.get(assertionUri); + expect(record?.status).toBe('failed'); + expect(record?.error).toBe('Storage backend unavailable'); + expect(record?.tripleCount).toBe(0); + }); + + it('treats explicit already-exists assertion.create failures as idempotent', async () => { + agent = makeMockAgent('0xMockAgentPeerId', { + createError: new Error('Assertion graph already exists'), + }); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'create-idempotent', + }); + + expect(result.extraction.status).toBe('completed'); + expect(agent.capturedWrites).toHaveLength(1); + expect(status.get(result.assertionUri)?.status).toBe('completed'); + }); + it('rejects an unregistered sub-graph before storing the upload blob', async () => { const body = buildMultipart([ { kind: 'text', name: 'contextGraphId', value: 'cg' }, From 307f20f4bc0bc43262d66cf568ddf4442e420a89 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Sat, 11 Apr 2026 00:28:32 +0200 Subject: [PATCH 11/12] fix: harden import-file error handling and multipart Content-Type parsing Two PR #113 review findings: 1. 
parseBoundary() crashed on duplicated Content-Type headers because the parameter type didn't admit string[] and .toLowerCase() blew up at runtime. Widen the signature to string | string[] | undefined and reject array values as ambiguous so the route handler returns a clean 400 instead of 500-ing inside the parser. 2. The outer write-stage catch in the import-file handler only matched has-not-been-registered / Invalid / Unsafe errors and rethrew everything else without updating the extraction status record. That left /extraction-status stuck reporting in_progress on unexpected agent.write() failures even after the import had failed. Record the failure via recordFailedExtraction(...) before rethrowing so the status reflects reality. Mirror the same fix in the import-file orchestration test helper, which had the same shape. Adds two tests: - parseBoundary returns null for array values - import-file orchestration records failed status on unexpected write-stage errors (e.g. "Connection refused") Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/cli/src/daemon.ts | 13 +++++--- packages/cli/src/http/multipart.ts | 13 ++++++-- .../cli/test/import-file-integration.test.ts | 32 +++++++++++++++++++ packages/cli/test/multipart.test.ts | 8 +++++ 4 files changed, 59 insertions(+), 7 deletions(-) diff --git a/packages/cli/src/daemon.ts b/packages/cli/src/daemon.ts index 153d7b25f..eebe954ea 100644 --- a/packages/cli/src/daemon.ts +++ b/packages/cli/src/daemon.ts @@ -2471,12 +2471,17 @@ async function handleRequest( ); } } catch (err: any) { - if (err.message?.includes('has not been registered')) { - return respondWithFailedExtraction(400, err.message, triples.length); + const message = err?.message ?? 
String(err); + if (message.includes('has not been registered')) { + return respondWithFailedExtraction(400, message, triples.length); } - if (err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { - return respondWithFailedExtraction(400, err.message, triples.length); + if (message.includes('Invalid') || message.includes('Unsafe')) { + return respondWithFailedExtraction(400, message, triples.length); } + // Unexpected write-stage failure: record the failure on the extraction + // status map before rethrowing so /extraction-status doesn't stay stuck + // at in_progress when the top-level 500 handler takes over. + recordFailedExtraction(message, triples.length); throw err; } diff --git a/packages/cli/src/http/multipart.ts b/packages/cli/src/http/multipart.ts index 22702510b..523d44495 100644 --- a/packages/cli/src/http/multipart.ts +++ b/packages/cli/src/http/multipart.ts @@ -38,10 +38,17 @@ export interface MultipartField { /** * Extract the boundary token from a `Content-Type: multipart/form-data; boundary=...` header. - * Returns null if the header is missing, malformed, or not multipart/form-data. + * Returns null if the header is missing, malformed, ambiguous, or not multipart/form-data. + * + * Accepts the full `IncomingHttpHeaders['content-type']` shape (`string | string[] | undefined`) + * so that callers can pass `req.headers['content-type']` directly. Array values — which Node + * can deliver when a client sends duplicated Content-Type headers — are rejected as ambiguous + * rather than coerced, so the route handler returns a clean 400 instead of crashing inside + * `.toLowerCase()`. 
*/ -export function parseBoundary(contentTypeHeader: string | undefined): string | null { - if (!contentTypeHeader) return null; +export function parseBoundary(contentTypeHeader: string | string[] | undefined): string | null { + if (contentTypeHeader === undefined) return null; + if (Array.isArray(contentTypeHeader)) return null; const lower = contentTypeHeader.toLowerCase(); if (!lower.startsWith('multipart/form-data')) return null; const match = contentTypeHeader.match(/boundary\s*=\s*(?:"([^"]+)"|([^\s;]+))/i); diff --git a/packages/cli/test/import-file-integration.test.ts b/packages/cli/test/import-file-integration.test.ts index 938f68174..ff0a84448 100644 --- a/packages/cli/test/import-file-integration.test.ts +++ b/packages/cli/test/import-file-integration.test.ts @@ -321,6 +321,10 @@ async function runImportFileOrchestration(params: { if (err.message?.includes('has not been registered') || err.message?.includes('Invalid') || err.message?.includes('Unsafe')) { fail(400, err.message, triples.length); } + // Unexpected write-stage failure: mirror the daemon by recording the + // failure before rethrowing, so the extraction status map doesn't stay + // stuck at in_progress. + recordFailed(err?.message ?? String(err), triples.length); throw err; } @@ -814,6 +818,34 @@ describe('import-file orchestration — happy paths', () => { expect(record?.tripleCount).toBeGreaterThan(0); }); + it('records failed extraction status when assertion.write throws an unexpected error', async () => { + // Errors that don't match the known has-not-been-registered / Invalid / Unsafe + // patterns must still update the extraction status record from in_progress to + // failed before the orchestration rethrows. Otherwise /extraction-status would + // stay stuck reporting in_progress even though the import already failed. 
+ agent = makeMockAgent('0xMockAgentPeerId', { + writeError: new Error('Connection refused'), + }); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'file', name: 'file', filename: 'doc.md', contentType: 'text/markdown', content: Buffer.from('# Title\n\nBody.\n', 'utf-8') }, + ]); + + await expect(runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'unexpected-write', + })).rejects.toThrow('Connection refused'); + + const assertionUri = contextGraphAssertionUri('cg', agent.peerId, 'unexpected-write'); + const record = status.get(assertionUri); + expect(record).toBeDefined(); + expect(record?.status).toBe('failed'); + expect(record?.error).toBe('Connection refused'); + expect(record?.tripleCount).toBeGreaterThan(0); + expect(record?.completedAt).toBeDefined(); + }); + it('returns the full import-file envelope for write-stage validation failures', async () => { agent = makeMockAgent('0xMockAgentPeerId', { writeError: new Error('Invalid triple object'), diff --git a/packages/cli/test/multipart.test.ts b/packages/cli/test/multipart.test.ts index e2a87f3f4..70c4b04d5 100644 --- a/packages/cli/test/multipart.test.ts +++ b/packages/cli/test/multipart.test.ts @@ -57,6 +57,14 @@ describe('parseBoundary', () => { it('returns null when boundary parameter is missing', () => { expect(parseBoundary('multipart/form-data')).toBeNull(); }); + + it('returns null for an array value (duplicated Content-Type headers)', () => { + // Node may deliver IncomingHttpHeaders['content-type'] as string[] when + // the client sends duplicated headers. Reject as ambiguous so the route + // handler returns a clean 400 instead of crashing in toLowerCase(). 
+ expect(parseBoundary(['multipart/form-data; boundary=abc', 'application/json'])).toBeNull(); + expect(parseBoundary([] as unknown as string[])).toBeNull(); + }); }); describe('parseMultipart — text fields', () => { From ef383ab45cca88230185f50ed69ffe6842485d53 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Sat, 11 Apr 2026 00:40:32 +0200 Subject: [PATCH 12/12] fix: tighten multipart parsing, contentType override, and skill discovery Three PR #113 round 2 review findings: 1. multipart.ts Content-Disposition parser: the `name=` parameter regex could match the `name=` substring inside `filename=`, so a part with only `Content-Disposition: form-data; filename="x"` would be silently accepted as a field named `"x"` instead of being rejected as malformed. Anchor both `name=` and `filename=` matches to a real `;` parameter boundary (or start of string). 2. import-file route: an empty `contentType=` form field was treated as a real override because `??` only catches null/undefined, not empty string. A client sending `contentType=` would downgrade a valid text/markdown / application/pdf upload to application/octet-stream and trigger graceful-degrade. Treat blank/whitespace overrides as absent in both the daemon route handler and the test orchestration helper. 3. /.well-known/skill.md discovery: text/markdown is hard-coded as a supported native ingestion type by the import-file route (skip Phase 1, run Phase 2 markdown extractor directly), but extractionRegistry.availableContentTypes() only listed registered Phase 1 converters. Skill clients reading the discovery surface would think Markdown ingestion was unavailable when it was actually always supported. Surface text/markdown alongside the registered converters in both the skill.md endpoint and the startup log. 
Adds 5 tests: - parseMultipart rejects parts with only filename= and no name= - parseMultipart parses filename-first ordering correctly - parseMultipart parses name= and filename= independently - import-file orchestration treats blank contentType= as absent - import-file orchestration treats whitespace-only contentType= as absent Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/cli/src/daemon.ts | 28 ++++++++++-- packages/cli/src/http/multipart.ts | 8 +++- .../cli/test/import-file-integration.test.ts | 44 ++++++++++++++++++- packages/cli/test/multipart.test.ts | 41 +++++++++++++++++ 4 files changed, 114 insertions(+), 7 deletions(-) diff --git a/packages/cli/src/daemon.ts b/packages/cli/src/daemon.ts index eebe954ea..010c10f26 100644 --- a/packages/cli/src/daemon.ts +++ b/packages/cli/src/daemon.ts @@ -818,8 +818,15 @@ async function runDaemonInner(foreground: boolean, config: Awaited 0 + ? contentTypeOverrideRaw + : undefined; const ontologyRef = textField('ontologyRef'); const subGraphName = textField('subGraphName'); diff --git a/packages/cli/src/http/multipart.ts b/packages/cli/src/http/multipart.ts index 523d44495..104415419 100644 --- a/packages/cli/src/http/multipart.ts +++ b/packages/cli/src/http/multipart.ts @@ -120,11 +120,15 @@ export function parseMultipart(body: Buffer, boundary: string): MultipartField[] if (!disposition) { throw new MultipartParseError('Malformed part: missing Content-Disposition'); } - const nameMatch = disposition.match(/name\s*=\s*(?:"([^"]*)"|([^;]+))/i); + // Anchor parameter matches to a real `;` boundary (or start of string) so + // `name=` doesn't accidentally match the `name=` substring inside `filename=`, + // and vice versa. Without this, a part with only `filename="x"` (no `name`) + // would be silently mis-routed as `name="x"`. 
+ const nameMatch = disposition.match(/(?:^|;)\s*name\s*=\s*(?:"([^"]*)"|([^;]+))/i); if (!nameMatch) { throw new MultipartParseError('Malformed part: Content-Disposition without name'); } - const filenameMatch = disposition.match(/filename\s*=\s*(?:"([^"]*)"|([^;]+))/i); + const filenameMatch = disposition.match(/(?:^|;)\s*filename\s*=\s*(?:"([^"]*)"|([^;]+))/i); fields.push({ name: (nameMatch[1] ?? nameMatch[2] ?? '').trim(), filename: filenameMatch ? (filenameMatch[1] ?? filenameMatch[2] ?? '').trim() : undefined, diff --git a/packages/cli/test/import-file-integration.test.ts b/packages/cli/test/import-file-integration.test.ts index ff0a84448..1b6f038c7 100644 --- a/packages/cli/test/import-file-integration.test.ts +++ b/packages/cli/test/import-file-integration.test.ts @@ -175,7 +175,12 @@ async function runImportFileOrchestration(params: { return f ? f.content.toString('utf-8') : undefined; }; const contextGraphId = textField('contextGraphId')!; - const contentTypeOverride = textField('contentType'); + const contentTypeOverrideRaw = textField('contentType'); + // Mirror the daemon: blank `contentType=` is treated as absent. + const contentTypeOverride = + contentTypeOverrideRaw && contentTypeOverrideRaw.trim().length > 0 + ? contentTypeOverrideRaw + : undefined; const ontologyRef = textField('ontologyRef'); const subGraphName = textField('subGraphName'); const detectedContentType = normalizeDetectedContentType(contentTypeOverride ?? filePart.contentType); @@ -818,6 +823,43 @@ describe('import-file orchestration — happy paths', () => { expect(record?.tripleCount).toBeGreaterThan(0); }); + it('treats a blank contentType form field as absent and falls back to the file part Content-Type', async () => { + // A client that submits `contentType=` (empty string) must NOT downgrade + // a real text/markdown upload to application/octet-stream — the empty + // override should be ignored and the file part's own Content-Type used. 
+ const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'text', name: 'contentType', value: '' }, + { kind: 'file', name: 'file', filename: 'note.md', contentType: 'text/markdown', content: Buffer.from('# Heading\n\nBody text.\n', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'blank-override', + }); + + expect(result.detectedContentType).toBe('text/markdown'); + expect(result.extraction.status).toBe('completed'); + expect(result.extraction.pipelineUsed).toBe('text/markdown'); + expect(result.extraction.tripleCount).toBeGreaterThan(0); + }); + + it('treats a whitespace-only contentType form field as absent', async () => { + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'cg' }, + { kind: 'text', name: 'contentType', value: ' ' }, + { kind: 'file', name: 'file', filename: 'note.md', contentType: 'text/markdown', content: Buffer.from('# Heading\n', 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'whitespace-override', + }); + + expect(result.detectedContentType).toBe('text/markdown'); + expect(result.extraction.status).toBe('completed'); + }); + it('records failed extraction status when assertion.write throws an unexpected error', async () => { // Errors that don't match the known has-not-been-registered / Invalid / Unsafe // patterns must still update the extraction status record from in_progress to diff --git a/packages/cli/test/multipart.test.ts b/packages/cli/test/multipart.test.ts index 70c4b04d5..29202c284 100644 --- a/packages/cli/test/multipart.test.ts +++ b/packages/cli/test/multipart.test.ts @@ -67,6 +67,47 @@ describe('parseBoundary', () => { }); }); 
+describe('parseMultipart — Content-Disposition parameter parsing', () => { + it('rejects a part that has only filename= and no name=', () => { + // The `name=` parameter regex must be anchored to a real `;` boundary so + // it does not silently match the `name=` substring inside `filename=`. + // A part with only `filename="x"` should be rejected, not mis-routed as + // a field named "x". + const malformed = Buffer.concat([ + Buffer.from(`--${BOUNDARY}${CRLF}`), + Buffer.from(`Content-Disposition: form-data; filename="lonely.txt"${CRLF}${CRLF}contents`), + Buffer.from(CRLF), + Buffer.from(`--${BOUNDARY}--${CRLF}`), + ]); + expect(() => parseMultipart(malformed, BOUNDARY)).toThrow(MultipartParseError); + expect(() => parseMultipart(malformed, BOUNDARY)).toThrow(/without name/); + }); + + it('parses name= and filename= independently when both are present', () => { + const body = buildBody(filePart('attachment', 'doc.pdf', 'application/pdf', Buffer.from('PDF', 'utf-8'))); + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(1); + expect(fields[0].name).toBe('attachment'); + expect(fields[0].filename).toBe('doc.pdf'); + }); + + it('parses name= when filename= comes first in the Content-Disposition', () => { + // Order-independence: filename before name should still work because the + // anchored regex looks for `;\s*name=` (or start-of-string) regardless of + // position. + const body = Buffer.concat([ + Buffer.from(`--${BOUNDARY}${CRLF}`), + Buffer.from(`Content-Disposition: form-data; filename="doc.pdf"; name="attachment"${CRLF}${CRLF}body`), + Buffer.from(CRLF), + Buffer.from(`--${BOUNDARY}--${CRLF}`), + ]); + const fields = parseMultipart(body, BOUNDARY); + expect(fields).toHaveLength(1); + expect(fields[0].name).toBe('attachment'); + expect(fields[0].filename).toBe('doc.pdf'); + }); +}); + describe('parseMultipart — text fields', () => { it('extracts a single text field', () => { const body = buildBody(textPart('greeting', 'hello'));