From 737c03c56e46ee84eafdd2830c14b3049aff9ce4 Mon Sep 17 00:00:00 2001
From: Stanislau Niadbailau
Date: Sun, 12 Apr 2026 12:05:15 -0400
Subject: [PATCH] Parse plain-text catalog dependency cells into typed relation edges
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The parseLabeledRelations function only matched **bold:** markdown
format, but catalog dependency cells use plain text like
"Builds on: A.1, A.2. Constrains: B.3." This caused 148 of 237 patterns
to lose typed semantic relations (builds_on, constrains,
coordinates_with, etc.) from catalog metadata.

Add a plain-text fallback parser that activates when the bold regex
finds nothing. Uses lookahead on known RELATION_LABELS keys to split
multi-label cells correctly. Extract pushRelationEdges helper to
deduplicate the edge-creation logic.

Impact: builds_on 353→1132, coordinates_with 167→583,
prerequisite_for 37→68. Total +1396 typed relation edges.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 src/runtime/compiler.ts | 57 +++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/src/runtime/compiler.ts b/src/runtime/compiler.ts
index 0c810c9..1974f17 100644
--- a/src/runtime/compiler.ts
+++ b/src/runtime/compiler.ts
@@ -1191,26 +1191,55 @@ function parseLabeledRelations(
   sourceCitation: string,
 ): RelationEdge[] {
   const relationEdges: RelationEdge[] = [];
-  const relationRegex =
+
+  // Bold markdown format: **Builds on:** A.1, A.2.
+  const boldRegex =
     /\*\*([^:*]+):\*\*\s*([\s\S]*?)(?=(?:\n\s*[*-]\s*\*\*[^:*]+:\*\*|\s+\*\*[^:*]+:\*\*|$))/g;
-  for (const match of text.matchAll(relationRegex)) {
-    const label = normalizeForLookup(match[1] ?? '');
-    const relation = RELATION_LABELS[label];
-    if (!relation) {
-      continue;
-    }
-    for (const target of extractIds(match[2] ?? '')) {
-      relationEdges.push({
-        from: sourceId,
-        relation,
-        to: target,
-        source: sourceCitation,
-      });
+  for (const match of text.matchAll(boldRegex)) {
+    pushRelationEdges(relationEdges, sourceId, match[1] ?? '', match[2] ?? '', sourceCitation);
+  }
+
+  // Plain-text catalog format: Builds on: A.1, A.2. Constrains: B.3.
+  // Uses lookahead to split on the next known label or end-of-string.
+  if (relationEdges.length === 0) {
+    const escapedLabels = Object.keys(RELATION_LABELS).map(
+      (label) => label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
+    );
+    const labelAlternation = escapedLabels.join('|');
+    const plainRegex = new RegExp(
+      `(${labelAlternation}):\\s*(.*?)(?=(?:${labelAlternation}):|$)`,
+      'gis',
+    );
+    for (const match of text.matchAll(plainRegex)) {
+      pushRelationEdges(relationEdges, sourceId, match[1] ?? '', match[2] ?? '', sourceCitation);
     }
   }
+
   return relationEdges;
 }
 
+function pushRelationEdges(
+  edges: RelationEdge[],
+  sourceId: string,
+  rawLabel: string,
+  rawTargets: string,
+  sourceCitation: string,
+): void {
+  const label = normalizeForLookup(rawLabel);
+  const relation = RELATION_LABELS[label];
+  if (!relation) {
+    return;
+  }
+  for (const target of extractIds(rawTargets)) {
+    edges.push({
+      from: sourceId,
+      relation,
+      to: target,
+      source: sourceCitation,
+    });
+  }
+}
+
 function parseKeywords(cell: string): string[] {
   const match = cell.match(/Keywords:\s*(.+?)(?:Queries:|$)/i);
   if (!match) {