diff --git a/packages/core/src/analysis/analyze.ts b/packages/core/src/analysis/analyze.ts index b9bade9..eda4b54 100644 --- a/packages/core/src/analysis/analyze.ts +++ b/packages/core/src/analysis/analyze.ts @@ -288,8 +288,15 @@ export async function analyzeSite(url: string, options: AnalyzeOptions, context? } if (options.orphans) { - const edges = crawlData.graph.getEdges(); - annotateOrphans(crawlData.graph.getNodes(), edges, { + const iterableEdges = { + *[Symbol.iterator]() { + for (const [edgeKey, weight] of crawlData.graph.edges.entries()) { + const splitIndex = edgeKey.indexOf('\x00'); + yield { source: edgeKey.slice(0, splitIndex), target: edgeKey.slice(splitIndex + 1), weight }; + } + } + }; + annotateOrphans(crawlData.graph.getNodes(), iterableEdges, { enabled: true, severityEnabled: !!options.orphanSeverity, includeSoftOrphans: !!options.includeSoftOrphans, diff --git a/packages/core/src/analysis/duplicate.ts b/packages/core/src/analysis/duplicate.ts index afd31e0..a1016ad 100644 --- a/packages/core/src/analysis/duplicate.ts +++ b/packages/core/src/analysis/duplicate.ts @@ -321,22 +321,21 @@ export class DuplicateService { } private collapseEdges(graph: Graph) { - const edges = graph.getEdges(); const updatedEdges = new Map(); - for (const edge of edges) { - const targetNode = graph.nodes.get(edge.target); - if (!targetNode) continue; + graph.forEachEdge((source, target, weight) => { + const targetNode = graph.nodes.get(target); + if (!targetNode) return; - const actualSource = edge.source; - const actualTarget = (targetNode as any).isCollapsed && (targetNode as any).collapseInto ? (targetNode as any).collapseInto : edge.target; + const actualSource = source; + const actualTarget = (targetNode as any).isCollapsed && (targetNode as any).collapseInto ? (targetNode as any).collapseInto : target; - if (actualSource === actualTarget) continue; + if (actualSource === actualTarget) return; const edgeKey = Graph.getEdgeKey(actualSource, actualTarget); const existingWeight = updatedEdges.get(edgeKey) || 0; - updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); - } + updatedEdges.set(edgeKey, Math.max(existingWeight, weight)); + }); graph.edges = updatedEdges; diff --git a/packages/core/src/analysis/orphan.ts b/packages/core/src/analysis/orphan.ts index a296660..e2391e6 100644 --- a/packages/core/src/analysis/orphan.ts +++ b/packages/core/src/analysis/orphan.ts @@ -111,7 +111,7 @@ function consolidateInboundByCanonical(nodes: ExtendedGraphNode[]): Map | GraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] { if (!options.enabled) { return nodes.map((node) => ({ ...node, orphan: false } as AnnotatedNode)); } @@ -119,6 +119,17 @@ export function annotateOrphans(nodes: ExtendedGraphNode[], edges: GraphEdge[], const canonicalInbound = consolidateInboundByCanonical(nodes); const nodeByUrl = new Map(nodes.map((node) => [node.url, node])); + // Pre-calculate inbound sources to avoid iterating over all edges per node + const inboundEdgesByTarget = new Map(); + for (const edge of edges) { + let sources = inboundEdgesByTarget.get(edge.target); + if (!sources) { + sources = []; + inboundEdgesByTarget.set(edge.target, sources); + } + sources.push(edge.source); + } + return nodes.map((node) => { const isHomepage = node.isHomepage || (options.rootUrl ? node.url === options.rootUrl : node.depth === 0); if (isHomepage || node.robotsExcluded) { @@ -137,9 +148,9 @@ export function annotateOrphans(nodes: ExtendedGraphNode[], edges: GraphEdge[], } if (!orphanType && options.includeSoftOrphans && inbound > 0) { - const inboundSources = edges - .filter((edge) => edge.target === node.url) - .map((edge) => nodeByUrl.get(edge.source)) + const sources = inboundEdgesByTarget.get(node.url) || []; + const inboundSources = sources + .map(sourceUrl => nodeByUrl.get(sourceUrl)) .filter((source): source is GraphNode => Boolean(source)); if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) { diff --git a/packages/core/src/crawler/metricsRunner.ts b/packages/core/src/crawler/metricsRunner.ts index c7b13e4..69c24bc 100644 --- a/packages/core/src/crawler/metricsRunner.ts +++ b/packages/core/src/crawler/metricsRunner.ts @@ -136,7 +136,15 @@ export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, option includeSoftOrphans: options.includeSoftOrphans ?? true, minInbound: options.minInbound ?? 2 }; - annotatedNodes = annotateOrphans(graph.getNodes(), graph.getEdges(), orphanOptions) as any[]; + const iterableEdges = { + *[Symbol.iterator]() { + for (const [edgeKey, weight] of graph.edges.entries()) { + const splitIndex = edgeKey.indexOf('\x00'); + yield { source: edgeKey.slice(0, splitIndex), target: edgeKey.slice(splitIndex + 1), weight }; + } + } + }; + annotatedNodes = annotateOrphans(graph.getNodes(), iterableEdges, orphanOptions) as any[]; } const soft404Service = new Soft404Service(); diff --git a/packages/core/src/graph/graph.ts b/packages/core/src/graph/graph.ts index 81be83d..ee4952a 100644 --- a/packages/core/src/graph/graph.ts +++ b/packages/core/src/graph/graph.ts @@ -158,6 +158,17 @@ export class Graph { return Array.from(this.nodes.values()); } + /** + * Iterates over all edges efficiently without allocating an array of objects. + * Useful for performance-critical hot loops. + */ + forEachEdge(callback: (source: string, target: string, weight: number) => void): void { + for (const [edgeKey, weight] of this.edges.entries()) { + const splitIndex = edgeKey.indexOf('\x00'); + callback(edgeKey.slice(0, splitIndex), edgeKey.slice(splitIndex + 1), weight); + } + } + getEdges(): GraphEdge[] { return Array.from(this.edges.entries()).map(([edge, weight]) => { const { source, target } = Graph.parseEdgeKey(edge); diff --git a/packages/core/src/graph/hits.ts b/packages/core/src/graph/hits.ts index 0319ff1..ac11e4c 100644 --- a/packages/core/src/graph/hits.ts +++ b/packages/core/src/graph/hits.ts @@ -48,19 +48,18 @@ export class HITSService { const incoming: { sourceIndex: number, weight: number }[][] = new Array(N).fill(null).map(() => []); const outgoing: { targetIndex: number, weight: number }[][] = new Array(N).fill(null).map(() => []); - const allEdges = graph.getEdges(); - for (const edge of allEdges) { - if (edge.source === edge.target) continue; + graph.forEachEdge((source, target, weight) => { + if (source === target) return; - const sourceIndex = urlToIndex.get(edge.source); - const targetIndex = urlToIndex.get(edge.target); + const sourceIndex = urlToIndex.get(source); + const targetIndex = urlToIndex.get(target); if (sourceIndex !== undefined && targetIndex !== undefined) { - const weight = edge.weight || 1.0; - incoming[targetIndex].push({ sourceIndex, weight }); - outgoing[sourceIndex].push({ targetIndex, weight }); + const edgeWeight = weight || 1.0; + incoming[targetIndex].push({ sourceIndex, weight: edgeWeight }); + outgoing[sourceIndex].push({ targetIndex, weight: edgeWeight }); } - } + }); // Initialize Scores const authScores = new Float64Array(N).fill(1.0); diff --git a/packages/core/src/graph/metrics.ts b/packages/core/src/graph/metrics.ts index 608b163..3250145 100644 --- a/packages/core/src/graph/metrics.ts +++ b/packages/core/src/graph/metrics.ts @@ -23,24 +23,23 @@ export interface Metrics { export function calculateMetrics(graph: Graph, _maxDepth: number): Metrics { const nodes = graph.getNodes(); - const edges = graph.getEdges(); const totalPages = nodes.length; - const totalEdges = edges.length; + const totalEdges = graph.edges.size; // Identify broken nodes const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url)); // Pre-compute outgoing edges per node for faster lookup const outgoingEdges = new Map(); - for (const edge of edges) { - let targets = outgoingEdges.get(edge.source); + graph.forEachEdge((source, target) => { + let targets = outgoingEdges.get(source); if (!targets) { targets = []; - outgoingEdges.set(edge.source, targets); + outgoingEdges.set(source, targets); } - targets.push(edge.target); - } + targets.push(target); + }); // Populate brokenLinks per node for (const node of nodes) { diff --git a/packages/core/src/graph/pagerank.ts b/packages/core/src/graph/pagerank.ts index c84a9a8..2bb957d 100644 --- a/packages/core/src/graph/pagerank.ts +++ b/packages/core/src/graph/pagerank.ts @@ -33,7 +33,6 @@ export class PageRankService { const neutralScoreWhenFlat = options.neutralScoreWhenFlat ?? 50; const allNodes = graph.getNodes(); - const allEdges = graph.getEdges(); // 1. Filter Eligible Nodes const eligibleNodes = allNodes.filter(node => { @@ -71,16 +70,16 @@ export class PageRankService { const outWeights = new Float64Array(nodeCount); const incoming: { sourceIndex: number, weight: number }[][] = new Array(nodeCount).fill(null).map(() => []); - for (const edge of allEdges) { - const sourceIndex = urlToIndex.get(edge.source); - const targetIndex = urlToIndex.get(edge.target); + graph.forEachEdge((source, target, weight) => { + const sourceIndex = urlToIndex.get(source); + const targetIndex = urlToIndex.get(target); if (sourceIndex !== undefined && targetIndex !== undefined) { - const weight = edge.weight || 1.0; - incoming[targetIndex].push({ sourceIndex, weight }); - outWeights[sourceIndex] += weight; + const edgeWeight = weight || 1.0; + incoming[targetIndex].push({ sourceIndex, weight: edgeWeight }); + outWeights[sourceIndex] += edgeWeight; } - } + }); // Identify sinks const sinks: number[] = [];