Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions packages/core/src/analysis/analyze.ts
Original file line number Diff line number Diff line change
Expand Up @@ -288,8 +288,15 @@ export async function analyzeSite(url: string, options: AnalyzeOptions, context?
}

if (options.orphans) {
const edges = crawlData.graph.getEdges();
annotateOrphans(crawlData.graph.getNodes(), edges, {
const iterableEdges = {
*[Symbol.iterator]() {
for (const [edgeKey, weight] of crawlData.graph.edges.entries()) {
const splitIndex = edgeKey.indexOf('\x00');
yield { source: edgeKey.slice(0, splitIndex), target: edgeKey.slice(splitIndex + 1), weight };
}
}
};
annotateOrphans(crawlData.graph.getNodes(), iterableEdges, {
enabled: true,
severityEnabled: !!options.orphanSeverity,
includeSoftOrphans: !!options.includeSoftOrphans,
Expand Down
17 changes: 8 additions & 9 deletions packages/core/src/analysis/duplicate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -321,22 +321,21 @@ export class DuplicateService {
}

private collapseEdges(graph: Graph) {
const edges = graph.getEdges();
const updatedEdges = new Map<string, number>();

for (const edge of edges) {
const targetNode = graph.nodes.get(edge.target);
if (!targetNode) continue;
graph.forEachEdge((source, target, weight) => {
const targetNode = graph.nodes.get(target);
if (!targetNode) return;

const actualSource = edge.source;
const actualTarget = (targetNode as any).isCollapsed && (targetNode as any).collapseInto ? (targetNode as any).collapseInto : edge.target;
const actualSource = source;
const actualTarget = (targetNode as any).isCollapsed && (targetNode as any).collapseInto ? (targetNode as any).collapseInto : target;

if (actualSource === actualTarget) continue;
if (actualSource === actualTarget) return;

const edgeKey = Graph.getEdgeKey(actualSource, actualTarget);
const existingWeight = updatedEdges.get(edgeKey) || 0;
updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight));
}
updatedEdges.set(edgeKey, Math.max(existingWeight, weight));
});

graph.edges = updatedEdges;

Expand Down
19 changes: 15 additions & 4 deletions packages/core/src/analysis/orphan.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,25 @@ function consolidateInboundByCanonical(nodes: ExtendedGraphNode[]): Map<string,
return canonicalInbound;
}

export function annotateOrphans(nodes: ExtendedGraphNode[], edges: GraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
export function annotateOrphans(nodes: ExtendedGraphNode[], edges: Iterable<GraphEdge> | GraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
if (!options.enabled) {
return nodes.map((node) => ({ ...node, orphan: false } as AnnotatedNode));
}

const canonicalInbound = consolidateInboundByCanonical(nodes);
const nodeByUrl = new Map(nodes.map((node) => [node.url, node]));

// Pre-calculate inbound sources to avoid iterating over all edges per node
const inboundEdgesByTarget = new Map<string, string[]>();
for (const edge of edges) {
let sources = inboundEdgesByTarget.get(edge.target);
if (!sources) {
sources = [];
inboundEdgesByTarget.set(edge.target, sources);
}
sources.push(edge.source);
}

return nodes.map((node) => {
const isHomepage = node.isHomepage || (options.rootUrl ? node.url === options.rootUrl : node.depth === 0);
if (isHomepage || node.robotsExcluded) {
Expand All @@ -137,9 +148,9 @@ export function annotateOrphans(nodes: ExtendedGraphNode[], edges: GraphEdge[],
}

if (!orphanType && options.includeSoftOrphans && inbound > 0) {
const inboundSources = edges
.filter((edge) => edge.target === node.url)
.map((edge) => nodeByUrl.get(edge.source))
const sources = inboundEdgesByTarget.get(node.url) || [];
const inboundSources = sources
.map(sourceUrl => nodeByUrl.get(sourceUrl))
.filter((source): source is GraphNode => Boolean(source));

if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) {
Expand Down
10 changes: 9 additions & 1 deletion packages/core/src/crawler/metricsRunner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,15 @@ export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, option
includeSoftOrphans: options.includeSoftOrphans ?? true,
minInbound: options.minInbound ?? 2
};
annotatedNodes = annotateOrphans(graph.getNodes(), graph.getEdges(), orphanOptions) as any[];
const iterableEdges = {
*[Symbol.iterator]() {
for (const [edgeKey, weight] of graph.edges.entries()) {
const splitIndex = edgeKey.indexOf('\x00');
yield { source: edgeKey.slice(0, splitIndex), target: edgeKey.slice(splitIndex + 1), weight };
}
}
};
annotatedNodes = annotateOrphans(graph.getNodes(), iterableEdges, orphanOptions) as any[];
}

const soft404Service = new Soft404Service();
Expand Down
11 changes: 11 additions & 0 deletions packages/core/src/graph/graph.ts
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,17 @@ export class Graph {
return Array.from(this.nodes.values());
}

/**
 * Visits every entry of the `edges` map and hands it to `callback` as
 * separate `source`, `target` and `weight` arguments.
 *
 * Entries are read directly from the map, so no intermediate array of edge
 * objects is built. Intended for hot loops where that allocation matters.
 *
 * @param callback Invoked once per entry of `edges`, in the map's iteration order.
 */
forEachEdge(callback: (source: string, target: string, weight: number) => void): void {
  const separator = '\x00';
  for (const [key, edgeWeight] of this.edges) {
    // The key is split at the first separator: text before it is the source,
    // text after it is the target.
    // NOTE(review): assumes every key contains the separator — confirm against Graph.getEdgeKey.
    const cut = key.indexOf(separator);
    const from = key.slice(0, cut);
    const to = key.slice(cut + 1);
    callback(from, to, edgeWeight);
  }
}

getEdges(): GraphEdge[] {
return Array.from(this.edges.entries()).map(([edge, weight]) => {
const { source, target } = Graph.parseEdgeKey(edge);
Expand Down
17 changes: 8 additions & 9 deletions packages/core/src/graph/hits.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,18 @@ export class HITSService {
const incoming: { sourceIndex: number, weight: number }[][] = new Array(N).fill(null).map(() => []);
const outgoing: { targetIndex: number, weight: number }[][] = new Array(N).fill(null).map(() => []);

const allEdges = graph.getEdges();
for (const edge of allEdges) {
if (edge.source === edge.target) continue;
graph.forEachEdge((source, target, weight) => {
if (source === target) return;

const sourceIndex = urlToIndex.get(edge.source);
const targetIndex = urlToIndex.get(edge.target);
const sourceIndex = urlToIndex.get(source);
const targetIndex = urlToIndex.get(target);

if (sourceIndex !== undefined && targetIndex !== undefined) {
const weight = edge.weight || 1.0;
incoming[targetIndex].push({ sourceIndex, weight });
outgoing[sourceIndex].push({ targetIndex, weight });
const edgeWeight = weight || 1.0;
incoming[targetIndex].push({ sourceIndex, weight: edgeWeight });
outgoing[sourceIndex].push({ targetIndex, weight: edgeWeight });
}
}
});

// Initialize Scores
const authScores = new Float64Array(N).fill(1.0);
Expand Down
13 changes: 6 additions & 7 deletions packages/core/src/graph/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,23 @@ export interface Metrics {

export function calculateMetrics(graph: Graph, _maxDepth: number): Metrics {
const nodes = graph.getNodes();
const edges = graph.getEdges();

const totalPages = nodes.length;
const totalEdges = edges.length;
const totalEdges = graph.edges.size;

// Identify broken nodes
const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));

// Pre-compute outgoing edges per node for faster lookup
const outgoingEdges = new Map<string, string[]>();
for (const edge of edges) {
let targets = outgoingEdges.get(edge.source);
graph.forEachEdge((source, target) => {
let targets = outgoingEdges.get(source);
if (!targets) {
targets = [];
outgoingEdges.set(edge.source, targets);
outgoingEdges.set(source, targets);
}
targets.push(edge.target);
}
targets.push(target);
});

// Populate brokenLinks per node
for (const node of nodes) {
Expand Down
15 changes: 7 additions & 8 deletions packages/core/src/graph/pagerank.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ export class PageRankService {
const neutralScoreWhenFlat = options.neutralScoreWhenFlat ?? 50;

const allNodes = graph.getNodes();
const allEdges = graph.getEdges();

// 1. Filter Eligible Nodes
const eligibleNodes = allNodes.filter(node => {
Expand Down Expand Up @@ -71,16 +70,16 @@ export class PageRankService {
const outWeights = new Float64Array(nodeCount);
const incoming: { sourceIndex: number, weight: number }[][] = new Array(nodeCount).fill(null).map(() => []);

for (const edge of allEdges) {
const sourceIndex = urlToIndex.get(edge.source);
const targetIndex = urlToIndex.get(edge.target);
graph.forEachEdge((source, target, weight) => {
const sourceIndex = urlToIndex.get(source);
const targetIndex = urlToIndex.get(target);

if (sourceIndex !== undefined && targetIndex !== undefined) {
const weight = edge.weight || 1.0;
incoming[targetIndex].push({ sourceIndex, weight });
outWeights[sourceIndex] += weight;
const edgeWeight = weight || 1.0;
incoming[targetIndex].push({ sourceIndex, weight: edgeWeight });
outWeights[sourceIndex] += edgeWeight;
}
}
});

// Identify sinks
const sinks: number[] = [];
Expand Down