"""Personalized PageRank (PPR) engine — zero-dependency, dict-based sparse implementation.

Power iteration:
    r(t+1) = (1 - damping) * personalization + damping * A_norm * r(t)

where A_norm is the column-normalized adjacency matrix built from the graph edges.
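
Intuitively, the converged r is the stationary distribution of a random walk
that follows a weighted edge with probability damping and teleports back to a
seed node (in proportion to the personalization weights) with probability
1 - damping; this holds exactly when every node has at least one outgoing edge.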
| 7 | +""" |
| 8 | + |
| 9 | +from __future__ import annotations |
| 10 | + |
| 11 | +from typing import TYPE_CHECKING |
| 12 | + |
| 13 | +if TYPE_CHECKING: |
| 14 | + from synaptic.protocols import StorageBackend |
| 15 | + |
| 16 | + |
async def personalized_pagerank(
    backend: StorageBackend,
    seed_scores: dict[str, float],
    *,
    damping: float = 0.85,
    max_iter: int = 50,
    tol: float = 1e-6,
    top_k: int = 20,
) -> list[tuple[str, float]]:
    """Perform PPR and return top-k (node_id, score) pairs.

    The graph is discovered incrementally via BFS from the seed nodes, so
    only the reachable subgraph is materialized — no need to enumerate all
    nodes/edges in the backend.

    Args:
        backend: Storage backend implementing the StorageBackend protocol.
        seed_scores: {node_id: weight} — search-result scores used as the
            personalization vector.
        damping: Probability of following an edge (vs. teleporting back to
            the seeds).
        max_iter: Maximum number of power-iteration steps.
        tol: Convergence threshold (L1 norm of the rank change).
        top_k: Number of top-ranked nodes to return.

    Returns:
        List of (node_id, ppr_score) pairs sorted descending by score.
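
    Notes:
        Each power-iteration step touches every discovered edge once, so the
        total cost is O(max_iter * (V + E)) on the BFS-discovered subgraph.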
    """
    if not seed_scores:
        return []

    # --- 1. BFS to discover the reachable subgraph (depth 2 from seeds) ---
    # adjacency: source -> [(target, weight), ...]
    adj: dict[str, list[tuple[str, float]]] = {}
    visited: set[str] = set()
    frontier = set(seed_scores.keys())
    bfs_depth = 2

    for _ in range(bfs_depth):
        if not frontier:
            break
        next_frontier: set[str] = set()
        for nid in frontier:
            if nid in visited:
                continue
            visited.add(nid)
            if nid not in adj:
                adj[nid] = []
            edges = await backend.get_edges(nid, direction="both")
            for edge in edges:
                # Determine the neighbor on the other end of the edge.
                if edge.source_id == nid:
                    neighbor_id = edge.target_id
                else:
                    neighbor_id = edge.source_id

                # If the neighbor was already expanded, this edge was recorded
                # (in both directions) during that expansion; skip it here so
                # its weight is not double-counted.
                if neighbor_id in visited:
                    continue

                # Add edge in both directions (undirected for PPR spreading)
                adj[nid].append((neighbor_id, edge.weight))
                if neighbor_id not in adj:
                    adj[neighbor_id] = []
                adj[neighbor_id].append((nid, edge.weight))

                next_frontier.add(neighbor_id)
        frontier = next_frontier

    # Mark remaining frontier nodes as visited (leaf nodes that were never expanded)
    visited.update(frontier)
    for nid in frontier:
        if nid not in adj:
            adj[nid] = []

    all_nodes = set(adj.keys()) | set(seed_scores.keys())

    # No edges at all — return seed scores directly (sorted)
    if not any(adj.values()):
        sorted_seeds = sorted(seed_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_seeds[:top_k]

    # --- 2. Build column-normalized adjacency (as sparse dicts) ---
    # out_weight[node] = sum of weights of outgoing edges
    out_weight: dict[str, float] = {}
    for src, neighbors in adj.items():
        total = sum(w for _, w in neighbors)
        out_weight[src] = total if total > 0 else 1.0
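    # Note: a dangling node (empty adjacency list) keeps the 1.0 fallback but
    # has nothing to distribute, so its rank mass "leaks" each iteration
    # rather than being redistributed; absolute scores therefore need not
    # sum to 1, though only relative ordering matters for top-k retrieval.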

    # --- 3. Normalize personalization vector ---
    total_seed = sum(seed_scores.values())
    if total_seed == 0:
        return []
    personalization: dict[str, float] = {
        nid: s / total_seed for nid, s in seed_scores.items()
    }

    # --- 4. Power iteration ---
    # Initialize the rank vector to the personalization distribution.
    rank: dict[str, float] = {nid: personalization.get(nid, 0.0) for nid in all_nodes}

    teleport_coeff = 1.0 - damping

    for _ in range(max_iter):
        # Start every node at its teleport (personalization) share.
        new_rank: dict[str, float] = {
            nid: teleport_coeff * personalization.get(nid, 0.0) for nid in all_nodes
        }

        # Distribute rank along edges
        for src, neighbors in adj.items():
            if not neighbors:
                continue
            src_rank = rank[src]
            src_out = out_weight[src]
            for tgt, w in neighbors:
                # Column-normalized: edge_weight / total_out_weight * src_rank
                # (every tgt is already a key of new_rank, so += is safe)
                new_rank[tgt] += damping * src_rank * w / src_out

        # Check convergence (L1 norm of the rank change)
        diff = sum(abs(new_rank[nid] - rank[nid]) for nid in all_nodes)
        rank = new_rank
        if diff < tol:
            break

    # --- 5. Return top-k ---
    sorted_results = sorted(rank.items(), key=lambda x: x[1], reverse=True)
    return sorted_results[:top_k]
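

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). `Edge`, `MemoryBackend`, and the sample
# graph below are hypothetical stand-ins for a real StorageBackend; the only
# things assumed from the protocol are the async `get_edges(node_id,
# direction=...)` call and the `source_id` / `target_id` / `weight` attributes.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio
    from dataclasses import dataclass

    @dataclass
    class Edge:
        source_id: str
        target_id: str
        weight: float

    class MemoryBackend:
        def __init__(self, edges: list[Edge]) -> None:
            self._edges = edges

        async def get_edges(self, node_id: str, direction: str = "both") -> list[Edge]:
            # The engine only asks for direction="both": all incident edges.
            return [
                e
                for e in self._edges
                if e.source_id == node_id or e.target_id == node_id
            ]

    backend = MemoryBackend(
        [
            Edge("a", "b", 1.0),
            Edge("b", "c", 2.0),
            Edge("c", "d", 1.0),
        ]
    )
    # Seed the walk at "a"; nodes closer to the seed should rank higher.
    results = asyncio.run(personalized_pagerank(backend, {"a": 1.0}, top_k=5))
    for node_id, score in results:
        print(f"{node_id}: {score:.4f}")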