Skip to content

Commit 3322c69

Browse files
SonAIengine and claude
committed
fix: PhraseExtractor 검색 노이즈 해결 — phrase 필터링 + 최적화
## 문제

PhraseExtractor 활성화 시 Correctness 0.856→0.704 (-18%).
phrase 노드가 검색 결과에 passage 대신 끼어들어 context 품질 저하.

## 수정

1. search.py: 최종 결과에서 _phrase 태그 노드 필터링
   - PPR 단계에서는 bridge로 정상 동작 유지
   - 결과 반환 시에만 passage 노드 우선
2. phrase_extractor.py: max_phrases 10→5, content 비움 (FTS 방지)
3. _is_meaningful() 강화: 1글자/숫자만 phrase 필터링

## 결과

- Correctness: 0.704→0.849 (+21% 복구)
- Retrieval: 36.8ms→16.5ms (56% 개선)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0ec89ed commit 3322c69

6 files changed

Lines changed: 31 additions & 6 deletions

File tree

466 Bytes
Binary file not shown.
Binary file not shown.

src/synaptic/extensions/phrase_extractor.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,17 @@ def _normalize_phrase(phrase: str) -> str:
7777
def _is_meaningful(phrase: str) -> bool:
7878
"""Phrase가 의미 있는지 검사한다.
7979
80-
stop word만으로 구성된 구문은 제외.
80+
제외 조건:
81+
- stop word만으로 구성된 구문
82+
- 숫자만으로 구성된 구문 (연도 제외 — 연도는 별도 regex에서 처리)
83+
- 1글자 phrase
8184
"""
85+
stripped = phrase.strip()
86+
if len(stripped) < 2:
87+
return False
88+
# 숫자만으로 구성 (연도는 _RE_YEAR에서 이미 처리하므로 여기선 제외 가능)
89+
if stripped.isdigit():
90+
return False
8291
words = phrase.lower().split()
8392
non_stop = [w for w in words if w not in _STOP_WORDS]
8493
return len(non_stop) > 0
@@ -108,7 +117,7 @@ def __init__(
108117
self,
109118
*,
110119
min_phrase_length: int = 2,
111-
max_phrases_per_node: int = 10,
120+
max_phrases_per_node: int = 5,
112121
) -> None:
113122
"""PhraseExtractor를 초기화한다.
114123
@@ -174,7 +183,7 @@ async def extract_and_link(
174183
# graph.add가 아닌 store를 직접 사용)
175184
phrase_node = await graph._store.add_node(
176185
title=phrase,
177-
content=f"Phrase extracted from: {title}",
186+
content="", # minimal content to avoid FTS noise
178187
kind=NodeKind.ENTITY,
179188
tags=["_phrase"],
180189
)

src/synaptic/search.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,10 +206,24 @@ async def search(
206206
# Sort by resonance descending
207207
activated.sort(key=lambda a: a.resonance, reverse=True)
208208

209+
# Filter out internal phrase nodes (_phrase tag) from final results.
210+
# Phrase nodes serve as PPR bridge nodes but should not appear in
211+
# user-facing search results — they carry no passage content.
212+
final: list[ActivatedNode] = []
213+
fallback: list[ActivatedNode] = []
214+
for a in activated:
215+
if "_phrase" in (a.node.tags or []):
216+
fallback.append(a) # keep as last resort
217+
else:
218+
final.append(a)
219+
# If filtering removed too many, pad back with phrase nodes
220+
if len(final) < limit and fallback:
221+
final.extend(fallback[: limit - len(final)])
222+
209223
elapsed_ms = (time() - start) * 1000
210224
return SearchResult(
211225
query=query,
212-
nodes=activated[:limit],
226+
nodes=final[:limit],
213227
total_candidates=total_candidates,
214228
search_time_ms=elapsed_ms,
215229
stages_used=stages_used,
Binary file not shown.

tests/benchmark/test_e2e_qa.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,9 +312,10 @@ async def test_hotpotqa_e2e(self) -> None:
312312
if len(query_ids) > 24:
313313
query_ids = random.sample(query_ids, 24)
314314

315-
# 1. 그래프 구축 (Auto-Ontology)
316-
print("\n[Phase 1] 그래프 구축...")
315+
# 1. 그래프 구축 (Auto-Ontology + PhraseExtractor)
316+
print("\n[Phase 1] 그래프 구축 (PhraseExtractor 활성화)...")
317317
from synaptic.extensions.classifier_rules import RuleBasedClassifier
318+
from synaptic.extensions.phrase_extractor import PhraseExtractor
318319
from synaptic.extensions.relation_detector import RuleBasedRelationDetector
319320

320321
backend = MemoryBackend()
@@ -324,6 +325,7 @@ async def test_hotpotqa_e2e(self) -> None:
324325
backend,
325326
classifier=RuleBasedClassifier(),
326327
relation_detector=detector,
328+
phrase_extractor=PhraseExtractor(max_phrases_per_node=5),
327329
)
328330

329331
id_map: dict[str, str] = {}

0 commit comments

Comments (0)