From 9337faa0e13eab4cd11139727fe9d1ee5f3b7d46 Mon Sep 17 00:00:00 2001
From: Woojin Son <swj1265@naver.com>
Date: Sat, 28 Feb 2026 19:56:24 +0900
Subject: [PATCH 1/2] =?UTF-8?q?=E2=9A=A1=EF=B8=8Ffeat(resume=5Fingestor):?=
 =?UTF-8?q?=20add=20validation=20of=20semantic=20stopwords?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 casts/resume_ingestor/modules/nodes.py | 64 +++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/casts/resume_ingestor/modules/nodes.py b/casts/resume_ingestor/modules/nodes.py
index 9a2ce17..84c0867 100644
--- a/casts/resume_ingestor/modules/nodes.py
+++ b/casts/resume_ingestor/modules/nodes.py
@@ -685,6 +685,29 @@ class ValidateQuestionsNode(BaseNode):
         "system": 4,
         "deep-dive": 4,
     }
+    _SEMANTIC_STOPWORDS: set[str] = {  # function words skipped by _semantic_tokens
+        "a",
+        "an",
+        "and",
+        "are",
+        "at",
+        "did",
+        "do",
+        "for",
+        "how",
+        "in",
+        "of",
+        "on",
+        "or",
+        "that",
+        "the",
+        "to",
+        "what",
+        "when",
+        "with",
+        "you",
+        "your",
+    }
 
     def execute(self, state):
         existing_errors = list(state.get("errors", []))
@@ -726,16 +749,25 @@ def execute(self, state):
 
         deduped_questions: list[dict[str, object]] = []
         seen_questions: set[str] = set()
+        seen_semantic_tokens: list[set[str]] = []
         duplicate_count = 0
+        near_duplicate_count = 0
         for question in typed_questions:
-            normalized = self._normalize_text(str(question.get("question", "")))
+            raw_question = str(question.get("question", ""))
+            normalized = self._normalize_text(raw_question)
             if not normalized:
                 duplicate_count += 1
                 continue
+            semantic_tokens = self._semantic_tokens(raw_question)
             if normalized in seen_questions:
                 duplicate_count += 1
                 continue
+            if self._is_semantic_duplicate(semantic_tokens, seen_semantic_tokens):
+                near_duplicate_count += 1
+                continue
+
             seen_questions.add(normalized)
+            seen_semantic_tokens.append(semantic_tokens)
             deduped_questions.append(dict(question))
 
         generic_count = 0
@@ -769,6 +801,8 @@ def execute(self, state):
         notes: list[str] = []
         if duplicate_count:
             notes.append(f"removed {duplicate_count} duplicate/empty questions")
+        if near_duplicate_count:
+            notes.append(f"removed {near_duplicate_count} semantic near-duplicates")
         if generic_count:
             notes.append(f"rewrote {generic_count} generic questions")
         if fill_count:
@@ -806,6 +840,34 @@ def _extract_evidence_terms(self, signals: object) -> list[str]:
             evidence.append(item)
         return evidence
 
+    def _semantic_tokens(self, question_text: str) -> set[str]:
+        """Return the content words used to compare questions.
+
+        Keeps lowercase ``[a-z0-9]`` runs of 4+ characters minus stopwords.
+        """
+        words = re.findall(r"[a-z0-9]+", question_text.lower())
+        return {w for w in words if not (len(w) < 4 or w in self._SEMANTIC_STOPWORDS)}
+
+    def _is_semantic_duplicate(
+        self, candidate: set[str], seen_candidates: list[set[str]]
+    ) -> bool:
+        """Report whether ``candidate`` nearly repeats an earlier token set.
+
+        A non-empty pair matches when its Jaccard index is at least 0.68 or
+        the smaller set shares at least 80% of its tokens with the other.
+        Empty token sets are never treated as duplicates.
+        """
+        if not candidate:
+            return False
+        for prior in filter(None, seen_candidates):
+            shared = len(candidate & prior)
+            if shared / len(candidate | prior) >= 0.68:
+                return True
+            # Containment catches a short question embedded in a longer one.
+            if shared / min(len(candidate), len(prior)) >= 0.8:
+                return True
+        return False
+
     def _is_generic_question(self, question: str, evidence_terms: list[str]) -> bool:
         normalized_question = question.lower()
         if len(normalized_question.strip()) < 25:

From 30fafc1f80fb5cc366020ce7de963b77e50af32f Mon Sep 17 00:00:00 2001
From: Woojin Son <swj1265@naver.com>
Date: Sat, 28 Feb 2026 19:56:52 +0900
Subject: [PATCH 2/2] =?UTF-8?q?=E2=9C=85test:=20test=20validation=20questi?=
 =?UTF-8?q?ons=20node=20removes=20semantic=20near=20duplicates?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/node_tests/test_node.py | 41 +++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/tests/node_tests/test_node.py b/tests/node_tests/test_node.py
index 3cb39d4..9f13c45 100644
--- a/tests/node_tests/test_node.py
+++ b/tests/node_tests/test_node.py
@@ -265,3 +265,44 @@ def test_validate_questions_node_rewrites_generic_question_using_evidence() -> N
     assert len(result["questions"]) == 15
     rewritten = result["questions"][0]["question"].lower()
     assert "kubernetes" in rewritten or "realtime fraud detection" in rewritten
+
+
+def test_validate_questions_node_removes_semantic_near_duplicates() -> None:
+    """Two paraphrased rate-limiting questions should collapse to one entry."""
+    signals = {
+        "skills": ["API Gateway", "Rate Limiting"],
+        "projects": ["Traffic shaping platform"],
+        "keywords": ["burst", "throughput"],
+    }
+    original_question = {
+        "id": "q01",
+        "category": "system",
+        "difficulty": 4,
+        "question": "How did you design API gateway rate limiting for burst traffic?",
+        "expected_points": ["Context"],
+        "followups": ["Why this design?"],
+    }
+    # Same topic as q01, phrased differently.
+    reworded_question = {
+        "id": "q02",
+        "category": "system",
+        "difficulty": 4,
+        "question": "What approach did you take to implement burst-traffic rate limiting at the API gateway?",
+        "expected_points": ["Context"],
+        "followups": ["What trade-offs?"],
+    }
+    state = {
+        "signals": signals,
+        "questions": [original_question, reworded_question],
+        "errors": [],
+    }
+
+    result = ValidateQuestionsNode()(state)
+
+    assert len(result["questions"]) == 15
+    # Only one of the two rate-limiting questions may remain in the output.
+    texts = [entry["question"].lower() for entry in result["questions"]]
+    on_topic = [
+        text for text in texts if "api gateway" in text and "rate limiting" in text
+    ]
+    assert len(on_topic) == 1