From 9337faa0e13eab4cd11139727fe9d1ee5f3b7d46 Mon Sep 17 00:00:00 2001 From: Woojin Son Date: Sat, 28 Feb 2026 19:56:24 +0900 Subject: [PATCH 1/2] =?UTF-8?q?=E2=9A=A1=EF=B8=8Ffeat(resume=5Fingestor):?= =?UTF-8?q?=20add=20validation=20of=20semantic=20stopwords?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- casts/resume_ingestor/modules/nodes.py | 64 +++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/casts/resume_ingestor/modules/nodes.py b/casts/resume_ingestor/modules/nodes.py index 9a2ce17..84c0867 100644 --- a/casts/resume_ingestor/modules/nodes.py +++ b/casts/resume_ingestor/modules/nodes.py @@ -685,6 +685,29 @@ class ValidateQuestionsNode(BaseNode): "system": 4, "deep-dive": 4, } + _SEMANTIC_STOPWORDS: set[str] = { + "a", + "an", + "and", + "are", + "at", + "did", + "do", + "for", + "how", + "in", + "of", + "on", + "or", + "that", + "the", + "to", + "what", + "when", + "with", + "you", + "your", + } def execute(self, state): existing_errors = list(state.get("errors", [])) @@ -726,16 +749,25 @@ def execute(self, state): deduped_questions: list[dict[str, object]] = [] seen_questions: set[str] = set() + seen_semantic_tokens: list[set[str]] = [] duplicate_count = 0 + near_duplicate_count = 0 for question in typed_questions: - normalized = self._normalize_text(str(question.get("question", ""))) + raw_question = str(question.get("question", "")) + normalized = self._normalize_text(raw_question) if not normalized: duplicate_count += 1 continue + semantic_tokens = self._semantic_tokens(raw_question) if normalized in seen_questions: duplicate_count += 1 continue + if self._is_semantic_duplicate(semantic_tokens, seen_semantic_tokens): + near_duplicate_count += 1 + continue + seen_questions.add(normalized) + seen_semantic_tokens.append(semantic_tokens) deduped_questions.append(dict(question)) generic_count = 0 @@ -769,6 +801,8 @@ def execute(self, state): notes: list[str] = [] if 
duplicate_count:
             notes.append(f"removed {duplicate_count} duplicate/empty questions")
+        if near_duplicate_count:
+            notes.append(f"removed {near_duplicate_count} semantic near-duplicates")
         if generic_count:
             notes.append(f"rewrote {generic_count} generic questions")
         if fill_count:
@@ -806,6 +840,53 @@ def _extract_evidence_terms(self, signals: object) -> list[str]:
                     evidence.append(item)
         return evidence
 
+    def _semantic_tokens(self, question_text: str) -> set[str]:
+        """Return the distinct content words of ``question_text``.
+
+        The text is lower-cased and split into runs of ASCII letters and
+        digits. Tokens shorter than four characters, or listed in
+        ``_SEMANTIC_STOPWORDS``, are discarded.
+        """
+        # NOTE(review): the pattern is ASCII-only, so text with no ASCII
+        # letters or digits yields an empty set and is never a near-duplicate.
+        words = re.findall(r"[a-z0-9]+", question_text.lower())
+        return {
+            word
+            for word in words
+            if len(word) >= 4 and word not in self._SEMANTIC_STOPWORDS
+        }
+
+    def _is_semantic_duplicate(
+        self, candidate: set[str], seen_candidates: list[set[str]]
+    ) -> bool:
+        """Return whether ``candidate`` nearly repeats an earlier token set.
+
+        Args:
+            candidate: Token set of the question being checked.
+            seen_candidates: Token sets to compare against.
+
+        Returns:
+            True if, for any non-empty earlier set, the Jaccard similarity
+            is at least 0.68 or the smaller set shares at least 80% of its
+            tokens with the other. An empty ``candidate`` is never a
+            duplicate.
+        """
+        if not candidate:
+            return False
+
+        for existing in seen_candidates:
+            if not existing:
+                continue
+
+            # Both sets are non-empty here, so neither denominator is zero.
+            overlap = len(candidate & existing)
+            jaccard = overlap / len(candidate | existing)
+            containment = overlap / min(len(candidate), len(existing))
+            if jaccard >= 0.68 or containment >= 0.8:
+                return True
+
+        return False
+
     def _is_generic_question(self, question: str, evidence_terms: list[str]) -> bool:
         normalized_question = question.lower()
         if len(normalized_question.strip()) < 25:
From 30fafc1f80fb5cc366020ce7de963b77e50af32f Mon Sep 17 00:00:00 2001
From: Woojin Son
Date: Sat, 28 Feb 2026 19:56:52 +0900
Subject: [PATCH 2/2] =?UTF-8?q?=E2=9C=85test:=20test=20validation=20questi?=
 =?UTF-8?q?ons=20node=20removes=20semantic=20near=20duplicates?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/node_tests/test_node.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/tests/node_tests/test_node.py b/tests/node_tests/test_node.py
index 3cb39d4..9f13c45 100644
--- a/tests/node_tests/test_node.py
+++ b/tests/node_tests/test_node.py
@@ -265,3 +265,44 @@ def
test_validate_questions_node_rewrites_generic_question_using_evidence() -> N
     assert len(result["questions"]) == 15
     rewritten = result["questions"][0]["question"].lower()
     assert "kubernetes" in rewritten or "realtime fraud detection" in rewritten
+
+
+def test_validate_questions_node_removes_semantic_near_duplicates() -> None:
+    node = ValidateQuestionsNode()
+    result = node(  # q01 and q02 ask the same thing in different words
+        {
+            "signals": {
+                "skills": ["API Gateway", "Rate Limiting"],
+                "projects": ["Traffic shaping platform"],
+                "keywords": ["burst", "throughput"],
+            },
+            "questions": [
+                {
+                    "id": "q01",
+                    "category": "system",
+                    "difficulty": 4,
+                    "question": "How did you design API gateway rate limiting for burst traffic?",
+                    "expected_points": ["Context"],
+                    "followups": ["Why this design?"],
+                },
+                {
+                    "id": "q02",
+                    "category": "system",
+                    "difficulty": 4,
+                    "question": "What approach did you take to implement burst-traffic rate limiting at the API gateway?",
+                    "expected_points": ["Context"],
+                    "followups": ["What trade-offs?"],
+                },
+            ],
+            "errors": [],
+        }
+    )
+    # Only one returned question may mention both phrases the inputs share.
+    assert len(result["questions"]) == 15
+    similar_count = sum(
+        1
+        for question in result["questions"]
+        if "api gateway" in question["question"].lower()
+        and "rate limiting" in question["question"].lower()
+    )
+    assert similar_count == 1