Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion casts/resume_ingestor/modules/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,29 @@ class ValidateQuestionsNode(BaseNode):
"system": 4,
"deep-dive": 4,
}
# Low-signal interview-phrasing words that are ignored when token sets are
# built for near-duplicate comparison (filtered out in _semantic_tokens,
# whose output feeds _is_semantic_duplicate). Only words of 4+ characters
# matter here in practice, since shorter tokens are dropped by the length
# check as well.
_SEMANTIC_STOPWORDS: set[str] = {
    "a",
    "an",
    "and",
    "are",
    "at",
    "did",
    "do",
    "for",
    "how",
    "in",
    "of",
    "on",
    "or",
    "that",
    "the",
    "to",
    "what",
    "when",
    "with",
    "you",
    "your",
}

def execute(self, state):
existing_errors = list(state.get("errors", []))
Expand Down Expand Up @@ -726,16 +749,25 @@ def execute(self, state):

deduped_questions: list[dict[str, object]] = []
seen_questions: set[str] = set()
seen_semantic_tokens: list[set[str]] = []
duplicate_count = 0
near_duplicate_count = 0
for question in typed_questions:
normalized = self._normalize_text(str(question.get("question", "")))
raw_question = str(question.get("question", ""))
normalized = self._normalize_text(raw_question)
if not normalized:
duplicate_count += 1
continue
semantic_tokens = self._semantic_tokens(raw_question)
if normalized in seen_questions:
duplicate_count += 1
continue
if self._is_semantic_duplicate(semantic_tokens, seen_semantic_tokens):
near_duplicate_count += 1
continue

seen_questions.add(normalized)
seen_semantic_tokens.append(semantic_tokens)
deduped_questions.append(dict(question))

generic_count = 0
Expand Down Expand Up @@ -769,6 +801,8 @@ def execute(self, state):
notes: list[str] = []
if duplicate_count:
notes.append(f"removed {duplicate_count} duplicate/empty questions")
if near_duplicate_count:
notes.append(f"removed {near_duplicate_count} semantic near-duplicates")
if generic_count:
notes.append(f"rewrote {generic_count} generic questions")
if fill_count:
Expand Down Expand Up @@ -806,6 +840,34 @@ def _extract_evidence_terms(self, signals: object) -> list[str]:
evidence.append(item)
return evidence

def _semantic_tokens(self, question_text: str) -> set[str]:
lowered = question_text.lower()
return {
token
for token in re.findall(r"[a-z0-9]+", lowered)
if len(token) >= 4 and token not in self._SEMANTIC_STOPWORDS
}

def _is_semantic_duplicate(
self, candidate: set[str], seen_candidates: list[set[str]]
) -> bool:
if not candidate:
return False

for existing in seen_candidates:
if not existing:
continue

overlap = len(candidate & existing)
union = len(candidate | existing)
jaccard = overlap / union if union else 0.0
containment = overlap / min(len(candidate), len(existing))

if jaccard >= 0.68 or containment >= 0.8:
return True

return False

def _is_generic_question(self, question: str, evidence_terms: list[str]) -> bool:
normalized_question = question.lower()
if len(normalized_question.strip()) < 25:
Expand Down
41 changes: 41 additions & 0 deletions tests/node_tests/test_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,44 @@ def test_validate_questions_node_rewrites_generic_question_using_evidence() -> N
assert len(result["questions"]) == 15
rewritten = result["questions"][0]["question"].lower()
assert "kubernetes" in rewritten or "realtime fraud detection" in rewritten


def test_validate_questions_node_removes_semantic_near_duplicates() -> None:
    """Two paraphrases of the same rate-limiting question collapse to one."""
    node = ValidateQuestionsNode()
    state = {
        "signals": {
            "skills": ["API Gateway", "Rate Limiting"],
            "projects": ["Traffic shaping platform"],
            "keywords": ["burst", "throughput"],
        },
        "questions": [
            {
                "id": "q01",
                "category": "system",
                "difficulty": 4,
                "question": "How did you design API gateway rate limiting for burst traffic?",
                "expected_points": ["Context"],
                "followups": ["Why this design?"],
            },
            {
                "id": "q02",
                "category": "system",
                "difficulty": 4,
                "question": "What approach did you take to implement burst-traffic rate limiting at the API gateway?",
                "expected_points": ["Context"],
                "followups": ["What trade-offs?"],
            },
        ],
        "errors": [],
    }

    result = node(state)

    assert len(result["questions"]) == 15
    # Only one of the two paraphrased gateway/rate-limiting questions survives.
    matches = [
        question
        for question in result["questions"]
        if "api gateway" in question["question"].lower()
        and "rate limiting" in question["question"].lower()
    ]
    assert len(matches) == 1
Loading