Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion casts/resume_ingestor/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
GenerateQuestionsNode,
ParseSectionsNode,
RateDifficultyNode,
ValidateQuestionsNode,
)
from casts.resume_ingestor.modules.state import InputState, OutputState, State

Expand Down Expand Up @@ -62,13 +63,15 @@ def build(self):
builder.add_node("extract_signals", ExtractSignalsNode())
builder.add_node("generate_questions", GenerateQuestionsNode())
builder.add_node("rate_difficulty", RateDifficultyNode())
builder.add_node("validate_questions", ValidateQuestionsNode())
builder.add_node("format_output", FormatOutputNode())
builder.add_edge(START, "extract_text")
builder.add_edge("extract_text", "parse_sections")
builder.add_edge("parse_sections", "extract_signals")
builder.add_edge("extract_signals", "generate_questions")
builder.add_edge("generate_questions", "rate_difficulty")
builder.add_edge("rate_difficulty", "format_output")
builder.add_edge("rate_difficulty", "validate_questions")
builder.add_edge("validate_questions", "format_output")
builder.add_edge("format_output", END)

graph = builder.compile()
Expand Down
247 changes: 247 additions & 0 deletions casts/resume_ingestor/modules/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,253 @@ def execute(self, state):
return {"questions": rated_questions}


class ValidateQuestionsNode(BaseNode):
    """Quality gate for generated questions before formatting output.

    Responsibilities:
    - Drop empty questions and case/whitespace-insensitive duplicates.
    - Rewrite questions too generic to be anchored in resume evidence
      (skills / projects / keywords extracted by upstream nodes).
    - Top up to exactly ``_TARGET_COUNT`` questions with evidence-based
      fallbacks.
    - Normalize ids, categories, difficulties, expected points, and
      follow-ups so downstream formatting can rely on the shape.
    """

    # Default difficulty per category; also used as the replacement value
    # when a question carries an out-of-range or non-integer difficulty.
    _CATEGORY_BASE: dict[str, int] = {
        "tech": 2,
        "project": 3,
        "system": 4,
        "deep-dive": 4,
    }

    # Number of questions this node always emits on the success path.
    _TARGET_COUNT: int = 15

    def execute(self, state):
        """Validate and normalize ``state['questions']``.

        Returns a state update with exactly ``_TARGET_COUNT`` normalized
        questions, or an ``errors`` entry when the payload is unusable.
        When upstream errors already exist, passes questions through
        untouched so this node does not mask the earlier failure.
        """
        existing_errors = list(state.get("errors", []))
        if existing_errors:
            # Upstream already failed; skip validation entirely.
            questions = state.get("questions")
            return {"questions": questions if isinstance(questions, list) else []}

        questions = state.get("questions")
        if not isinstance(questions, list):
            return {
                "questions": [],
                "errors": existing_errors
                + [
                    _error_item(
                        node="validate_questions",
                        code="INVALID_QUESTIONS",
                        message="Questions payload is not a list.",
                        retryable=False,
                    )
                ],
            }

        typed_questions = [item for item in questions if isinstance(item, dict)]
        if not typed_questions:
            return {
                "questions": [],
                "errors": existing_errors
                + [
                    _error_item(
                        node="validate_questions",
                        code="EMPTY_QUESTIONS",
                        message="No questions available for quality validation.",
                        retryable=False,
                    )
                ],
            }

        evidence_terms = self._extract_evidence_terms(state.get("signals"))

        # 1) Drop empty questions and duplicates (both counted together,
        # matching the "duplicate/empty" audit note below).
        deduped_questions: list[dict[str, object]] = []
        seen_questions: set[str] = set()
        duplicate_count = 0
        for question in typed_questions:
            normalized = self._normalize_text(str(question.get("question", "")))
            if not normalized or normalized in seen_questions:
                duplicate_count += 1
                continue
            seen_questions.add(normalized)
            deduped_questions.append(dict(question))

        # 2) Rewrite questions that don't reference any resume evidence.
        generic_count = 0
        for idx, question in enumerate(deduped_questions):
            if self._is_generic_question(
                str(question.get("question", "")), evidence_terms
            ):
                generic_count += 1
                # Cycle through evidence terms so rewrites vary by position.
                topic = (
                    evidence_terms[idx % len(evidence_terms)]
                    if evidence_terms
                    else "your recent resume experience"
                )
                category = str(question.get("category", "tech"))
                question["question"] = self._build_contextual_question(category, topic)

        # 3) Top up to the target count with evidence-based fallbacks.
        fill_count = max(0, self._TARGET_COUNT - len(deduped_questions))
        if fill_count:
            deduped_questions.extend(
                self._build_fallback_questions(fill_count, evidence_terms)
            )

        # 4) Trim to the target count and normalize ids/shape.
        validated_questions = [
            self._normalize_question(index + 1, question)
            for index, question in enumerate(deduped_questions[: self._TARGET_COUNT])
        ]

        # FIX: these notes were previously built and then silently discarded
        # (the return below never included them). Surface them via logging so
        # operators can audit what the quality gate changed.
        notes: list[str] = []
        if duplicate_count:
            notes.append(f"removed {duplicate_count} duplicate/empty questions")
        if generic_count:
            notes.append(f"rewrote {generic_count} generic questions")
        if fill_count:
            notes.append(f"added {fill_count} fallback questions")
        distribution_issue = self._distribution_issue(validated_questions)
        if distribution_issue:
            notes.append(distribution_issue)
        if notes:
            import logging  # local: module import block is outside this view

            logging.getLogger(__name__).info(
                "validate_questions: %s", "; ".join(notes)
            )

        return {"questions": validated_questions}

    def _normalize_text(self, text: str) -> str:
        """Build the dedup key: lowercase, collapse whitespace, keep [a-z0-9 ]."""
        normalized = re.sub(r"\s+", " ", text.strip().lower())
        return re.sub(r"[^a-z0-9 ]+", "", normalized)

    def _extract_evidence_terms(self, signals: object) -> list[str]:
        """Collect distinct non-empty strings from signals' skills/projects/keywords.

        Order is preserved; case-insensitive first occurrence wins. Returns
        an empty list when ``signals`` is not a dict.
        """
        if not isinstance(signals, dict):
            return []

        merged: list[str] = []
        for key in ("skills", "projects", "keywords"):
            value = signals.get(key)
            if isinstance(value, list):
                merged.extend(
                    item.strip()
                    for item in value
                    if isinstance(item, str) and item.strip()
                )

        seen: set[str] = set()
        evidence: list[str] = []
        for item in merged:
            lowered = item.lower()
            if lowered in seen:
                continue
            seen.add(lowered)
            evidence.append(item)
        return evidence

    def _is_generic_question(self, question: str, evidence_terms: list[str]) -> bool:
        """Return True when a question is too short or mentions no evidence term.

        With no evidence available, only the length check applies.
        """
        normalized_question = question.lower()
        if len(normalized_question.strip()) < 25:
            return True
        if not evidence_terms:
            return False

        return not any(term.lower() in normalized_question for term in evidence_terms)

    def _build_contextual_question(self, category: str, topic: str) -> str:
        """Render a category-appropriate question anchored on ``topic``."""
        prompts = {
            "tech": f"When applying {topic}, what production constraints shaped your implementation choices?",
            "project": f"In your project work on {topic}, what outcome did you own directly and how did you deliver it?",
            "system": f"If scaling a system around {topic}, what architecture trade-offs would you prioritize and why?",
            "deep-dive": f"Describe the hardest technical trade-off you made involving {topic}, including validation steps and impact.",
        }
        return prompts.get(
            category,
            f"Describe a concrete engineering decision you made involving {topic} and the resulting impact.",
        )

    def _build_fallback_questions(
        self, count: int, evidence_terms: list[str]
    ) -> list[dict[str, object]]:
        """Create ``count`` filler questions, cycling categories and evidence terms.

        Ids are left blank; ``_normalize_question`` assigns them later.
        """
        categories = ("tech", "project", "system", "deep-dive")
        fallback: list[dict[str, object]] = []

        for idx in range(count):
            category = categories[idx % len(categories)]
            topic = (
                evidence_terms[idx % len(evidence_terms)]
                if evidence_terms
                else "your recent engineering work"
            )
            fallback.append(
                {
                    "id": "",
                    "category": category,
                    "difficulty": self._CATEGORY_BASE[category],
                    "question": self._build_contextual_question(category, topic),
                    "expected_points": [
                        "Specific context from the resume",
                        "Decision rationale and trade-offs",
                        "Measured outcomes and lessons learned",
                    ],
                    "followups": [
                        "What risks did you consider?",
                        "How would you improve this approach now?",
                    ],
                }
            )

        return fallback

    def _normalize_question(
        self, index: int, question: dict[str, object]
    ) -> dict[str, object]:
        """Return a copy of ``question`` with id, category, difficulty and
        list fields coerced into the shape the formatter expects.

        ``index`` is 1-based and becomes the zero-padded id (``q01`` ...).
        """
        normalized = dict(question)
        category = str(normalized.get("category", "tech"))
        if category not in self._CATEGORY_BASE:
            category = "tech"
        normalized["category"] = category

        difficulty = normalized.get("difficulty", self._CATEGORY_BASE[category])
        # FIX: exclude bool explicitly — isinstance(True, int) is True, so a
        # boolean difficulty previously survived as True/False in the output.
        if (
            isinstance(difficulty, bool)
            or not isinstance(difficulty, int)
            or not 1 <= difficulty <= 5
        ):
            difficulty = self._CATEGORY_BASE[category]
        normalized["difficulty"] = difficulty

        normalized["id"] = f"q{index:02d}"

        expected_points = normalized.get("expected_points")
        if not isinstance(expected_points, list) or not expected_points:
            normalized["expected_points"] = [
                "Specific context from the resume",
                "Technical rationale and trade-offs",
                "Outcome and retrospective insight",
            ]

        followups = normalized.get("followups")
        if not isinstance(followups, list) or not followups:
            normalized["followups"] = [
                "What constraints most influenced your decision?",
                "What would you change if rebuilding this today?",
            ]

        if (
            not isinstance(normalized.get("question"), str)
            or not str(normalized.get("question", "")).strip()
        ):
            normalized["question"] = self._build_contextual_question(
                category, "your recent engineering work"
            )

        return normalized

    def _distribution_issue(self, questions: list[dict[str, object]]) -> str | None:
        """Describe a category/difficulty spread problem, or None if healthy.

        Flags fewer than 3 distinct categories, or a difficulty range
        (max - min) below 2. Callers guarantee ``questions`` is non-empty.
        """
        categories = [str(item.get("category", "tech")) for item in questions]
        unique_categories = set(categories)
        if len(unique_categories) < 3:
            return "category distribution is narrow"

        difficult_values = [
            self._safe_difficulty_value(item.get("difficulty")) for item in questions
        ]
        if max(difficult_values) - min(difficult_values) < 2:
            return "difficulty spread is too small"

        return None

    def _safe_difficulty_value(self, value: object) -> int:
        """Clamp an int or digit-string difficulty to [1, 5]; default to 1."""
        if isinstance(value, int):
            return max(1, min(5, value))
        if isinstance(value, str) and value.isdigit():
            return max(1, min(5, int(value)))
        return 1


class FormatOutputNode(BaseNode):
"""Final node that renders markdown from structured questions."""

Expand Down
57 changes: 57 additions & 0 deletions tests/node_tests/test_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
GenerateQuestionsNode,
ParseSectionsNode,
RateDifficultyNode,
ValidateQuestionsNode,
)


Expand Down Expand Up @@ -151,3 +152,59 @@ def test_format_output_node_renders_markdown() -> None:
assert "# Interview Questions" in result["markdown"]
assert "Difficulty: 3" in result["markdown"]
assert "Expected points:" in result["markdown"]


def test_validate_questions_node_removes_duplicates_and_normalizes_ids() -> None:
    """Duplicate questions collapse to one entry and ids are reassigned in order."""
    node = ValidateQuestionsNode()
    repeated = {
        "id": "qx",
        "category": "tech",
        "difficulty": 3,
        "question": "How did you use FastAPI in production?",
        "expected_points": ["Context"],
        "followups": ["What was hard?"],
    }
    state = {
        "signals": {
            "skills": ["FastAPI", "Python"],
            "projects": ["Built hiring pipeline service"],
            "keywords": ["backend"],
        },
        "questions": [repeated, repeated],
        "errors": [],
    }

    result = node(state)

    questions = result["questions"]
    assert len(questions) == 15
    assert [item["id"] for item in questions[:2]] == ["q01", "q02"]


def test_validate_questions_node_rewrites_generic_question_using_evidence() -> None:
    """A too-generic question is rewritten to mention resume evidence."""
    node = ValidateQuestionsNode()
    generic_question = {
        "id": "q01",
        "category": "system",
        "difficulty": 4,
        "question": "Tell me about your experience.",
        "expected_points": ["Something"],
        "followups": ["More details?"],
    }
    state = {
        "signals": {
            "skills": ["Kubernetes"],
            "projects": ["Realtime fraud detection"],
            "keywords": ["scalability"],
        },
        "questions": [generic_question],
        "errors": [],
    }

    result = node(state)

    assert len(result["questions"]) == 15
    rewritten = result["questions"][0]["question"].lower()
    assert any(
        term in rewritten for term in ("kubernetes", "realtime fraud detection")
    )
Loading