From 492309c7fe7b28acf60777c38402abde57b84922 Mon Sep 17 00:00:00 2001 From: mac Date: Sat, 16 May 2026 00:46:31 +0700 Subject: [PATCH] test: expand transcript smoke coverage --- hooks/verification_stop_gate.py | 3 + scripts/smoke_test.py | 161 +++++++++++++++++++++++++++++--- 2 files changed, 152 insertions(+), 12 deletions(-) diff --git a/hooks/verification_stop_gate.py b/hooks/verification_stop_gate.py index 6ed28f4..9b2e824 100644 --- a/hooks/verification_stop_gate.py +++ b/hooks/verification_stop_gate.py @@ -82,6 +82,9 @@ def extract_text(value) -> str: parts = [extract_text(item) for item in value] return "\n".join(part for part in parts if part).strip() if isinstance(value, dict): + block_type = str(value.get("type") or "").lower() + if block_type in {"thinking", "redacted_thinking"}: + return "" for key in ("text", "message", "content", "value", "output", "result"): if key in value: text = extract_text(value.get(key)) diff --git a/scripts/smoke_test.py b/scripts/smoke_test.py index 364a418..b7e2806 100644 --- a/scripts/smoke_test.py +++ b/scripts/smoke_test.py @@ -8,6 +8,7 @@ import os import shutil import subprocess +import sys import tempfile from pathlib import Path @@ -17,11 +18,20 @@ TEMP = Path(tempfile.gettempdir()) SESSION = "standalone-fact-hook-smoke" STATE = TEMP / f"fact-verification-{SESSION}" +PYTHON = sys.executable + +sys.path.insert(0, str(HOOKS)) + +from verification_stop_gate import ( # noqa: E402 + get_last_assistant_message, + response_has_verification_caveat, + response_is_non_assertive, +) def run(script_name: str, payload: dict) -> dict: proc = subprocess.run( - ["python", str(HOOKS / script_name)], + [PYTHON, str(HOOKS / script_name)], input=json.dumps(payload), text=True, capture_output=True, @@ -39,11 +49,41 @@ def expect(name: str, result: dict, predicate, failures: list[str]) -> None: failures.append(f"{name} failed: {json.dumps(result, ensure_ascii=True)}") -def main() -> None: +def reset_state() -> None: if STATE.exists(): shutil.rmtree(STATE) STATE.mkdir(parents=True, exist_ok=True) + +def write_transcript(path: Path, rows: list[object]) -> None: + lines = [] + for row in rows: + if isinstance(row, str): + lines.append(row) + else: + lines.append(json.dumps(row)) + path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def check_bool(name: str, value: bool) -> tuple[str, dict]: + return ( + name, + { + "code": 0 if value else 1, + "stdout": "", + "stderr": "" if value else "boolean check failed", + }, + ) + + +def main() -> None: + reset_state() + + transcript_dir = TEMP / f"fact-hook-transcripts-{SESSION}" + if transcript_dir.exists(): + shutil.rmtree(transcript_dir) + transcript_dir.mkdir(parents=True, exist_ok=True) + results = [] prompt_payload = { "session_id": SESSION, @@ -92,9 +132,7 @@ def main() -> None: results.append(("stop_blocks_unstructured_after_verification", run("verification_stop_gate.py", unverified_stop))) results.append(("stop_allows_structured_verified", run("verification_stop_gate.py", structured_stop))) - if STATE.exists(): - shutil.rmtree(STATE) - STATE.mkdir(parents=True, exist_ok=True) + reset_state() run("fact_prompt_gate.py", prompt_payload) caveated_stop = { "session_id": SESSION, @@ -102,9 +140,7 @@ def main() -> None: } results.append(("stop_blocks_caveat_without_attempt", run("verification_stop_gate.py", caveated_stop))) - if STATE.exists(): - shutil.rmtree(STATE) - STATE.mkdir(parents=True, exist_ok=True) + reset_state() run("fact_prompt_gate.py", prompt_payload) searched = { "session_id": SESSION, @@ -114,9 +150,7 @@ def main() -> None: results.append(("track_web_search", run("track_verification.py", searched))) results.append(("stop_allows_websearch_verified", run("verification_stop_gate.py", structured_stop))) - if STATE.exists(): - shutil.rmtree(STATE) - STATE.mkdir(parents=True, exist_ok=True) + reset_state() run("fact_prompt_gate.py", narrative_prompt_payload) missing_message_stop = { "session_id": SESSION, @@ -124,15 +158,109 @@ def main() -> None: } results.append(("stop_blocks_missing_message", run("verification_stop_gate.py", missing_message_stop))) + reset_state() + run("fact_prompt_gate.py", prompt_payload) + clarifying_stop = { + "session_id": SESSION, + "last_assistant_message": "Could you clarify which Claude Code host version and date range you want checked?", + } + results.append(("stop_allows_clarifying_question", run("verification_stop_gate.py", clarifying_stop))) + + nested_transcript = transcript_dir / "nested-assistant.jsonl" + nested_transcript_message = "The latest hook schema includes Stop, PreToolUse, PostToolUse, and UserPromptSubmit." + write_transcript( + nested_transcript, + [ + "", + "not-json", + {"role": "user", "content": "What is the latest Claude Code hook schema?"}, + { + "type": "assistant_message", + "message": { + "content": [ + {"type": "thinking", "text": "I should verify this."}, + {"type": "text", "text": nested_transcript_message}, + ] + }, + }, + ], + ) + results.append( + check_bool( + "extracts_nested_transcript_assistant_message", + get_last_assistant_message({"transcript_path": str(nested_transcript)}) == nested_transcript_message, + ) + ) + + reset_state() + run("fact_prompt_gate.py", prompt_payload) + results.append( + ( + "stop_blocks_nested_transcript_assistant", + run( + "verification_stop_gate.py", + {"session_id": SESSION, "transcript_path": str(nested_transcript)}, + ), + ) + ) + + malformed_transcript = transcript_dir / "malformed-only.jsonl" + write_transcript( + malformed_transcript, + [ + "", + "{not json", + {"role": "user", "content": "What is the latest Claude Code hook schema?"}, + {"type": "assistant_message", "message": {"content": []}}, + ], + ) + reset_state() + run("fact_prompt_gate.py", prompt_payload) + results.append( + ( + "stop_allows_malformed_transcript_when_active", + run( + "verification_stop_gate.py", + { + "session_id": SESSION, + "transcript_path": str(malformed_transcript), + "stop_hook_active": True, + }, + ), + ) + ) + + results.extend( + [ + check_bool( + "detects_unable_to_verify_caveat", + response_has_verification_caveat("I was unable to verify this from reliable sources."), + ), + check_bool( + "detects_best_effort_caveat", + response_has_verification_caveat("This is a best-effort answer based on currently available information."), + ), + check_bool( + "does_not_treat_plain_answer_as_caveat", + not response_has_verification_caveat("This answer is verified and final."), + ), + check_bool( + "detects_non_assertive_clarifying_question", + response_is_non_assertive("Could you clarify which release channel you mean?"), + ), + ] + ) + py_compile = subprocess.run( [ - "python", + PYTHON, "-m", "py_compile", str(HOOKS / "common.py"), str(HOOKS / "fact_prompt_gate.py"), str(HOOKS / "track_verification.py"), str(HOOKS / "verification_stop_gate.py"), + str(ROOT / "scripts" / "smoke_test.py"), ], text=True, capture_output=True, @@ -161,9 +289,18 @@ def main() -> None: expect("track_web_search", result_map["track_web_search"], lambda item: item["code"] == 0, failures) expect("stop_allows_websearch_verified", result_map["stop_allows_websearch_verified"], lambda item: item["code"] == 0 and not item["stdout"], failures) expect("stop_blocks_missing_message", result_map["stop_blocks_missing_message"], lambda item: "\"decision\": \"block\"" in item["stdout"], failures) + expect("stop_allows_clarifying_question", result_map["stop_allows_clarifying_question"], lambda item: item["code"] == 0 and not item["stdout"], failures) + expect("extracts_nested_transcript_assistant_message", result_map["extracts_nested_transcript_assistant_message"], lambda item: item["code"] == 0, failures) + expect("stop_blocks_nested_transcript_assistant", result_map["stop_blocks_nested_transcript_assistant"], lambda item: "\"decision\": \"block\"" in item["stdout"], failures) + expect("stop_allows_malformed_transcript_when_active", result_map["stop_allows_malformed_transcript_when_active"], lambda item: item["code"] == 0 and not item["stdout"], failures) + expect("detects_unable_to_verify_caveat", result_map["detects_unable_to_verify_caveat"], lambda item: item["code"] == 0, failures) + expect("detects_best_effort_caveat", result_map["detects_best_effort_caveat"], lambda item: item["code"] == 0, failures) + expect("does_not_treat_plain_answer_as_caveat", result_map["does_not_treat_plain_answer_as_caveat"], lambda item: item["code"] == 0, failures) + expect("detects_non_assertive_clarifying_question", result_map["detects_non_assertive_clarifying_question"], lambda item: item["code"] == 0, failures) expect("py_compile", result_map["py_compile"], lambda item: item["code"] == 0, failures) print(json.dumps(results, indent=2)) + shutil.rmtree(transcript_dir, ignore_errors=True) if failures: raise SystemExit("\n".join(failures))