From 51f272cf222cd59cf89edc8b38cce81e6049bcd9 Mon Sep 17 00:00:00 2001
From: rinaldofesta
Date: Mon, 1 Jun 2026 16:31:02 +0200
Subject: [PATCH 1/2] ds4-eval: fix integer/regrade grader bugs + golden self-tests

Two model-free correctness fixes in the answer grader, locked by new
--self-test-extractors cases (no model/GPU needed):

- find_integer_answer: the answer-marker path took the first integer in
  the window, so an answer line that shows its arithmetic
  ("Answer: m+n = 256+37 = 293") was graded as the first summand (256)
  instead of the stated total (293) -> false negative. Reachable on the
  many embedded AIME2025 "Find m+n / a+b+c" cases. The scan is now bound
  to the answer line and, when it shows arithmetic, reads the value
  after the last '='. "Final answer: 082" -> "82" and the loose fallback
  are preserved.

- regrade_trace_file: every case with a MODEL_OUTPUT block was graded,
  including interrupted ones (STOPPED/SKIPPED/SWITCHED/ERROR). Their
  partial output inflated passed/failed and raised spurious "changed"
  drift. Grading is now factored into regrade_case_outcome(), which only
  grades PASSED/FAILED (and legacy status-less) traces; others are
  reported via a new not_graded counter.

Tested: make (Metal) and make cpu build clean (-Wall -Wextra);
./ds4-eval --self-test-extractors passes; regrade of a 3-case trace
(1 PASSED, 2 STOPPED) reports passed=1 not_graded=2 changed=0 (was
passed=2 failed=1 changed=1).

Co-Authored-By: Claude Opus 4.8 --- ds4_eval.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 5 deletions(-) diff --git a/ds4_eval.c b/ds4_eval.c index ecb4881c5..429568b58 100644 --- a/ds4_eval.c +++ b/ds4_eval.c @@ -2709,7 +2709,17 @@ static void find_integer_answer(const char *generated, char *dst, size_t dstlen) if (answer) { const char *end = answer + strlen(answer); if (strlen(answer) > 160) end = answer + 160; - if (scan_first_integer(answer, end, dst, dstlen)) return; + /* Restrict to the final answer line: digits after it (continued + * reasoning, footnotes, years) must not override the answer. */ + const char *nl = memchr(answer, '\n', (size_t)(end - answer)); + if (nl) end = nl; + /* When the line shows arithmetic ("m+n = 256+37 = 293") the stated + * result is the right-hand side of the LAST '='. Otherwise the first + * integer on the line is the answer (keeps "Final answer: 082"). */ + const char *eq = NULL; + for (const char *r = answer; r < end; r++) if (*r == '=') eq = r; + if (scan_first_integer(eq ? eq + 1 : answer, end, dst, dstlen)) return; + if (eq && scan_first_integer(answer, end, dst, dstlen)) return; } const char *last_start = NULL; @@ -3076,6 +3086,32 @@ static char *trace_copy_model_output(const char *case_start, const char *case_en return out; } +typedef enum { + REGRADE_NOT_GRADED, + REGRADE_PASSED, + REGRADE_FAILED, +} regrade_outcome; + +/* Decide how one trace case regrades. Only PASSED/FAILED traces were graded by + * a completed run; STOPPED/SKIPPED/SWITCHED/ERROR cases carry partial or no + * model output and must be reported as not-graded rather than counted, since + * grading them inflates the totals and raises spurious "changed" drift. An + * empty status keeps legacy traces (written before the status line) working. 
*/ +static regrade_outcome regrade_case_outcome(const eval_case *tc, + const char *traced_status, + const char *model_output, + char *got, size_t gotlen) { + bool gradeable = traced_status[0] == '\0' || + !strcmp(traced_status, "PASSED") || + !strcmp(traced_status, "FAILED"); + if (!gradeable) { + if (gotlen) got[0] = '\0'; + return REGRADE_NOT_GRADED; + } + find_case_answer(tc, model_output, got, gotlen); + return answer_matches(tc, got) ? REGRADE_PASSED : REGRADE_FAILED; +} + static int regrade_trace_file(const char *path) { size_t len = 0; char *text = read_text_file(path, &len); @@ -3089,6 +3125,7 @@ static int regrade_trace_file(const char *path) { int changed = 0; int unknown = 0; int parse_errors = 0; + int not_graded = 0; while (true) { const char *case_start = trace_find_next_case(start, end); @@ -3130,8 +3167,14 @@ static int regrade_trace_file(const char *path) { } char got[EVAL_ANSWER_MAX]; - find_case_answer(tc, model_output, got, sizeof(got)); - bool ok = answer_matches(tc, got); + regrade_outcome outcome = + regrade_case_outcome(tc, traced_status, model_output, got, sizeof(got)); + if (outcome == REGRADE_NOT_GRADED) { + not_graded++; + free(model_output); + continue; + } + bool ok = (outcome == REGRADE_PASSED); if (ok) passed++; else failed++; @@ -3148,8 +3191,8 @@ static int regrade_trace_file(const char *path) { free(model_output); } - printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d unknown=%d parse_errors=%d\n", - total, path, passed, failed, changed, unknown, parse_errors); + printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d not_graded=%d unknown=%d parse_errors=%d\n", + total, path, passed, failed, changed, not_graded, unknown, parse_errors); free(text); return (unknown || parse_errors || total == 0) ? 
1 : 0; } @@ -3217,10 +3260,37 @@ static int trace_copy_self_test_case(void) { return 0; } +static int regrade_status_self_test_case(void) { + int failed = 0; + const eval_case integer = {.source = "AIME2025", .answer = "82"}; + char got[EVAL_ANSWER_MAX]; + + if (regrade_case_outcome(&integer, "PASSED", "Answer: 82", + got, sizeof(got)) != REGRADE_PASSED) { + fprintf(stderr, "ds4-eval: regrade self-test failed: PASSED trace not regraded\n"); + failed++; + } + /* An interrupted run whose partial output happens to look correct must not + * be counted or flagged as drift. */ + if (regrade_case_outcome(&integer, "STOPPED", "Answer: 82", + got, sizeof(got)) != REGRADE_NOT_GRADED) { + fprintf(stderr, "ds4-eval: regrade self-test failed: STOPPED trace was graded\n"); + failed++; + } + /* Legacy traces without a status line stay gradeable. */ + if (regrade_case_outcome(&integer, "", "Answer: 82", + got, sizeof(got)) != REGRADE_PASSED) { + fprintf(stderr, "ds4-eval: regrade self-test failed: legacy empty status not graded\n"); + failed++; + } + return failed; +} + static int run_extractor_self_tests(void) { int failed = 0; failed += trace_copy_self_test_case(); + failed += regrade_status_self_test_case(); const eval_case mc = { .source = "SuperGPQA", @@ -3281,6 +3351,31 @@ static int run_extractor_self_tests(void) { "Answer: 10", "10"); + /* --- Regression cases for answer-extractor false negatives. These guard + * against the grader under-reporting model accuracy on well-formed + * final-answer lines. --- */ + + /* Integer: when the answer line shows the arithmetic, the graded value + * must be the stated result (right of the last '='), not the first + * summand/factor; digits on later lines must not leak in either. Many + * embedded AIME2025 cases ask for a derived sum (m+n, a+b+c, ...). 
*/ + const eval_case int_293 = {.source = "AIME2025", .answer = "293"}; + failed += extractor_self_test_case( + "integer: sum line grades the total, not the first summand", + &int_293, "Answer: m+n = 256+37 = 293", "293"); + const eval_case int_62 = {.source = "AIME2025", .answer = "62"}; + failed += extractor_self_test_case( + "integer: three-term sum line grades the total", + &int_62, "Answer: a+b+c = 12+25+25 = 62", "62"); + const eval_case int_81 = {.source = "AIME2025", .answer = "81"}; + failed += extractor_self_test_case( + "integer: product-sum line grades the result, not the first factor", + &int_81, "Answer: 2*7 + 3*6 + 5*4 + 7*5 = 81", "81"); + const eval_case int_82 = {.source = "AIME2025", .answer = "82"}; + failed += extractor_self_test_case( + "integer: digits on a later line do not override the answer line", + &int_82, "Answer: 82\nThe value 2025 is just the year.", "82"); + if (failed) return 1; printf("ds4-eval: answer extractor self-tests passed\n"); return 0; From d043fddc5d92bb7b17b1ad13a2028cf6525a06c8 Mon Sep 17 00:00:00 2001 From: rinaldofesta Date: Mon, 1 Jun 2026 16:31:50 +0200 Subject: [PATCH 2/2] ds4-eval: stop a leading pronoun/article from shadowing the MC answer find_answer_letter returned the first boundary-isolated in-range capital after "Answer:", so on 10-choice (A-J) cases a leading English pronoun or article was graded as the choice: "Answer: I think it is C" -> I (should be C) "Answer: I'll go with C." -> I (should be C) "Answer: A careful reading ... C" -> A (should be C) 24 embedded cases are 10-choice, so this is reachable. The forward scan now skips a standalone capital that begins a same-line word or a contraction; the reverse scan still recovers it when it is the only candidate ("Answer: A is correct"), and a genuine standalone answer ("Answer: I.") is unchanged. Known limitation: a distractor explicitly rejected *before* the chosen letter on the same line ("... 
rules out C, leaving D") is still misread; that needs sentence-level parsing and is left for discussion. Locked by new --self-test-extractors cases (model-free). All prior self-tests and the integer/regrade cases continue to pass. Co-Authored-By: Claude Opus 4.8 --- ds4_eval.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/ds4_eval.c b/ds4_eval.c index 429568b58..6e24883db 100644 --- a/ds4_eval.c +++ b/ds4_eval.c @@ -2656,7 +2656,19 @@ static char find_answer_letter(const char *generated, int nchoices) { if (c >= 'A' && c <= max_answer) { char before = p == visible ? ' ' : p[-1]; char after = p[1]; - if (is_letter_boundary(before, after)) return c; + if (!is_letter_boundary(before, after)) continue; + /* A standalone capital that begins a same-line English word + * ("A careful ...") or a contraction ("I'll ...") is prose, + * not the model's pick: the real letter comes later on the + * line. Skip it; the reverse scan below still recovers it if + * it was the only candidate (e.g. "Answer: A is correct"). */ + if (after == '\'') continue; + if (after == ' ' || after == '\t') { + const char *w = p + 1; + while (*w == ' ' || *w == '\t') w++; + if (islower((unsigned char)*w)) continue; + } + return c; } } } @@ -3355,6 +3367,44 @@ static int run_extractor_self_tests(void) { * against the grader under-reporting model accuracy on well-formed * final-answer lines. --- */ + /* Multiple choice: a standalone in-range capital that merely begins an + * English word ("I think", "A careful") or a contraction ("I'll") must + * not shadow the choice the model actually states later on the line. + * Only reachable when the stray letter is itself a valid option, i.e. + * 10-choice (A-J) cases, of which the embedded set has 24. 
*/ + const eval_case mc_c = { + .source = "SuperGPQA", + .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"}, + .answer = "C", + }; + failed += extractor_self_test_case( + "MC: leading pronoun 'I' does not shadow the chosen letter", + &mc_c, "Answer: I think it is C", "C"); + failed += extractor_self_test_case( + "MC: contraction I'll does not shadow the chosen letter", + &mc_c, "Answer: I'll go with C.", "C"); + failed += extractor_self_test_case( + "MC: leading article 'A' does not shadow the chosen letter", + &mc_c, "Answer: A careful reading shows C.", "C"); + + const eval_case mc_i = { + .source = "SuperGPQA", + .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"}, + .answer = "I", + }; + failed += extractor_self_test_case( + "MC: a genuine standalone 'I' answer is still picked", + &mc_i, "Answer: I.", "I"); + + const eval_case mc4_d = { + .source = "SuperGPQA", + .choice = {"A", "B", "C", "D"}, + .answer = "D", + }; + failed += extractor_self_test_case( + "MC: out-of-range pronoun is harmless on 4-choice cases", + &mc4_d, "Answer: I think it is D", "D"); + /* Integer: when the answer line shows the arithmetic, the graded value * must be the stated result (right of the last '='), not the first * summand/factor; digits on later lines must not leak in either. Many