From 51f272cf222cd59cf89edc8b38cce81e6049bcd9 Mon Sep 17 00:00:00 2001
From: rinaldofesta <festarinaldo@gmail.com>
Date: Mon, 1 Jun 2026 16:31:02 +0200
Subject: [PATCH 1/2] ds4-eval: fix integer/regrade grader bugs + golden
 self-tests

Two model-free correctness fixes in the answer grader, locked by new
--self-test-extractors cases (no model/GPU needed):

- find_integer_answer: the answer-marker path took the first integer in
  the window, so an answer line that shows its arithmetic
  ("Answer: m+n = 256+37 = 293") was graded as the first summand (256)
  instead of the stated total (293) -> false negative. Reachable on the
  many embedded AIME2025 "Find m+n / a+b+c" cases. The scan is now
  restricted to the answer line and, when it shows arithmetic, reads the
  value after the last '=' (falling back to the line's first integer if
  no integer follows that '='). "Final answer: 082" -> "82" and the loose
  fallback are preserved.

- regrade_trace_file: every case with a MODEL_OUTPUT block was graded,
  including interrupted ones (STOPPED/SKIPPED/SWITCHED/ERROR). Their
  partial output inflated passed/failed and raised spurious "changed"
  drift. Grading is now factored into regrade_case_outcome(), which only
  grades PASSED/FAILED (and legacy status-less) traces; others are
  reported via a new not_graded counter.

Tested: make (Metal) and make cpu build clean (-Wall -Wextra);
./ds4-eval --self-test-extractors passes; regrade of a 3-case trace
(1 PASSED, 2 STOPPED) reports passed=1 not_graded=2 changed=0
(was passed=2 failed=1 changed=1).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ds4_eval.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 100 insertions(+), 5 deletions(-)

diff --git a/ds4_eval.c b/ds4_eval.c
index ecb4881c5..429568b58 100644
--- a/ds4_eval.c
+++ b/ds4_eval.c
@@ -2709,7 +2709,17 @@ static void find_integer_answer(const char *generated, char *dst, size_t dstlen)
     if (answer) {
         const char *end = answer + strlen(answer);
         if (strlen(answer) > 160) end = answer + 160;
-        if (scan_first_integer(answer, end, dst, dstlen)) return;
+        /* Restrict to the final answer line: digits after it (continued
+         * reasoning, footnotes, years) must not override the answer. */
+        const char *nl = memchr(answer, '\n', (size_t)(end - answer));
+        if (nl) end = nl;
+        /* When the line shows arithmetic ("m+n = 256+37 = 293") the stated
+         * result is the right-hand side of the LAST '='. Otherwise the first
+         * integer on the line is the answer (keeps "Final answer: 082"). */
+        const char *eq = NULL;
+        for (const char *r = answer; r < end; r++) if (*r == '=') eq = r;
+        if (scan_first_integer(eq ? eq + 1 : answer, end, dst, dstlen)) return;
+        if (eq && scan_first_integer(answer, end, dst, dstlen)) return;
     }
 
     const char *last_start = NULL;
@@ -3076,6 +3086,32 @@ static char *trace_copy_model_output(const char *case_start, const char *case_en
     return out;
 }
 
+typedef enum {              /* How regrade_case_outcome() classified a case. */
+    REGRADE_NOT_GRADED,     /* traced status is not PASSED/FAILED/empty */
+    REGRADE_PASSED,         /* graded; answer_matches() accepted the answer */
+    REGRADE_FAILED,         /* graded; answer_matches() rejected the answer */
+} regrade_outcome;
+
+/* Classify one trace case for regrading. Completed runs are traced as PASSED
+ * or FAILED, and legacy traces (written before the status line) have an empty
+ * status: both are graded with find_case_answer() + answer_matches(). Any
+ * other status (STOPPED/SKIPPED/SWITCHED/ERROR) carries partial or no model
+ * output, so got is emptied and the case is reported as REGRADE_NOT_GRADED. */
+static regrade_outcome regrade_case_outcome(const eval_case *tc,
+                                            const char *traced_status,
+                                            const char *model_output,
+                                            char *got, size_t gotlen) {
+    const bool legacy = traced_status[0] == '\0';
+    const bool completed = strcmp(traced_status, "PASSED") == 0 ||
+                           strcmp(traced_status, "FAILED") == 0;
+    if (legacy || completed) {
+        find_case_answer(tc, model_output, got, gotlen);
+        return answer_matches(tc, got) ? REGRADE_PASSED : REGRADE_FAILED;
+    }
+    if (gotlen) got[0] = '\0';
+    return REGRADE_NOT_GRADED;
+}
+
 static int regrade_trace_file(const char *path) {
     size_t len = 0;
     char *text = read_text_file(path, &len);
@@ -3089,6 +3125,7 @@ static int regrade_trace_file(const char *path) {
     int changed = 0;
     int unknown = 0;
     int parse_errors = 0;
+    int not_graded = 0;
 
     while (true) {
         const char *case_start = trace_find_next_case(start, end);
@@ -3130,8 +3167,14 @@ static int regrade_trace_file(const char *path) {
         }
 
         char got[EVAL_ANSWER_MAX];
-        find_case_answer(tc, model_output, got, sizeof(got));
-        bool ok = answer_matches(tc, got);
+        regrade_outcome outcome =
+            regrade_case_outcome(tc, traced_status, model_output, got, sizeof(got));
+        if (outcome == REGRADE_NOT_GRADED) {
+            not_graded++;
+            free(model_output);
+            continue;
+        }
+        bool ok = (outcome == REGRADE_PASSED);
         if (ok) passed++;
         else failed++;
 
@@ -3148,8 +3191,8 @@ static int regrade_trace_file(const char *path) {
         free(model_output);
     }
 
-    printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d unknown=%d parse_errors=%d\n",
-           total, path, passed, failed, changed, unknown, parse_errors);
+    printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d not_graded=%d unknown=%d parse_errors=%d\n",
+           total, path, passed, failed, changed, not_graded, unknown, parse_errors);
     free(text);
     return (unknown || parse_errors || total == 0) ? 1 : 0;
 }
@@ -3217,10 +3260,37 @@ static int trace_copy_self_test_case(void) {
     return 0;
 }
 
+static int regrade_status_self_test_case(void) {
+    /* Same well-formed output for every row; only the traced status varies. */
+    static const struct {
+        const char *status;
+        regrade_outcome want;
+        const char *complaint;
+    } rows[] = {
+        {"PASSED", REGRADE_PASSED,       /* completed run: regraded */
+         "ds4-eval: regrade self-test failed: PASSED trace not regraded\n"},
+        {"STOPPED", REGRADE_NOT_GRADED,  /* interrupted: never counted */
+         "ds4-eval: regrade self-test failed: STOPPED trace was graded\n"},
+        {"", REGRADE_PASSED,             /* legacy trace, no status line */
+         "ds4-eval: regrade self-test failed: legacy empty status not graded\n"},
+    };
+    const eval_case integer = {.source = "AIME2025", .answer = "82"};
+    char got[EVAL_ANSWER_MAX];
+    int failed = 0;
+    for (size_t i = 0; i < sizeof(rows) / sizeof(rows[0]); i++) {
+        if (regrade_case_outcome(&integer, rows[i].status, "</think>Answer: 82",
+                                 got, sizeof(got)) == rows[i].want) continue;
+        fputs(rows[i].complaint, stderr);
+        failed++;
+    }
+    return failed;
+}
+
 static int run_extractor_self_tests(void) {
     int failed = 0;
 
     failed += trace_copy_self_test_case();
+    failed += regrade_status_self_test_case();
 
     const eval_case mc = {
         .source = "SuperGPQA",
@@ -3281,6 +3351,31 @@ static int run_extractor_self_tests(void) {
         "Answer: 10",
         "10");
 
+    /* --- Regression cases for answer-extractor false negatives. These guard
+     *     against the grader under-reporting model accuracy on well-formed
+     *     final-answer lines. --- */
+
+    /* Integer: when the answer line shows the arithmetic, the graded value
+     * must be the stated result (right of the last '='), not the first
+     * summand/factor; digits on later lines must not leak in either. Many
+     * embedded AIME2025 cases ask for a derived sum (m+n, a+b+c, ...). */
+    const eval_case int_293 = {.source = "AIME2025", .answer = "293"};
+    failed += extractor_self_test_case(
+        "integer: sum line grades the total, not the first summand",
+        &int_293, "</think>Answer: m+n = 256+37 = 293", "293");
+    const eval_case int_62 = {.source = "AIME2025", .answer = "62"};
+    failed += extractor_self_test_case(
+        "integer: three-term sum line grades the total",
+        &int_62, "</think>Answer: a+b+c = 12+25+25 = 62", "62");
+    const eval_case int_81 = {.source = "AIME2025", .answer = "81"};
+    failed += extractor_self_test_case(
+        "integer: product-sum line grades the result, not the first factor",
+        &int_81, "</think>Answer: 2*7 + 3*6 + 5*4 + 7*5 = 81", "81");
+    const eval_case int_82 = {.source = "AIME2025", .answer = "82"};
+    failed += extractor_self_test_case(
+        "integer: digits on a later line do not override the answer line",
+        &int_82, "</think>Answer: 82\nThe value 2025 is just the year.", "82");
+
     if (failed) return 1;
     printf("ds4-eval: answer extractor self-tests passed\n");
     return 0;

From d043fddc5d92bb7b17b1ad13a2028cf6525a06c8 Mon Sep 17 00:00:00 2001
From: rinaldofesta <festarinaldo@gmail.com>
Date: Mon, 1 Jun 2026 16:31:50 +0200
Subject: [PATCH 2/2] ds4-eval: stop a leading pronoun/article from shadowing
 the MC answer

find_answer_letter returned the first boundary-isolated in-range capital
after "Answer:", so on 10-choice (A-J) cases a leading English pronoun or
article was graded as the choice:

    "Answer: I think it is C"        -> I   (should be C)
    "Answer: I'll go with C."        -> I   (should be C)
    "Answer: A careful reading ... C" -> A   (should be C)

24 embedded cases are 10-choice, so this is reachable. The forward scan
now skips a standalone capital that is followed by an apostrophe (a
contraction) or by spaces/tabs and then a lowercase letter (prose
continuing on the same line); the reverse scan still recovers it when it
is the only candidate ("Answer: A is correct"), and a genuine standalone
answer ("Answer: I.") is unchanged.

Known limitation: a distractor explicitly rejected *before* the chosen
letter on the same line ("... rules out C, leaving D") is still misread;
that needs sentence-level parsing and is left for discussion.

Locked by new --self-test-extractors cases (model-free). All prior
self-tests and the integer/regrade cases continue to pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ds4_eval.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/ds4_eval.c b/ds4_eval.c
index 429568b58..6e24883db 100644
--- a/ds4_eval.c
+++ b/ds4_eval.c
@@ -2656,7 +2656,19 @@ static char find_answer_letter(const char *generated, int nchoices) {
             if (c >= 'A' && c <= max_answer) {
                 char before = p == visible ? ' ' : p[-1];
                 char after = p[1];
-                if (is_letter_boundary(before, after)) return c;
+                if (!is_letter_boundary(before, after)) continue;
+                /* A standalone capital that begins a same-line English word
+                 * ("A careful ...") or a contraction ("I'll ...") is prose,
+                 * not the model's pick: the real letter comes later on the
+                 * line. Skip it; the reverse scan below still recovers it if
+                 * it was the only candidate (e.g. "Answer: A is correct"). */
+                if (after == '\'') continue;
+                if (after == ' ' || after == '\t') {
+                    const char *w = p + 1;
+                    while (*w == ' ' || *w == '\t') w++;
+                    if (islower((unsigned char)*w)) continue;
+                }
+                return c;
             }
         }
     }
@@ -3355,6 +3367,44 @@ static int run_extractor_self_tests(void) {
      *     against the grader under-reporting model accuracy on well-formed
      *     final-answer lines. --- */
 
+    /* Multiple choice: a standalone in-range capital that merely begins an
+     * English word ("I think", "A careful") or a contraction ("I'll") must
+     * not shadow the choice the model actually states later on the line.
+     * Only reachable when the stray letter is itself a valid option, i.e.
+     * 10-choice (A-J) cases, of which the embedded set has 24. */
+    const eval_case mc_c = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
+        .answer = "C",
+    };
+    failed += extractor_self_test_case(
+        "MC: leading pronoun 'I' does not shadow the chosen letter",
+        &mc_c, "</think>Answer: I think it is C", "C");
+    failed += extractor_self_test_case(
+        "MC: contraction I'll does not shadow the chosen letter",
+        &mc_c, "</think>Answer: I'll go with C.", "C");
+    failed += extractor_self_test_case(
+        "MC: leading article 'A' does not shadow the chosen letter",
+        &mc_c, "</think>Answer: A careful reading shows C.", "C");
+
+    const eval_case mc_i = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
+        .answer = "I",
+    };
+    failed += extractor_self_test_case(
+        "MC: a genuine standalone 'I' answer is still picked",
+        &mc_i, "</think>Answer: I.", "I");
+
+    const eval_case mc4_d = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D"},
+        .answer = "D",
+    };
+    failed += extractor_self_test_case(
+        "MC: out-of-range pronoun is harmless on 4-choice cases",
+        &mc4_d, "</think>Answer: I think it is D", "D");
+
     /* Integer: when the answer line shows the arithmetic, the graded value
      * must be the stated result (right of the last '='), not the first
      * summand/factor; digits on later lines must not leak in either. Many