diff --git a/ds4_eval.c b/ds4_eval.c
index 5c2559263..0f0709f38 100644
--- a/ds4_eval.c
+++ b/ds4_eval.c
@@ -2684,6 +2684,62 @@ static char *find_last_answer_marker(const char *visible) {
     return last ? last : strcasestr_local(visible, "answer");
 }
 
+/* True when the in-range capital at `letter` is the object of an explicit
+ * rejection earlier on the same answer line -- "not B", "isn't B",
+ * "rules out C", "eliminate E" -- so it is a distractor the model is
+ * discarding, not its pick. This overrides only the lexical "first valid
+ * letter wins" default for clear elimination phrasing; the cue set is kept
+ * small and high-precision on purpose, and there is no general
+ * selection-vs-rejection sentence parsing (issue #321). It never looks before
+ * `start` or across a newline, so "D, not B" still grades D: the pick is
+ * reached and accepted before the rejected distractor is ever inspected. */
+static bool mc_letter_is_negated(const char *start, const char *letter) {
+    const char *p = letter;
+    /* Step back over the gap to the previous word: spaces and light separating
+     * punctuation only, and never across a line break. */
+    while (p > start) {
+        char c = p[-1];
+        if (c == '\n') return false;
+        if (c == ' ' || c == '\t' || c == ',' || c == ';') p--;
+        else break;
+    }
+    /* Read the immediately preceding word (letters/apostrophe), lowercased. */
+    const char *wend = p;
+    while (p > start && (isalpha((unsigned char)p[-1]) || p[-1] == '\'')) p--;
+    size_t wlen = (size_t)(wend - p);
+    if (wlen == 0 || wlen >= 16) return false; /* must fit w[] with its NUL */
+    char w[16];
+    for (size_t i = 0; i < wlen; i++) w[i] = (char)tolower((unsigned char)p[i]);
+    w[wlen] = '\0';
+
+    /* Contraction form: isn't / aren't / wasn't / doesn't / won't / can't. */
+    if (wlen >= 3 && strcmp(w + wlen - 3, "n't") == 0) return true;
+
+    static const char *cues[] = {
+        "not", "except", "excluding", "exclude", "excludes",
+        "eliminate", "eliminates", "eliminated",
+        "reject", "rejects", "rejected", "rejecting", NULL
+    };
+    for (int i = 0; cues[i]; i++) if (strcmp(w, cues[i]) == 0) return true;
+
+    /* Two-word cue: "rule out" / "rules out" / "ruled out". */
+    if (strcmp(w, "out") == 0) {
+        const char *q = p;
+        while (q > start && (q[-1] == ' ' || q[-1] == '\t')) q--;
+        const char *rend = q;
+        while (q > start && isalpha((unsigned char)q[-1])) q--;
+        size_t rl = (size_t)(rend - q);
+        if (rl && rl < 8) {
+            char r[8];
+            for (size_t i = 0; i < rl; i++) r[i] = (char)tolower((unsigned char)q[i]);
+            r[rl] = '\0';
+            if (!strcmp(r, "rule") || !strcmp(r, "rules") || !strcmp(r, "ruled"))
+                return true;
+        }
+    }
+    return false;
+}
+
 static char find_answer_letter(const char *generated, int nchoices) {
     if (nchoices <= 0) return '?';
     const char *visible = strstr(generated, "");
@@ -2699,7 +2755,22 @@ static char find_answer_letter(const char *generated, int nchoices) {
             if (c >= 'A' && c <= max_answer) {
                 char before = p == visible ? ' ' : p[-1];
                 char after = p[1];
-                if (is_letter_boundary(before, after)) return c;
+                if (!is_letter_boundary(before, after)) continue;
+                /* Only 'A' and 'I' are one-letter English words, so only they
+                 * are skipped as prose ("A careful ...", "I think ..."), and an
+                 * apostrophe marks a contraction ("I'll") only when a lowercase
+                 * letter follows it, so a quoted pick ('C') is kept. The reverse
+                 * scan below still recovers a skipped letter if it was alone. */
+                if (after == '\'' && islower((unsigned char)p[2])) continue;
+                if ((c == 'A' || c == 'I') && (after == ' ' || after == '\t')) {
+                    const char *w = p + 1;
+                    while (*w == ' ' || *w == '\t') w++;
+                    if (islower((unsigned char)*w)) continue;
+                }
+                /* A distractor explicitly rejected before the pick on the same
+                 * line ("not B, ... D") must not win over the real choice. */
+                if (mc_letter_is_negated(answer, p)) continue;
+                return c;
             }
         }
     }
@@ -2752,7 +2823,17 @@ static void find_integer_answer(const char *generated, char *dst, size_t dstlen)
     if (answer) {
         const char *end = answer + strlen(answer);
         if (strlen(answer) > 160) end = answer + 160;
-        if (scan_first_integer(answer, end, dst, dstlen)) return;
+        /* Restrict to the final answer line: digits after it (continued
+         * reasoning, footnotes, years) must not override the answer. */
+        const char *nl = memchr(answer, '\n', (size_t)(end - answer));
+        if (nl) end = nl;
+        /* When the line shows arithmetic ("m+n = 256+37 = 293") the stated
+         * result is the right-hand side of the LAST '='. Otherwise the first
+         * integer on the line is the answer (keeps "Final answer: 082"). */
+        const char *eq = NULL;
+        for (const char *r = answer; r < end; r++) if (*r == '=') eq = r;
+        if (scan_first_integer(eq ? eq + 1 : answer, end, dst, dstlen)) return;
+        if (eq && scan_first_integer(answer, end, dst, dstlen)) return;
     }
 
     const char *last_start = NULL;
@@ -3119,6 +3200,32 @@ static char *trace_copy_model_output(const char *case_start, const char *case_en
     return out;
 }
 
+typedef enum {
+    REGRADE_NOT_GRADED, /* traced status is not empty/PASSED/FAILED: not counted */
+    REGRADE_PASSED,     /* re-extracted answer matches the expected answer */
+    REGRADE_FAILED,     /* re-extracted answer does not match */
+} regrade_outcome;
+
+/* Decide how one trace case regrades. Only PASSED/FAILED traces were graded by
+ * a completed run; STOPPED/SKIPPED/SWITCHED/ERROR cases carry partial or no
+ * model output and must be reported as not-graded rather than counted, since
+ * grading them inflates the totals and raises spurious "changed" drift. An
+ * empty status keeps legacy traces (written before the status line) working. */
+static regrade_outcome regrade_case_outcome(const eval_case *tc,
+                                            const char *traced_status,
+                                            const char *model_output,
+                                            char *got, size_t gotlen) {
+    bool gradeable = traced_status[0] == '\0' ||
+                     !strcmp(traced_status, "PASSED") ||
+                     !strcmp(traced_status, "FAILED");
+    if (!gradeable) {
+        if (gotlen) got[0] = '\0'; /* leave `got` as an empty string */
+        return REGRADE_NOT_GRADED;
+    }
+    find_case_answer(tc, model_output, got, gotlen);
+    return answer_matches(tc, got) ? REGRADE_PASSED : REGRADE_FAILED;
+}
+
 static int regrade_trace_file(const char *path) {
     size_t len = 0;
     char *text = read_text_file(path, &len);
@@ -3132,6 +3239,7 @@ static int regrade_trace_file(const char *path) {
     int changed = 0;
     int unknown = 0;
     int parse_errors = 0;
+    int not_graded = 0;
 
     while (true) {
         const char *case_start = trace_find_next_case(start, end);
@@ -3173,8 +3281,14 @@ static int regrade_trace_file(const char *path) {
         }
 
         char got[EVAL_ANSWER_MAX];
-        find_case_answer(tc, model_output, got, sizeof(got));
-        bool ok = answer_matches(tc, got);
+        regrade_outcome outcome =
+            regrade_case_outcome(tc, traced_status, model_output, got, sizeof(got));
+        if (outcome == REGRADE_NOT_GRADED) {
+            not_graded++;
+            free(model_output);
+            continue;
+        }
+        bool ok = (outcome == REGRADE_PASSED);
         if (ok) passed++;
         else failed++;
 
@@ -3191,8 +3305,8 @@ static int regrade_trace_file(const char *path) {
         free(model_output);
     }
 
-    printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d unknown=%d parse_errors=%d\n",
-           total, path, passed, failed, changed, unknown, parse_errors);
+    printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d not_graded=%d unknown=%d parse_errors=%d\n",
+           total, path, passed, failed, changed, not_graded, unknown, parse_errors);
     free(text);
     return (unknown || parse_errors || total == 0) ? 1 : 0;
 }
@@ -3260,10 +3374,37 @@ static int trace_copy_self_test_case(void) {
     return 0;
 }
 
+static int regrade_status_self_test_case(void) {
+    int failed = 0; /* number of failed checks; returned to the caller */
+    const eval_case integer = {.source = "AIME2025", .answer = "82"};
+    char got[EVAL_ANSWER_MAX];
+
+    if (regrade_case_outcome(&integer, "PASSED", "Answer: 82",
+                             got, sizeof(got)) != REGRADE_PASSED) {
+        fprintf(stderr, "ds4-eval: regrade self-test failed: PASSED trace not regraded\n");
+        failed++;
+    }
+    /* An interrupted run whose partial output happens to look correct must not
+     * be counted or flagged as drift. */
+    if (regrade_case_outcome(&integer, "STOPPED", "Answer: 82",
+                             got, sizeof(got)) != REGRADE_NOT_GRADED) {
+        fprintf(stderr, "ds4-eval: regrade self-test failed: STOPPED trace was graded\n");
+        failed++;
+    }
+    /* Legacy traces without a status line stay gradeable. */
+    if (regrade_case_outcome(&integer, "", "Answer: 82",
+                             got, sizeof(got)) != REGRADE_PASSED) {
+        fprintf(stderr, "ds4-eval: regrade self-test failed: legacy empty status not graded\n");
+        failed++;
+    }
+    return failed;
+}
+
 static int run_extractor_self_tests(void) {
     int failed = 0;
 
     failed += trace_copy_self_test_case();
+    failed += regrade_status_self_test_case();
 
     const eval_case mc = {
         .source = "SuperGPQA",
@@ -3324,6 +3465,96 @@ static int run_extractor_self_tests(void) {
         "Answer: 10",
         "10");
 
+    /* --- Regression cases for answer-extractor false negatives. These guard
+     * against the grader under-reporting model accuracy on well-formed
+     * final-answer lines. --- */
+
+    /* Multiple choice: a standalone 'A' or 'I' that is really an English
+     * word ("I think", "A careful") or starts a contraction ("I'll") must
+     * not shadow the choice the model actually states later on the line.
+     * Only reachable when the stray letter is itself a valid option, i.e.
+     * 10-choice (A-J) cases, of which the embedded set has 24. */
+    const eval_case mc_c = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
+        .answer = "C",
+    };
+    failed += extractor_self_test_case(
+        "MC: leading pronoun 'I' does not shadow the chosen letter",
+        &mc_c, "Answer: I think it is C", "C");
+    failed += extractor_self_test_case(
+        "MC: contraction I'll does not shadow the chosen letter",
+        &mc_c, "Answer: I'll go with C.", "C");
+    failed += extractor_self_test_case(
+        "MC: leading article 'A' does not shadow the chosen letter",
+        &mc_c, "Answer: A careful reading shows C.", "C");
+    failed += extractor_self_test_case(
+        "MC: a pick followed by lowercase prose is not skipped",
+        &mc_c, "Answer: C is better than D.", "C");
+
+    const eval_case mc_i = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
+        .answer = "I",
+    };
+    failed += extractor_self_test_case(
+        "MC: a genuine standalone 'I' answer is still picked",
+        &mc_i, "Answer: I.", "I");
+
+    const eval_case mc4_d = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D"},
+        .answer = "D",
+    };
+    failed += extractor_self_test_case(
+        "MC: out-of-range pronoun is harmless on 4-choice cases",
+        &mc4_d, "Answer: I think it is D", "D");
+
+    /* A distractor the model explicitly rejects *before* stating its pick on
+     * the same line must not shadow the real choice ("not B, ... D" /
+     * "rules out C, leaving D"). The bare first-valid-letter rule grabbed the
+     * rejected letter; a small, high-precision negation-cue skip fixes it.
+     * "D, not B" is the guard against the naive "take the last letter" fix:
+     * the pick is reached and accepted before the rejected distractor. #321 */
+    const eval_case mc_d = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
+        .answer = "D",
+    };
+    failed += extractor_self_test_case(
+        "MC: 'not B' distractor before the pick does not shadow it",
+        &mc_d, "Answer: It is not B, the answer is D", "D");
+    failed += extractor_self_test_case(
+        "MC: 'rules out C, leaving D' grades the surviving choice",
+        &mc_d, "Answer: rules out C, leaving D", "D");
+    failed += extractor_self_test_case(
+        "MC: contraction negation (isn't B) before the pick is skipped",
+        &mc_d, "Answer: It isn't B, so D", "D");
+    failed += extractor_self_test_case(
+        "MC: a leading pick followed by 'not B' is still graded as the pick",
+        &mc_d, "Answer: D, not B", "D");
+
+    /* Integer: when the answer line shows the arithmetic, the graded value
+     * must be the stated result (right of the last '='), not the first
+     * summand/factor; digits on later lines must not leak in either. Many
+     * embedded AIME2025 cases ask for a derived sum (m+n, a+b+c, ...). */
+    const eval_case int_293 = {.source = "AIME2025", .answer = "293"};
+    failed += extractor_self_test_case(
+        "integer: sum line grades the total, not the first summand",
+        &int_293, "Answer: m+n = 256+37 = 293", "293");
+    const eval_case int_62 = {.source = "AIME2025", .answer = "62"};
+    failed += extractor_self_test_case(
+        "integer: three-term sum line grades the total",
+        &int_62, "Answer: a+b+c = 12+25+25 = 62", "62");
+    const eval_case int_81 = {.source = "AIME2025", .answer = "81"};
+    failed += extractor_self_test_case(
+        "integer: product-sum line grades the result, not the first factor",
+        &int_81, "Answer: 2*7 + 3*6 + 5*4 + 7*5 = 81", "81");
+    const eval_case int_82 = {.source = "AIME2025", .answer = "82"};
+    failed += extractor_self_test_case(
+        "integer: digits on a later line do not override the answer line",
+        &int_82, "Answer: 82\nThe value 2025 is just the year.", "82");
+
     if (failed) return 1;
     printf("ds4-eval: answer extractor self-tests passed\n");
     return 0;