Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 151 additions & 6 deletions ds4_eval.c
Original file line number Diff line number Diff line change
Expand Up @@ -2656,7 +2656,19 @@ static char find_answer_letter(const char *generated, int nchoices) {
if (c >= 'A' && c <= max_answer) {
char before = p == visible ? ' ' : p[-1];
char after = p[1];
if (is_letter_boundary(before, after)) return c;
if (!is_letter_boundary(before, after)) continue;
/* A standalone capital that begins a same-line English word
* ("A careful ...") or a contraction ("I'll ...") is prose,
* not the model's pick: the real letter comes later on the
* line. Skip it; the reverse scan below still recovers it if
* it was the only candidate (e.g. "Answer: A is correct"). */
if (after == '\'') continue;
if (after == ' ' || after == '\t') {
const char *w = p + 1;
while (*w == ' ' || *w == '\t') w++;
if (islower((unsigned char)*w)) continue;
}
return c;
}
}
}
Expand Down Expand Up @@ -2709,7 +2721,17 @@ static void find_integer_answer(const char *generated, char *dst, size_t dstlen)
if (answer) {
const char *end = answer + strlen(answer);
if (strlen(answer) > 160) end = answer + 160;
if (scan_first_integer(answer, end, dst, dstlen)) return;
/* Restrict to the final answer line: digits after it (continued
* reasoning, footnotes, years) must not override the answer. */
const char *nl = memchr(answer, '\n', (size_t)(end - answer));
if (nl) end = nl;
/* When the line shows arithmetic ("m+n = 256+37 = 293") the stated
* result is the right-hand side of the LAST '='. Otherwise the first
* integer on the line is the answer (keeps "Final answer: 082"). */
const char *eq = NULL;
for (const char *r = answer; r < end; r++) if (*r == '=') eq = r;
if (scan_first_integer(eq ? eq + 1 : answer, end, dst, dstlen)) return;
if (eq && scan_first_integer(answer, end, dst, dstlen)) return;
}

const char *last_start = NULL;
Expand Down Expand Up @@ -3076,6 +3098,32 @@ static char *trace_copy_model_output(const char *case_start, const char *case_en
return out;
}

/* Result of regrading a single trace case. */
typedef enum {
    /* The traced status was neither empty, "PASSED" nor "FAILED", so the
     * case was left out of the pass/fail totals. */
    REGRADE_NOT_GRADED = 0,
    /* The answer extracted from the model output matched the expected one. */
    REGRADE_PASSED = 1,
    /* The answer extracted from the model output did not match. */
    REGRADE_FAILED = 2,
} regrade_outcome;

/* Classify one trace case for regrading.
 *
 * A case is regraded only when its traced status is "PASSED", "FAILED", or
 * the empty string (legacy traces written before the status line existed).
 * Any other status (e.g. STOPPED/SKIPPED/SWITCHED/ERROR) marks a run that did
 * not complete; such cases carry partial or no model output, so they are
 * reported as not-graded instead of being counted, which would inflate the
 * totals and raise spurious "changed" drift.
 *
 * tc             case being regraded
 * traced_status  status string recorded in the trace; must be non-NULL
 * model_output   model text the answer is extracted from
 * got, gotlen    receives the extracted answer; set to "" when the case is
 *                not graded (and gotlen is non-zero)
 *
 * Returns REGRADE_NOT_GRADED, REGRADE_PASSED or REGRADE_FAILED. */
static regrade_outcome regrade_case_outcome(const eval_case *tc,
                                            const char *traced_status,
                                            const char *model_output,
                                            char *got, size_t gotlen) {
    const bool legacy_trace = (*traced_status == '\0');
    const bool completed_run = legacy_trace ||
                               strcmp(traced_status, "PASSED") == 0 ||
                               strcmp(traced_status, "FAILED") == 0;

    if (completed_run) {
        find_case_answer(tc, model_output, got, gotlen);
        if (answer_matches(tc, got)) {
            return REGRADE_PASSED;
        }
        return REGRADE_FAILED;
    }

    /* Leave the caller with an empty answer rather than stale contents. */
    if (gotlen > 0) {
        got[0] = '\0';
    }
    return REGRADE_NOT_GRADED;
}

static int regrade_trace_file(const char *path) {
size_t len = 0;
char *text = read_text_file(path, &len);
Expand All @@ -3089,6 +3137,7 @@ static int regrade_trace_file(const char *path) {
int changed = 0;
int unknown = 0;
int parse_errors = 0;
int not_graded = 0;

while (true) {
const char *case_start = trace_find_next_case(start, end);
Expand Down Expand Up @@ -3130,8 +3179,14 @@ static int regrade_trace_file(const char *path) {
}

char got[EVAL_ANSWER_MAX];
find_case_answer(tc, model_output, got, sizeof(got));
bool ok = answer_matches(tc, got);
regrade_outcome outcome =
regrade_case_outcome(tc, traced_status, model_output, got, sizeof(got));
if (outcome == REGRADE_NOT_GRADED) {
not_graded++;
free(model_output);
continue;
}
bool ok = (outcome == REGRADE_PASSED);
if (ok) passed++;
else failed++;

Expand All @@ -3148,8 +3203,8 @@ static int regrade_trace_file(const char *path) {
free(model_output);
}

printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d unknown=%d parse_errors=%d\n",
total, path, passed, failed, changed, unknown, parse_errors);
printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d not_graded=%d unknown=%d parse_errors=%d\n",
total, path, passed, failed, changed, not_graded, unknown, parse_errors);
free(text);
return (unknown || parse_errors || total == 0) ? 1 : 0;
}
Expand Down Expand Up @@ -3217,10 +3272,37 @@ static int trace_copy_self_test_case(void) {
return 0;
}

static int regrade_status_self_test_case(void) {
int failed = 0;
const eval_case integer = {.source = "AIME2025", .answer = "82"};
char got[EVAL_ANSWER_MAX];

if (regrade_case_outcome(&integer, "PASSED", "</think>Answer: 82",
got, sizeof(got)) != REGRADE_PASSED) {
fprintf(stderr, "ds4-eval: regrade self-test failed: PASSED trace not regraded\n");
failed++;
}
/* An interrupted run whose partial output happens to look correct must not
* be counted or flagged as drift. */
if (regrade_case_outcome(&integer, "STOPPED", "</think>Answer: 82",
got, sizeof(got)) != REGRADE_NOT_GRADED) {
fprintf(stderr, "ds4-eval: regrade self-test failed: STOPPED trace was graded\n");
failed++;
}
/* Legacy traces without a status line stay gradeable. */
if (regrade_case_outcome(&integer, "", "</think>Answer: 82",
got, sizeof(got)) != REGRADE_PASSED) {
fprintf(stderr, "ds4-eval: regrade self-test failed: legacy empty status not graded\n");
failed++;
}
return failed;
}

static int run_extractor_self_tests(void) {
int failed = 0;

failed += trace_copy_self_test_case();
failed += regrade_status_self_test_case();

const eval_case mc = {
.source = "SuperGPQA",
Expand Down Expand Up @@ -3281,6 +3363,69 @@ static int run_extractor_self_tests(void) {
"Answer: 10",
"10");

/* --- Regression cases for answer-extractor false negatives. These guard
* against the grader under-reporting model accuracy on well-formed
* final-answer lines. --- */

/* Multiple choice: a standalone in-range capital that merely begins an
* English word ("I think", "A careful") or a contraction ("I'll") must
* not shadow the choice the model actually states later on the line.
* Only reachable when the stray letter is itself a valid option, i.e.
* 10-choice (A-J) cases, of which the embedded set has 24. */
const eval_case mc_c = {
.source = "SuperGPQA",
.choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
.answer = "C",
};
failed += extractor_self_test_case(
"MC: leading pronoun 'I' does not shadow the chosen letter",
&mc_c, "</think>Answer: I think it is C", "C");
failed += extractor_self_test_case(
"MC: contraction I'll does not shadow the chosen letter",
&mc_c, "</think>Answer: I'll go with C.", "C");
failed += extractor_self_test_case(
"MC: leading article 'A' does not shadow the chosen letter",
&mc_c, "</think>Answer: A careful reading shows C.", "C");

const eval_case mc_i = {
.source = "SuperGPQA",
.choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
.answer = "I",
};
failed += extractor_self_test_case(
"MC: a genuine standalone 'I' answer is still picked",
&mc_i, "</think>Answer: I.", "I");

const eval_case mc4_d = {
.source = "SuperGPQA",
.choice = {"A", "B", "C", "D"},
.answer = "D",
};
failed += extractor_self_test_case(
"MC: out-of-range pronoun is harmless on 4-choice cases",
&mc4_d, "</think>Answer: I think it is D", "D");

/* Integer: when the answer line shows the arithmetic, the graded value
* must be the stated result (right of the last '='), not the first
* summand/factor; digits on later lines must not leak in either. Many
* embedded AIME2025 cases ask for a derived sum (m+n, a+b+c, ...). */
const eval_case int_293 = {.source = "AIME2025", .answer = "293"};
failed += extractor_self_test_case(
"integer: sum line grades the total, not the first summand",
&int_293, "</think>Answer: m+n = 256+37 = 293", "293");
const eval_case int_62 = {.source = "AIME2025", .answer = "62"};
failed += extractor_self_test_case(
"integer: three-term sum line grades the total",
&int_62, "</think>Answer: a+b+c = 12+25+25 = 62", "62");
const eval_case int_81 = {.source = "AIME2025", .answer = "81"};
failed += extractor_self_test_case(
"integer: product-sum line grades the result, not the first factor",
&int_81, "</think>Answer: 2*7 + 3*6 + 5*4 + 7*5 = 81", "81");
const eval_case int_82 = {.source = "AIME2025", .answer = "82"};
failed += extractor_self_test_case(
"integer: digits on a later line do not override the answer line",
&int_82, "</think>Answer: 82\nThe value 2025 is just the year.", "82");

if (failed) return 1;
printf("ds4-eval: answer extractor self-tests passed\n");
return 0;
Expand Down