diff --git a/ds4_eval.c b/ds4_eval.c
index 5c2559263..0f0709f38 100644
--- a/ds4_eval.c
+++ b/ds4_eval.c
@@ -2684,6 +2684,62 @@ static char *find_last_answer_marker(const char *visible) {
     return last ? last : strcasestr_local(visible, "answer");
 }
 
+/* Returns true when the word immediately before the candidate letter at
+ * `letter` is an explicit rejection cue -- "not B", "isn't B",
+ * "rules out C", "eliminate E" -- so the letter is a distractor the model is
+ * discarding, not its pick.  This overrides the lexical "first valid letter
+ * wins" default only for clear elimination phrasing; the cue set is kept
+ * small and high-precision on purpose, and there is no general
+ * selection-vs-rejection sentence parsing (issue #321).  The look-back never
+ * reads before `start` and gives up at a newline.  In "D, not B" only B is
+ * vetoed: the check judges `letter` alone and never looks at later text. */
+static bool mc_letter_is_negated(const char *start, const char *letter) {
+    const char *p = letter;
+    /* Step back over the gap to the previous word: spaces, tabs, commas and
+     * semicolons only, and never across a line break. */
+    while (p > start) {
+        char c = p[-1];
+        if (c == '\n') return false;
+        if (c == ' ' || c == '\t' || c == ',' || c == ';') p--;
+        else break;
+    }
+    /* Read the immediately preceding word (letters/apostrophe), lowercased. */
+    const char *wend = p;
+    while (p > start && (isalpha((unsigned char)p[-1]) || p[-1] == '\'')) p--;
+    size_t wlen = (size_t)(wend - p);
+    if (wlen == 0 || wlen >= 16) return false; /* no word, or too long for w[] */
+    char w[16];
+    for (size_t i = 0; i < wlen; i++) w[i] = (char)tolower((unsigned char)p[i]);
+    w[wlen] = '\0';
+
+    /* Contraction form: any word ending in "n't" (isn't / doesn't / can't). */
+    if (wlen >= 3 && strcmp(w + wlen - 3, "n't") == 0) return true;
+
+    static const char *cues[] = {
+        "not", "except", "excluding", "exclude", "excludes",
+        "eliminate", "eliminates", "eliminated",
+        "reject", "rejects", "rejected", "rejecting", NULL
+    };
+    for (int i = 0; cues[i]; i++) if (strcmp(w, cues[i]) == 0) return true;
+
+    /* Two-word cue: "rule out" / "rules out" / "ruled out". */
+    if (strcmp(w, "out") == 0) {
+        const char *q = p; /* p is the first character of "out" here */
+        while (q > start && (q[-1] == ' ' || q[-1] == '\t')) q--; /* gap: blanks only */
+        const char *rend = q;
+        while (q > start && isalpha((unsigned char)q[-1])) q--;
+        size_t rl = (size_t)(rend - q);
+        if (rl && rl < 8) { /* non-empty and fits r[] with its terminator */
+            char r[8];
+            for (size_t i = 0; i < rl; i++) r[i] = (char)tolower((unsigned char)q[i]);
+            r[rl] = '\0';
+            if (!strcmp(r, "rule") || !strcmp(r, "rules") || !strcmp(r, "ruled"))
+                return true;
+        }
+    }
+    return false;
+}
+
 static char find_answer_letter(const char *generated, int nchoices) {
     if (nchoices <= 0) return '?';
     const char *visible = strstr(generated, "</think>");
@@ -2699,7 +2755,22 @@ static char find_answer_letter(const char *generated, int nchoices) {
             if (c >= 'A' && c <= max_answer) {
                 char before = p == visible ? ' ' : p[-1];
                 char after = p[1];
-                if (is_letter_boundary(before, after)) return c;
+                if (!is_letter_boundary(before, after)) continue;
+                /* A standalone capital that begins a same-line English word
+                 * ("A careful ...") or a contraction ("I'll ...") is prose,
+                 * not the model's pick: the real letter comes later on the
+                 * line. Skip it; the reverse scan below still recovers it if
+                 * it was the only candidate (e.g. "Answer: A is correct"). */
+                if (after == '\'') continue;
+                if (after == ' ' || after == '\t') {
+                    const char *w = p + 1;
+                    while (*w == ' ' || *w == '\t') w++;
+                    if (islower((unsigned char)*w)) continue;
+                }
+                /* A distractor explicitly rejected before the pick on the same
+                 * line ("not B, ... D") must not win over the real choice. */
+                if (mc_letter_is_negated(answer, p)) continue;
+                return c;
             }
         }
     }
@@ -2752,7 +2823,17 @@ static void find_integer_answer(const char *generated, char *dst, size_t dstlen)
     if (answer) {
         const char *end = answer + strlen(answer);
         if (strlen(answer) > 160) end = answer + 160;
-        if (scan_first_integer(answer, end, dst, dstlen)) return;
+        /* Restrict to the final answer line: digits after it (continued
+         * reasoning, footnotes, years) must not override the answer. */
+        const char *nl = memchr(answer, '\n', (size_t)(end - answer));
+        if (nl) end = nl;
+        /* When the line shows arithmetic ("m+n = 256+37 = 293") the stated
+         * result is the right-hand side of the LAST '='. Otherwise the first
+         * integer on the line is the answer (keeps "Final answer: 082"). */
+        const char *eq = NULL;
+        for (const char *r = answer; r < end; r++) if (*r == '=') eq = r;
+        if (scan_first_integer(eq ? eq + 1 : answer, end, dst, dstlen)) return;
+        if (eq && scan_first_integer(answer, end, dst, dstlen)) return;
     }
 
     const char *last_start = NULL;
@@ -3119,6 +3200,32 @@ static char *trace_copy_model_output(const char *case_start, const char *case_en
     return out;
 }
 
+typedef enum {
+    REGRADE_NOT_GRADED, /* traced status is not empty/PASSED/FAILED: case skipped */
+    REGRADE_PASSED,     /* case was graded and answer_matches() accepted it */
+    REGRADE_FAILED,     /* case was graded and answer_matches() rejected it */
+} regrade_outcome;
+
+/* Decide how one trace case regrades. Only an empty, PASSED or FAILED
+ * traced_status is graded (find_case_answer fills `got`, answer_matches picks
+ * PASSED/FAILED); any other status, e.g. STOPPED/SKIPPED/SWITCHED/ERROR, yields
+ * REGRADE_NOT_GRADED with `got` emptied, so partial output neither inflates the
+ * totals nor raises "changed" drift. Empty status = legacy trace, still graded. */
+static regrade_outcome regrade_case_outcome(const eval_case *tc,
+                                            const char *traced_status,
+                                            const char *model_output,
+                                            char *got, size_t gotlen) {
+    bool gradeable = traced_status[0] == '\0' || /* legacy: no status line */
+                     !strcmp(traced_status, "PASSED") ||
+                     !strcmp(traced_status, "FAILED");
+    if (!gradeable) {
+        if (gotlen) got[0] = '\0'; /* leave a valid empty string for the caller */
+        return REGRADE_NOT_GRADED;
+    }
+    find_case_answer(tc, model_output, got, gotlen);
+    return answer_matches(tc, got) ? REGRADE_PASSED : REGRADE_FAILED;
+}
+
 static int regrade_trace_file(const char *path) {
     size_t len = 0;
     char *text = read_text_file(path, &len);
@@ -3132,6 +3239,7 @@ static int regrade_trace_file(const char *path) {
     int changed = 0;
     int unknown = 0;
     int parse_errors = 0;
+    int not_graded = 0;
 
     while (true) {
         const char *case_start = trace_find_next_case(start, end);
@@ -3173,8 +3281,14 @@ static int regrade_trace_file(const char *path) {
         }
 
         char got[EVAL_ANSWER_MAX];
-        find_case_answer(tc, model_output, got, sizeof(got));
-        bool ok = answer_matches(tc, got);
+        regrade_outcome outcome =
+            regrade_case_outcome(tc, traced_status, model_output, got, sizeof(got));
+        if (outcome == REGRADE_NOT_GRADED) {
+            not_graded++;
+            free(model_output);
+            continue;
+        }
+        bool ok = (outcome == REGRADE_PASSED);
         if (ok) passed++;
         else failed++;
 
@@ -3191,8 +3305,8 @@ static int regrade_trace_file(const char *path) {
         free(model_output);
     }
 
-    printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d unknown=%d parse_errors=%d\n",
-           total, path, passed, failed, changed, unknown, parse_errors);
+    printf("ds4-eval: regraded %d cases from %s: passed=%d failed=%d changed=%d not_graded=%d unknown=%d parse_errors=%d\n",
+           total, path, passed, failed, changed, not_graded, unknown, parse_errors);
     free(text);
     return (unknown || parse_errors || total == 0) ? 1 : 0;
 }
@@ -3260,10 +3374,37 @@ static int trace_copy_self_test_case(void) {
     return 0;
 }
 
+static int regrade_status_self_test_case(void) { /* returns the number of failed checks */
+    int failed = 0;
+    const eval_case integer = {.source = "AIME2025", .answer = "82"}; /* shared fixture */
+    char got[EVAL_ANSWER_MAX];
+
+    if (regrade_case_outcome(&integer, "PASSED", "</think>Answer: 82",
+                             got, sizeof(got)) != REGRADE_PASSED) {
+        fprintf(stderr, "ds4-eval: regrade self-test failed: PASSED trace not regraded\n");
+        failed++;
+    }
+    /* An interrupted run whose partial output happens to look correct must not
+     * be counted or flagged as drift: same output as above, STOPPED status. */
+    if (regrade_case_outcome(&integer, "STOPPED", "</think>Answer: 82",
+                             got, sizeof(got)) != REGRADE_NOT_GRADED) {
+        fprintf(stderr, "ds4-eval: regrade self-test failed: STOPPED trace was graded\n");
+        failed++;
+    }
+    /* Legacy traces without a status line (empty status) stay gradeable. */
+    if (regrade_case_outcome(&integer, "", "</think>Answer: 82",
+                             got, sizeof(got)) != REGRADE_PASSED) {
+        fprintf(stderr, "ds4-eval: regrade self-test failed: legacy empty status not graded\n");
+        failed++;
+    }
+    return failed;
+}
+
 static int run_extractor_self_tests(void) {
     int failed = 0;
 
     failed += trace_copy_self_test_case();
+    failed += regrade_status_self_test_case();
 
     const eval_case mc = {
         .source = "SuperGPQA",
@@ -3324,6 +3465,93 @@ static int run_extractor_self_tests(void) {
         "Answer: 10",
         "10");
 
+    /* --- Regression cases for answer-extractor false negatives. These guard
+     *     against the grader under-reporting model accuracy on well-formed
+     *     final-answer lines. --- */
+
+    /* Multiple choice: a standalone in-range capital that merely begins an
+     * English word ("I think", "A careful") or a contraction ("I'll") must
+     * not shadow the choice the model actually states later on the line.
+     * The 'I' forms are only reachable when 'I' is itself a valid option,
+     * i.e. 10-choice (A-J) cases (the embedded set has 24); 'A' always is. */
+    const eval_case mc_c = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
+        .answer = "C",
+    };
+    failed += extractor_self_test_case(
+        "MC: leading pronoun 'I' does not shadow the chosen letter",
+        &mc_c, "</think>Answer: I think it is C", "C");
+    failed += extractor_self_test_case(
+        "MC: contraction I'll does not shadow the chosen letter",
+        &mc_c, "</think>Answer: I'll go with C.", "C");
+    failed += extractor_self_test_case(
+        "MC: leading article 'A' does not shadow the chosen letter",
+        &mc_c, "</think>Answer: A careful reading shows C.", "C");
+
+    const eval_case mc_i = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
+        .answer = "I",
+    };
+    failed += extractor_self_test_case(
+        "MC: a genuine standalone 'I' answer is still picked",
+        &mc_i, "</think>Answer: I.", "I");
+
+    const eval_case mc4_d = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D"},
+        .answer = "D",
+    };
+    failed += extractor_self_test_case(
+        "MC: out-of-range pronoun is harmless on 4-choice cases",
+        &mc4_d, "</think>Answer: I think it is D", "D");
+
+    /* A distractor the model explicitly rejects *before* stating its pick on
+     * the same line must not shadow the real choice ("not B, ... D" /
+     * "rules out C, leaving D"). The bare first-valid-letter rule grabbed the
+     * rejected letter; a small, high-precision negation-cue skip fixes it.
+     * "D, not B" is the guard against the naive "take the last letter" fix:
+     * the pick is reached and accepted before the rejected distractor. #321 */
+    const eval_case mc_d = {
+        .source = "SuperGPQA",
+        .choice = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"},
+        .answer = "D",
+    };
+    failed += extractor_self_test_case(
+        "MC: 'not B' distractor before the pick does not shadow it",
+        &mc_d, "</think>Answer: It is not B, the answer is D", "D");
+    failed += extractor_self_test_case(
+        "MC: 'rules out C, leaving D' grades the surviving choice",
+        &mc_d, "</think>Answer: rules out C, leaving D", "D");
+    failed += extractor_self_test_case(
+        "MC: contraction negation (isn't B) before the pick is skipped",
+        &mc_d, "</think>Answer: It isn't B, so D", "D");
+    failed += extractor_self_test_case(
+        "MC: a leading pick followed by 'not B' is still graded as the pick",
+        &mc_d, "</think>Answer: D, not B", "D");
+
+    /* Integer: when the answer line shows the arithmetic, the graded value
+     * must be the stated result (right of the last '='), not the first
+     * summand/factor; digits on later lines must not leak in either. Many
+     * embedded AIME2025 cases ask for a derived sum (m+n, a+b+c, ...). */
+    const eval_case int_293 = {.source = "AIME2025", .answer = "293"};
+    failed += extractor_self_test_case(
+        "integer: sum line grades the total, not the first summand",
+        &int_293, "</think>Answer: m+n = 256+37 = 293", "293");
+    const eval_case int_62 = {.source = "AIME2025", .answer = "62"};
+    failed += extractor_self_test_case(
+        "integer: three-term sum line grades the total",
+        &int_62, "</think>Answer: a+b+c = 12+25+25 = 62", "62");
+    const eval_case int_81 = {.source = "AIME2025", .answer = "81"};
+    failed += extractor_self_test_case(
+        "integer: product-sum line grades the result, not the first factor",
+        &int_81, "</think>Answer: 2*7 + 3*6 + 5*4 + 7*5 = 81", "81");
+    const eval_case int_82 = {.source = "AIME2025", .answer = "82"};
+    failed += extractor_self_test_case(
+        "integer: digits on a later line do not override the answer line",
+        &int_82, "</think>Answer: 82\nThe value 2025 is just the year.", "82");
+
     if (failed) return 1;
     printf("ds4-eval: answer extractor self-tests passed\n");
     return 0;