Skip to content

Batch improvements#16

Open
sergeyf wants to merge 10 commits into
main from
codex/current-changes
Open

Batch improvements#16
sergeyf wants to merge 10 commits into
main from
codex/current-changes

Conversation

@sergeyf

@sergeyf sergeyf commented Jun 23, 2026

Copy link
Copy Markdown
Collaborator

Here is the self-contained Python version of the 856/971 abstain-biased rule:

from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Literal

from sinonym.coretypes import BatchParseResult, NameFormat, ParseResult

# The three route labels a RoutingDecision may carry.
Route = Literal["pp", "vys", "not_person"]

# A token is a run of Unicode letters (digits and "_" excluded), optionally
# followed by exactly one "-" or "'" and another run of letters.
TOKEN_RE = re.compile(
    r"[^\W\d_]+(?:[-'][^\W\d_]+)?",
    re.UNICODE,
)


@dataclass(frozen=True)
class RoutingDecision:
    """Immutable outcome of routing one name between two candidate parses.

    Instances are created by ``choose_pp_or_vys``, which passes the fields
    positionally in the order declared below.
    """

    # Which branch was taken: "pp", "vys", or "not_person".
    route: Route
    # The parse result that was selected; None when route is "not_person".
    result: ParseResult | None
    # Short label naming the rule that produced this decision
    # (e.g. "pp_vys_same", "vys_backoff_rule", "default_pp").
    reason: str
    # True when the selected batch's evidence for this name chose
    # NameFormat.GIVEN_FIRST; always False for "not_person".
    preserves_input_order: bool


def _result_text(result: ParseResult) -> str:
    """Return the parsed value as a string, or "" for an unsuccessful parse."""
    if not result.success:
        return ""
    return str(result.result)


def _token_count(text: str) -> int:
    """Count the TOKEN_RE matches in *text*; empty or None text counts as 0."""
    tokens = TOKEN_RE.findall(text) if text else []
    return len(tokens)


def _is_garbage(pp_result: ParseResult, vys_result: ParseResult) -> bool:
    """Tell whether the pair of parses should be rejected outright.

    The pair is rejected when either parse failed, or when either parsed
    text contains more than four tokens.
    """
    # A failure on either side is enough; token counts are not consulted.
    if not (pp_result.success and vys_result.success):
        return True
    return any(
        _token_count(_result_text(parsed)) > 4
        for parsed in (pp_result, vys_result)
    )


def _preserves_input_order(batch: BatchParseResult, index: int) -> bool:
    """Report whether the evidence at *index* selected NameFormat.GIVEN_FIRST."""
    evidence = batch.name_order_evidence[index]
    return evidence.selected_format == NameFormat.GIVEN_FIRST


def choose_pp_or_vys(
    pp_batch: BatchParseResult,
    pp_index: int,
    vys_batch: BatchParseResult,
    vys_index: int,
) -> RoutingDecision:
    """Route one name to the "pp" parse, the "vys" parse, or an abstention.

    Checks are applied in this order:

    1. Abstain ("not_person") when ``_is_garbage`` rejects the pair.
    2. Keep "pp" when both parses render to the same text.
    3. Switch to "vys" when any one of three back-off rules fires.
    4. Otherwise keep "pp".

    Args:
        pp_batch: Batch holding the "pp" candidate parse and its evidence.
        pp_index: Position of the name inside ``pp_batch``.
        vys_batch: Batch holding the "vys" candidate parse and its evidence.
        vys_index: Position of the name inside ``vys_batch``.

    Returns:
        A ``RoutingDecision`` naming the route, the selected parse (None when
        abstaining), the reason label, and the input-order flag.
    """
    pp_parse = pp_batch.results[pp_index]
    vys_parse = vys_batch.results[vys_index]
    pp_ev = pp_batch.name_order_evidence[pp_index]
    vys_ev = vys_batch.name_order_evidence[vys_index]

    if _is_garbage(pp_parse, vys_parse):
        return RoutingDecision("not_person", None, "garbage_or_failed_parse", False)

    def keep_pp(reason: str) -> RoutingDecision:
        # Both "pp" outcomes differ only in their reason label.
        return RoutingDecision(
            "pp",
            pp_parse,
            reason,
            _preserves_input_order(pp_batch, pp_index),
        )

    if _result_text(pp_parse) == _result_text(vys_parse):
        return keep_pp("pp_vys_same")

    pp_pattern = pp_batch.format_pattern
    pp_margin = pp_pattern.vote_margin
    pp_ct = pp_pattern.total_count
    vys_pattern = vys_batch.format_pattern
    vys_margin = vys_pattern.vote_margin

    # A missing frequency is treated as 0.0.
    pp_freq = pp_ev.selected_surname_frequency or 0.0
    vys_freq = vys_ev.selected_surname_frequency or 0.0
    # The ratio is pinned to 0.0 when the pp frequency is not positive, so
    # the frequency-ratio rule below cannot fire in that case.
    if pp_freq > 0:
        vys_over_pp_freq_ratio = vys_freq / pp_freq
    else:
        vys_over_pp_freq_ratio = 0.0

    # True only when both sides picked an endpoint position and picked
    # opposite ones ("first" versus "last").
    endpoints = {"first", "last"}
    pp_position = pp_ev.selected_surname_position
    vys_position = vys_ev.selected_surname_position
    endpoint_disagrees = (
        pp_position in endpoints
        and vys_position in endpoints
        and pp_position != vys_position
    )

    # The rules are tried in sequence so that a later rule's inputs are only
    # read when every earlier rule has failed.
    if endpoint_disagrees and vys_over_pp_freq_ratio >= 5 and vys_margin >= 0.55:
        use_vys = True
    elif (
        pp_ct <= 3
        and vys_pattern.dominant_format == NameFormat.GIVEN_FIRST
        and vys_margin >= 0.65
    ):
        use_vys = True
    else:
        use_vys = pp_margin <= 0.2 and vys_margin >= 0.35

    if not use_vys:
        return keep_pp("default_pp")

    return RoutingDecision(
        "vys",
        vys_parse,
        "vys_backoff_rule",
        _preserves_input_order(vys_batch, vys_index),
    )

Usage shape:

# Route one name; an abstention ("not_person") produces no output.
decision = choose_pp_or_vys(pp_batch, pp_index, vys_batch, vys_index)
output = None if decision.route == "not_person" else decision.result

@sergeyf sergeyf changed the title [WIP] Batch improvements Batch improvements Jun 23, 2026
@sergeyf sergeyf requested a review from atalyaalon June 23, 2026 19:39
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant