diff --git a/README.md b/README.md index a417254..1605784 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,12 @@ Frequently used flags: | `--force` | Re-translate even if the output exists | | `-v, --verbose` | Show retry/validation warnings (hidden by default) | | `-o, --output` | Output path (single file only) | +| `--scan-budget` | Chars sent to the prepass scan (default **24000**). Lower on tight-context local models (~8k window); raise on large-context cloud models for full-file scans. | +| `--context-overlap` | Previous-batch source blocks shown as read-only context (default **2**, `0` to disable). Helps speaker continuity across batch boundaries. | +| `--no-review` | Disable the post-edit review pass. Saves one extra LLM call per batch — useful on metered providers. | +| `--no-refine-attribution` | Disable per-block speaker attribution for mixed-gender scenes (saves one small call per ambiguous scene). | + +The defaults are tuned for best translation quality. On metered cloud providers you can pass `--no-review` and/or `--no-refine-attribution` to cut LLM calls. On tight-context local models, lower `--scan-budget` (e.g. `8000`) so the scan prompt fits. Set `NO_COLOR=1` to disable ANSI colors; output auto-falls back to plain lines when piped. 
diff --git a/cli/core/batch_runner.py b/cli/core/batch_runner.py index c596f29..a1e56a3 100644 --- a/cli/core/batch_runner.py +++ b/cli/core/batch_runner.py @@ -8,15 +8,20 @@ import httpx +from .config import TranslationConfig +from .constants import ( + ATTEMPTS_BEFORE_SPLIT, + CRED_QUERY_PARAMS, + REQUEST_TIMEOUT_SECS, +) from .context_pass import FileContext +from .prompt import ( + REVIEW_SYSTEM_PROMPT, + SYSTEM_PROMPT, + build_review_user_message, + build_translate_user_message, +) from .srt_parser import SubtitleBlock, parse_lite, serialize_lite, validate_batch -from .config import TranslationConfig -from .prompt import SYSTEM_PROMPT - - -REQUEST_TIMEOUT_SECS = 120.0 - -_CRED_QUERY_PARAMS = {"key", "api_key", "apikey", "access_token"} class FileTranslationError(Exception): @@ -31,7 +36,7 @@ def sanitize_api_url(url: str) -> str: try: parts = urlsplit(url) kept = [(k, v) for k, v in parse_qsl(parts.query, keep_blank_values=True) - if k.lower() not in _CRED_QUERY_PARAMS] + if k.lower() not in CRED_QUERY_PARAMS] return urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(kept), parts.fragment)) except Exception: @@ -90,24 +95,35 @@ async def call_chat_api( return resp.json()["choices"][0]["message"]["content"] -def _build_user_message( +async def _review_pass( + client: httpx.AsyncClient, + batch: list[SubtitleBlock], + first_pass: list[SubtitleBlock], cfg: TranslationConfig, - batch_wire: str, file_context: FileContext | None, - batch: list[SubtitleBlock], -) -> str: - if cfg.source_lang: - header = f"Translate from {cfg.source_lang} to {cfg.target_lang}:" - else: - header = f"Translate to {cfg.target_lang}:" - if file_context is not None: - ctx = file_context.render_for_batch(batch) - if ctx: - return f"Glossary for this scene:\n{ctx}\n\n{header}\n\n{batch_wire}" - return f"{header}\n\n{batch_wire}" - - -_ATTEMPTS_BEFORE_SPLIT = 2 +) -> list[SubtitleBlock]: + """Re-check first-pass against the glossary; returns first-pass unchanged + if review output 
fails validation or there's no glossary to check against.""" + glossary = file_context.render_for_batch(batch) if file_context else "" + if not glossary: + return first_pass + user_msg = build_review_user_message(batch, first_pass, glossary) + try: + raw = await call_chat_api( + client, REVIEW_SYSTEM_PROMPT, user_msg, cfg, max(len(batch), 1) * 120) + except Exception as e: + cfg.warn(f" Review failed, keeping first-pass: {e}") + return first_pass + parsed = parse_lite(strip_markdown_fences(raw)) + if len(parsed) != len(batch): + return first_pass + revised = [ + SubtitleBlock(number=batch[i].number, + timestamp=batch[i].timestamp, + text=parsed[i].text) + for i in range(len(batch)) + ] + return revised if validate_batch(batch, revised).ok else first_pass async def translate_batch_with_retry( @@ -117,6 +133,7 @@ async def translate_batch_with_retry( cfg: TranslationConfig, file_context: FileContext | None = None, _split_path: str = "", + prev_tail: list[SubtitleBlock] | None = None, ) -> list[SubtitleBlock]: """Translate one batch; on repeated validation failure, halve and recurse. @@ -125,12 +142,15 @@ async def translate_batch_with_retry( because at N=1 a count mismatch is impossible. 
""" batch_wire = serialize_lite(batch) - user_msg = _build_user_message(cfg, batch_wire, file_context, batch) + glossary = file_context.render_for_batch(batch) if file_context else "" + user_msg = build_translate_user_message( + cfg.source_lang, cfg.target_lang, batch_wire, glossary, prev_tail or [], + ) label = f"Batch {batch_idx + 1}" + (f".{_split_path}" if _split_path else "") first_block = batch[0].number can_split = len(batch) > 1 - attempts = _ATTEMPTS_BEFORE_SPLIT if can_split else cfg.max_retries + attempts = ATTEMPTS_BEFORE_SPLIT if can_split else cfg.max_retries hit_validation_failure = False for attempt in range(1, attempts + 1): @@ -149,6 +169,10 @@ async def translate_batch_with_retry( ] check = validate_batch(batch, output) if check.ok: + if cfg.review: + output = await _review_pass( + client, batch, output, cfg, file_context, + ) return output hit_validation_failure = True cfg.warn(f" {label} validation failed ({tag}): {check.error}") @@ -185,9 +209,12 @@ async def translate_batch_with_retry( # Sequential: parallel halves would oversubscribe the outer semaphore. 
left_result = await translate_batch_with_retry( client, batch_idx, left, cfg, file_context, left_path, + prev_tail=prev_tail, ) + right_prev = left[-cfg.context_overlap:] if cfg.context_overlap else [] right_result = await translate_batch_with_retry( client, batch_idx, right, cfg, file_context, right_path, + prev_tail=right_prev, ) return left_result + right_result diff --git a/cli/core/config.py b/cli/core/config.py index 67b98bf..64badc0 100644 --- a/cli/core/config.py +++ b/cli/core/config.py @@ -4,8 +4,13 @@ from dataclasses import dataclass, field from typing import Callable - -DEFAULT_MAX_RETRIES = 5 +from .constants import ( + DEFAULT_BATCH_SIZE, + DEFAULT_CONCURRENCY, + DEFAULT_CONTEXT_OVERLAP, + DEFAULT_MAX_RETRIES, + DEFAULT_SCAN_CHAR_BUDGET, +) def _silent_warn(msg: str) -> None: @@ -25,9 +30,15 @@ class TranslationConfig: api_url: str api_key: str model: str | None = None - batch_size: int = 10 - concurrency: int = 1 + batch_size: int = DEFAULT_BATCH_SIZE + concurrency: int = DEFAULT_CONCURRENCY max_retries: int = DEFAULT_MAX_RETRIES + scan_char_budget: int = DEFAULT_SCAN_CHAR_BUDGET + context_overlap: int = DEFAULT_CONTEXT_OVERLAP + # One small LLM call per ambiguous scene; fixes cross-gender addressee slips. + refine_attribution: bool = True + # One extra call per batch; fixes gender/number/consistency slips. Doubles cost. 
+ review: bool = True quiet: bool = False verbose: bool = False warn: Callable[[str], None] = field(default=_silent_warn) diff --git a/cli/core/constants.py b/cli/core/constants.py new file mode 100644 index 0000000..2cffc47 --- /dev/null +++ b/cli/core/constants.py @@ -0,0 +1,24 @@ +"""Public defaults and tuning constants shared across CLI modules.""" + +# === Translation defaults (mirrored in TranslationConfig field defaults) === +DEFAULT_BATCH_SIZE = 10 +DEFAULT_CONCURRENCY = 1 +DEFAULT_MAX_RETRIES = 5 +# Sized for full-quality scans on typical TV episodes; lower on tight-context +# local models (~8k window), raise on large-context cloud models. +DEFAULT_SCAN_CHAR_BUDGET = 24_000 +DEFAULT_CONTEXT_OVERLAP = 2 + +# === Prepass / attribution scan === +SCAN_MAX_TOKENS = 3000 +# 2-char names collide with common target-language words. +MIN_NAME_LEN = 3 +# Single-block scenes never need per-block speaker attribution. +ATTRIB_MIN_BLOCKS = 3 + +# === Batch retry/split === +ATTEMPTS_BEFORE_SPLIT = 2 + +# === HTTP === +REQUEST_TIMEOUT_SECS = 120.0 +CRED_QUERY_PARAMS = frozenset({"key", "api_key", "apikey", "access_token"}) diff --git a/cli/core/context_pass.py b/cli/core/context_pass.py index 8b3dedf..bc093e8 100644 --- a/cli/core/context_pass.py +++ b/cli/core/context_pass.py @@ -1,56 +1,32 @@ -"""Prepass scan: extract cast, terms, and register from the whole file once -so every batch shares the same glossary. 
Fails silently to an empty context.""" +"""Prepass scan: one call extracts cast, terms, scenes, and register.""" from __future__ import annotations +import asyncio import re from dataclasses import dataclass, field import httpx from .config import TranslationConfig +from .constants import ATTRIB_MIN_BLOCKS, MIN_NAME_LEN, SCAN_MAX_TOKENS +from .prompt import ( + ATTRIBUTION_SYSTEM_PROMPT, + CONTEXT_SYSTEM_PROMPT, + build_attribution_user_message, + build_scan_user_message, +) from .srt_parser import SubtitleBlock -CONTEXT_SYSTEM_PROMPT = """\ -You analyze a subtitle file before it is translated. Return a compact glossary -for the translator to use when picking correct pronouns, consistent names, and -a single consistent register. - -Your reply MUST start with `<register>` and MUST contain all four sections -below, in this exact order, with no other text before, between, or after them. -No commentary. No code fences. No explanations. Tags only. - -<register> -ONE LINE describing the target-language variant and formality the translator should use for the ENTIRE file. -</register> -<characters> -NAME => TARGET_NAME | GENDER -</characters> -<terms> -SOURCE => TARGET -</terms> -<notes> -- NOTE -</notes> - -Rules: -- The <register> line names the specific target-language variant and formality (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Simplified Mandarin, neutral", "Japanese, polite です/ます form"). Pick ONE and commit to it for the whole file. Base the choice on the source's tone; default to the standard written form of the target language unless the source is clearly colloquial. -- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal at all. -- TARGET_NAME is how the character's name should appear in the target language (transliterated or localized). -- Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/tone. -- Leave a section empty (tags only) if nothing qualifies.
Never omit a section.\ -""" - -# Sized so small-context models (4k-8k) still have room for prompt + output. -_SCAN_CHAR_BUDGET = 12_000 -_SCAN_MAX_TOKENS = 1500 - - _SECTION_RE = re.compile( - r"<(?P<tag>register|characters|terms|notes)>\s*(?P<body>.*?)\s*</(?P=tag)>", + r"<(?P<tag>register|characters|terms|scenes|notes)>\s*" + r"(?P<body>.*?)\s*" + r"(?=</(?P=tag)>|<(?:register|characters|terms|scenes|notes)>|\Z)", re.I | re.S, ) +_SCENE_RANGE_RE = re.compile(r"^(\d+)\s*(?:-\s*(\d+))?$") +_ATTRIB_LINE_RE = re.compile(r"^\s*(\d+)\s*=\s*(.+?)\s*$") @@ -66,61 +42,164 @@ class TermHint: target: str +@dataclass +class SceneHint: + start: int + end: int + description: str + participants: list[str] = field(default_factory=list) + # Per-block speaker map (block_number -> character source name), filled + # by refine_scene_attribution. + attribution: dict[int, str] = field(default_factory=dict) + + @dataclass class FileContext: register: str = "" characters: list[CharacterHint] = field(default_factory=list) terms: list[TermHint] = field(default_factory=list) + scenes: list[SceneHint] = field(default_factory=list) notes: list[str] = field(default_factory=list) def is_empty(self) -> bool: - return not (self.register or self.characters or self.terms or self.notes) + return not (self.register or self.characters or self.terms + or self.scenes or self.notes) def render_for_batch(self, batch: list[SubtitleBlock]) -> str: - """Return a glossary slice scoped to names/terms present in this batch. - Register and notes are file-wide and always included if set.""" + """Glossary slice scoped to this batch.
Register/notes are file-wide.""" text = "\n".join(b.text for b in batch) - chars = [h for h in self.characters if _contains_word(text, h.source)] - terms = [h for h in self.terms if _contains_word(text, h.source)] - if not self.register and not chars and not terms and not self.notes: + scenes = _scenes_overlapping(self.scenes, batch) + # Include characters named in the batch AND scene participants — the + # latter covers speakers who address each other as "you" without + # vocatives, so the translator still learns their gender. + scene_names = {p for s in scenes for p in s.participants} + chars = [h for h in self.characters + if _find_word(text, h.source) >= 0 or h.source in scene_names] + terms = [h for h in self.terms if _find_word(text, h.source) >= 0] + if not (self.register or chars or terms or scenes or self.notes): return "" + gender_by = {h.source.casefold(): h.gender for h in self.characters} parts: list[str] = [] if self.register: parts.append(f"Target register: {self.register} (use consistently across every block)") if chars: - lines = [f"- {h.source} => {h.target} ({h.gender})" for h in chars] - parts.append("Characters:\n" + "\n".join(lines)) + parts.append("Characters:\n" + "\n".join( + f"- {h.source} => {h.target} ({h.gender})" for h in chars)) if terms: - lines = [f"- {h.source} => {h.target}" for h in terms] - parts.append("Terms:\n" + "\n".join(lines)) + parts.append("Terms:\n" + "\n".join( + f"- {h.source} => {h.target}" for h in terms)) + if scenes: + parts.append(_render_scenes(scenes, gender_by)) if self.notes: - lines = [f"- {n}" for n in self.notes[:4]] - parts.append("Notes:\n" + "\n".join(lines)) + parts.append("Notes:\n" + "\n".join(f"- {n}" for n in self.notes[:4])) return "\n\n".join(parts) -def _contains_word(text: str, word: str) -> bool: - if not word: - return False - return re.search(rf"(?<!\w){re.escape(word)}(?!\w)", text) is not None +def _scenes_overlapping( + scenes: list[SceneHint], batch: list[SubtitleBlock], +) ->
list[SceneHint]: + if not scenes or not batch: + return [] + first, last = batch[0].number, batch[-1].number + return [s for s in scenes if s.end >= first and s.start <= last] -def serialize_for_scan(blocks: list[SubtitleBlock]) -> str: - """Text for the scan pass. Stride-samples large files so characters - introduced late still have a chance to land in the glossary.""" - total_chars = sum(len(b.text) + 1 for b in blocks) - if total_chars <= _SCAN_CHAR_BUDGET or len(blocks) <= 1: - return "\n".join(b.text for b in blocks) +def _gender_mark(g: str | None) -> str: + return "M" if g == "male" else "F" if g == "female" else "" + + +def _render_scenes(scenes: list[SceneHint], gender_by: dict[str, str]) -> str: + lines: list[str] = [] + for s in scenes: + tagged = ", ".join( + f"{n} ({mark})" if (mark := _gender_mark(gender_by.get(n.casefold()))) else n + for n in s.participants + ) + prefix = f"- Blocks {s.start}-{s.end}:" + lines.append( + f"{prefix} [{tagged}] — {s.description}" if tagged + else f"{prefix} {s.description}") + if s.attribution: + speakers = " ".join(f"{n}={s.attribution[n]}" for n in sorted(s.attribution)) + lines.append(f" speakers: {speakers}") + return ( + "Scene guidance — each entry applies ONLY to its listed block range. " + "Participants and genders in [brackets]; a 'speakers:' line names the " + "speaker per block so you pick the right gender for the ADDRESSEE:\n" + + "\n".join(lines) + ) + - take_n = max(1, int(len(blocks) * _SCAN_CHAR_BUDGET / total_chars)) +def _find_word(text: str, word: str) -> int: + """Case-insensitive whole-word search with Unicode-aware boundaries. + Works for Latin, Arabic, CJK, etc. 
Returns first match index or -1.""" + if not text or not word: + return -1 + haystack, needle = text.casefold(), word.casefold() + nlen = len(needle) + i = 0 + while i <= len(haystack) - nlen: + j = haystack.find(needle, i) + if j < 0: + return -1 + before = text[j - 1] if j > 0 else "" + after = text[j + nlen] if j + nlen < len(text) else "" + # isalnum is Unicode-aware. + if not (before.isalnum() or before == "_") and not (after.isalnum() or after == "_"): + return j + i = j + 1 + return -1 + + +def _detect_participants( + text: str, characters: list[CharacterHint], +) -> list[str]: + """Source names whose source OR target form appears in `text` as a whole + word, in order of first appearance. Matches both forms because scan + descriptions often slip into the target language.""" + aliases: list[tuple[str, str]] = [] # (alias, source_name) + for h in characters: + if len(h.source) >= MIN_NAME_LEN: + aliases.append((h.source, h.source)) + if h.target != h.source and len(h.target) >= MIN_NAME_LEN: + aliases.append((h.target, h.source)) + aliases.sort(key=lambda a: len(a[0]), reverse=True) + + first_at: dict[str, int] = {} + for alias, name in aliases: + if name in first_at: + continue + idx = _find_word(text, alias) + if idx >= 0: + first_at[name] = idx + return sorted(first_at, key=first_at.__getitem__) + + +def _format_scan_line(b: SubtitleBlock) -> str: + return f"[{b.number}] " + b.text.replace("\n", " ") + + +def serialize_for_scan( + blocks: list[SubtitleBlock], char_budget: int, +) -> str: + """Text for the scan pass. 
Stride-samples large files so characters + introduced late still land in the glossary.""" + total = sum(len(_format_scan_line(b)) + 1 for b in blocks) + if total <= char_budget or len(blocks) <= 1: + return "\n".join(_format_scan_line(b) for b in blocks) + take_n = max(1, int(len(blocks) * char_budget / total)) step = len(blocks) / take_n sampled = [blocks[int(i * step)] for i in range(take_n)] - return "\n".join(b.text for b in sampled) + return "\n".join(_format_scan_line(b) for b in sampled) + + +def _strip_bullet(line: str) -> str: + return line.strip().lstrip("-*• ").strip() def parse_context_response(text: str) -> FileContext: - """Parse the tagged response. Tolerates extra whitespace and bullet markers.""" + """Parse the tagged response. Tolerates whitespace and bullet markers.""" sections = { m.group("tag").lower(): m.group("body") for m in _SECTION_RE.finditer(text or "") @@ -130,7 +209,7 @@ def parse_context_response(text: str) -> FileContext: characters: list[CharacterHint] = [] for line in sections.get("characters", "").splitlines(): - line = line.strip().lstrip("-*• ").strip() + line = _strip_bullet(line) if not line or "=>" not in line: continue src, rest = line.split("=>", 1) @@ -141,30 +220,43 @@ def parse_context_response(text: str) -> FileContext: tgt, gender = rest.strip(), "unknown" if gender not in ("male", "female", "unknown"): gender = "unknown" - src = src.strip() - if src and tgt: - characters.append(CharacterHint(src, tgt, gender)) + if src.strip() and tgt: + characters.append(CharacterHint(src.strip(), tgt, gender)) terms: list[TermHint] = [] for line in sections.get("terms", "").splitlines(): - line = line.strip().lstrip("-*• ").strip() + line = _strip_bullet(line) if not line or "=>" not in line: continue src, tgt = line.split("=>", 1) - src, tgt = src.strip(), tgt.strip() - if src and tgt: - terms.append(TermHint(src, tgt)) + if src.strip() and tgt.strip(): + terms.append(TermHint(src.strip(), tgt.strip())) - notes: list[str] = [] - 
for line in sections.get("notes", "").splitlines(): - line = line.strip().lstrip("-*• ").strip() - if line: - notes.append(line) + scenes: list[SceneHint] = [] + for line in sections.get("scenes", "").splitlines(): + line = _strip_bullet(line) + if not line or "=>" not in line: + continue + rng, desc = line.split("=>", 1) + m = _SCENE_RANGE_RE.match(rng.strip()) + if not m or not desc.strip(): + continue + start = int(m.group(1)) + end = int(m.group(2)) if m.group(2) else start + if end < start: + start, end = end, start + scenes.append(SceneHint( + start=start, end=end, description=desc.strip(), + participants=_detect_participants(desc, characters), + )) + + notes = [_strip_bullet(l) for l in sections.get("notes", "").splitlines() if _strip_bullet(l)] return FileContext( register=register, characters=characters[:20], terms=terms[:10], + scenes=scenes[:80], notes=notes[:4], ) @@ -174,22 +266,17 @@ async def extract_file_context( blocks: list[SubtitleBlock], cfg: TranslationConfig, ) -> FileContext: - """Run one scan call and return a FileContext. Empty on any failure.""" + """Run one scan call. 
Returns the parsed+enriched context.""" from .batch_runner import call_chat_api, strip_markdown_fences - source_line = f"Source language: {cfg.source_lang}\n" if cfg.source_lang else "" - user_message = ( - f"{source_line}" - f"Target language: {cfg.target_lang}\n\n" - f"{serialize_for_scan(blocks)}" + user_msg = build_scan_user_message( + cfg.source_lang, cfg.target_lang, + serialize_for_scan(blocks, cfg.scan_char_budget), ) try: raw = await call_chat_api( - client, - CONTEXT_SYSTEM_PROMPT, - user_message, - cfg, - max_tokens=_SCAN_MAX_TOKENS, + client, CONTEXT_SYSTEM_PROMPT, user_msg, cfg, + max_tokens=SCAN_MAX_TOKENS, ) except Exception as e: cfg.warn(f" Context scan failed, proceeding without: {e}") @@ -197,7 +284,104 @@ async def extract_file_context( context = parse_context_response(strip_markdown_fences(raw)) if context.is_empty(): - # Diagnostic snippet: helps tell whether the model ignored tags, truncated, or refused. snippet = (raw or "").strip().replace("\n", " ")[:240] cfg.warn(f" Context scan returned empty glossary. Raw start: {snippet!r}") + else: + enrich_scenes_with_block_text(context, blocks) + return context + + +def enrich_scenes_with_block_text( + context: FileContext, blocks: list[SubtitleBlock], +) -> FileContext: + """Reconcile scene participants with what's actually in the source blocks. 
+ Block-text names are primary truth: description-named participants are + kept only if grounded in the text, and any block-text names missed by the + description are appended.""" + if not context.scenes or not context.characters: + return context + by_num = {b.number: b for b in blocks} + for s in context.scenes: + joined = "\n".join( + by_num[n].text for n in range(s.start, s.end + 1) if n in by_num) + in_text = _detect_participants(joined, context.characters) + in_text_set = set(in_text) + kept = [p for p in s.participants if p in in_text_set] + seen = set(kept) + for name in in_text: + if name not in seen: + kept.append(name) + seen.add(name) + s.participants = kept return context + + +def _needs_attribution(scene: SceneHint, gender_by: dict[str, str]) -> bool: + return (scene.end - scene.start + 1 >= ATTRIB_MIN_BLOCKS + and len(scene.participants) >= 1) + + +async def _attribute_scene( + client: httpx.AsyncClient, + scene: SceneHint, + by_num: dict[int, SubtitleBlock], + cfg: TranslationConfig, + characters: list[CharacterHint], +) -> dict[int, str]: + from .batch_runner import call_chat_api + present = set(scene.participants) + roster = "\n".join( + f"- {h.source} ({_gender_mark(h.gender) or '?'})" + for h in characters if h.source in present + ) + block_lines = [ + f"[{n}] {by_num[n].text.replace(chr(10), ' ')}" + for n in range(scene.start, scene.end + 1) if n in by_num + ] + if not block_lines or not roster: + return {} + user_msg = build_attribution_user_message(roster, block_lines) + try: + raw = await call_chat_api( + client, ATTRIBUTION_SYSTEM_PROMPT, user_msg, cfg, + max_tokens=len(block_lines) * 20 + 100, + ) + except Exception as e: + cfg.warn(f" Attribution failed for blocks {scene.start}-{scene.end}: {e}") + return {} + out: dict[int, str] = {} + valid = {h.source for h in characters} | {"unknown"} + for line in (raw or "").splitlines(): + m = _ATTRIB_LINE_RE.match(line) + if not m: + continue + n = int(m.group(1)) + name = 
m.group(2).strip().strip('"\'') + if scene.start <= n <= scene.end and name in valid: + out[n] = name + return out + + +async def refine_scene_attribution( + client: httpx.AsyncClient, + context: FileContext, + blocks: list[SubtitleBlock], + cfg: TranslationConfig, +) -> None: + """Fill `SceneHint.attribution` for multi-block scenes with named + participants. One small LLM call per target scene, bounded by concurrency.""" + if not context.scenes or not context.characters: + return + gender_by = {h.source.casefold(): h.gender for h in context.characters} + targets = [s for s in context.scenes if _needs_attribution(s, gender_by)] + if not targets: + return + by_num = {b.number: b for b in blocks} + sem = asyncio.Semaphore(max(1, cfg.concurrency)) + + async def do(scene: SceneHint) -> None: + async with sem: + scene.attribution = await _attribute_scene( + client, scene, by_num, cfg, context.characters) + + await asyncio.gather(*(do(s) for s in targets)) diff --git a/cli/core/prompt.py b/cli/core/prompt.py index 45763af..eaba1fe 100644 --- a/cli/core/prompt.py +++ b/cli/core/prompt.py @@ -1,4 +1,10 @@ -"""The translation prompt, kept in one place so it can be iterated on.""" +"""All LLM-facing prompts and user-message builders, kept in one place so they +can be iterated on and reviewed alongside their counterparts.""" + +from __future__ import annotations + +from .srt_parser import SubtitleBlock, serialize_lite + SYSTEM_PROMPT = """\ You are a subtitle translator. You will receive numbered subtitle blocks (no timestamps) and translate them. @@ -15,8 +21,11 @@ - Translate each block independently — never combine split sentences. - Translate faithfully: profanity, slurs, slang — match the original register. - Conversational tone, concise — must fit the original timing. -- If a glossary is provided, use each character's listed gender when choosing pronouns and verb forms in the target language, and use the listed target-language name consistently. 
-- Use ONE consistent register and variant of the target language across every block. Do not switch dialects or formality between batches. If the target language has a standard written form (e.g., Modern Standard Arabic), use it by default unless the source is clearly colloquial. +- If a glossary is provided, use each character's listed gender for pronouns/verb forms, and the listed target-language name consistently. +- "Scene guidance" entries apply PER BLOCK RANGE only. Match the addressee's gender (not just the speaker's). For exactly-two referents addressed together, use the target's dual form if it has one. +- A `speakers:` line (e.g. `120=Alice 121=Alice 122=Bob`) names the speaker per block. The ADDRESSEE is usually the other named participant — use the addressee's gender (from [brackets]) for second-person forms. +- "Previous context" blocks (if shown) are read-only — infer speaker/addressee from them, do NOT translate or include them. +- Use ONE consistent register and variant of the target language across every block. If the target language has a standard written form (e.g. Modern Standard Arabic), use it unless the source is clearly colloquial. DO NOT TRANSLATE (copy verbatim): - HTML tags, music symbols, formatting tags (\\N, {\\an8}) @@ -26,3 +35,112 @@ Output ONLY the translated .srt blocks. No commentary, no markdown fences.\ """ + + +REVIEW_SYSTEM_PROMPT = """\ +You are a conservative subtitle translation reviewer. You receive a glossary, source blocks, and a first-pass translation in `\\ntext` wire format. + +DEFAULT: output the first-pass UNCHANGED. Only fix clear violations of the glossary: +- Wrong addressee gender (pronouns, verb conjugation, adjective ending, honorific level) when the glossary unambiguously names the addressee's gender. +- Character name spelled differently from the target form in the glossary. +- Dual/plural/singular agreement when the glossary explicitly flags the count. + +If uncertain, keep the block verbatim. 
Do NOT rephrase, restyle, or "polish". Same number of blocks, same block numbers, same line-count per block. + +Output: same wire format, one blank line between blocks. ALL blocks. No commentary, no fences.\ +""" + + +CONTEXT_SYSTEM_PROMPT = """\ +You analyze a subtitle file before it is translated. Return a compact glossary +for the translator to use when picking correct pronouns, consistent names, and +a single consistent register. + +Input blocks are prefixed with their block number as `[N] text`. + +Reply with all five sections below in this exact order. No commentary, no +fences — tags only. + +<register> +ONE LINE describing the target-language variant and formality. +</register> +<characters> +NAME => TARGET_NAME | GENDER +</characters> +<terms> +SOURCE => TARGET +</terms> +<scenes> +START-END => description that NAMES the characters involved +</scenes> +<notes> +- NOTE +</notes> + +Rules: +- <register>: name the exact target variant (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Japanese, polite です/ます form"). Pick one for the whole file. +- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal. +- TARGET_NAME is how the character's name should appear in the target language. +- <scenes>: every ≥3-block stretch of dialogue between named characters. Name the characters explicitly using the names from <characters> so the translator can apply the right gender per range. Ranges may touch but must not overlap. +- Example: `105-119 => Maria reassures Alex about the interview` (use the actual names from YOUR <characters> section). +- Include up to 20 characters, 10 terms, 40 scenes, 4 notes. +- Leave a section empty (tags only) if nothing qualifies. Never omit a section.\ +""" + + +ATTRIBUTION_SYSTEM_PROMPT = """\ +You identify the speaker of each subtitle line in a short scene. Given a +character list and a block-numbered scene excerpt (`[N] text`), reply with +exactly one line per input block as `N=SpeakerName`. SpeakerName MUST be one +of the listed characters or the literal "unknown".
No commentary, no fences.\ +""" + + +def build_translate_user_message( + source_lang: str, + target_lang: str, + batch_wire: str, + glossary: str, + prev_tail: list[SubtitleBlock], +) -> str: + header = ( + f"Translate from {source_lang} to {target_lang}:" + if source_lang else f"Translate to {target_lang}:" + ) + sections: list[str] = [] + if glossary: + sections.append(f"Glossary for this scene:\n{glossary}") + if prev_tail: + # Non-numbered so the parser can't confuse these with real input blocks. + prev_lines = "\n".join( + f" [prev #{b.number}] {b.text.replace(chr(10), ' ')}" for b in prev_tail + ) + sections.append( + "Previous context (read-only, do NOT translate or output):\n" + prev_lines + ) + sections.append(f"{header}\n\n{batch_wire}") + return "\n\n".join(sections) + + +def build_review_user_message( + batch: list[SubtitleBlock], + first_pass: list[SubtitleBlock], + glossary: str, +) -> str: + return ( + f"Glossary:\n{glossary}\n\n" + f"Source blocks:\n{serialize_lite(batch)}\n\n" + f"First-pass translation:\n{serialize_lite(first_pass)}\n\n" + "Output the corrected translation (same wire format):" + ) + + +def build_scan_user_message( + source_lang: str, target_lang: str, scan_text: str, +) -> str: + source_line = f"Source language: {source_lang}\n" if source_lang else "" + return f"{source_line}Target language: {target_lang}\n\n{scan_text}" + + +def build_attribution_user_message(roster: str, block_lines: list[str]) -> str: + return f"Characters:\n{roster}\n\nScene:\n" + "\n".join(block_lines) diff --git a/cli/core/translator.py b/cli/core/translator.py index a4d68a6..5225e25 100644 --- a/cli/core/translator.py +++ b/cli/core/translator.py @@ -11,9 +11,10 @@ from .srt_parser import SubtitleBlock, split_batches from .formats import parse_subtitle -from .config import DEFAULT_MAX_RETRIES, TranslationConfig +from .config import TranslationConfig +from .constants import DEFAULT_MAX_RETRIES from .batch_runner import FileTranslationError, 
translate_batch_with_retry -from .context_pass import FileContext, extract_file_context +from .context_pass import FileContext, extract_file_context, refine_scene_attribution from .time_tracker import EtaEstimator, format_duration from .live_status import Colors, LiveLine, Ticker @@ -66,16 +67,27 @@ async def translate_file_async( async with httpx.AsyncClient() as scan_client: if not cfg.quiet: print(colors.dim(" Scanning for cast and context...")) - file_context = await extract_file_context(scan_client, doc.blocks, cfg) + file_context = await extract_file_context( + scan_client, doc.blocks, cfg, + ) + if cfg.refine_attribution and not file_context.is_empty(): + if not cfg.quiet: + print(colors.dim(" Attributing speakers in mixed-gender scenes...")) + await refine_scene_attribution( + scan_client, file_context, doc.blocks, cfg, + ) if not cfg.quiet: if file_context.is_empty(): print(colors.dim(" Glossary: empty (proceeding without context hints)")) else: chars = len(file_context.characters) terms = len(file_context.terms) + scenes = len(file_context.scenes) + attrib = sum(1 for s in file_context.scenes if s.attribution) notes = len(file_context.notes) print(colors.dim( - f" Glossary: {chars} character(s), {terms} term(s), {notes} note(s)" + f" Glossary: {chars} character(s), {terms} term(s), " + f"{scenes} scene(s) ({attrib} attributed), {notes} note(s)" )) if file_context.register: print(colors.dim(f" Register: {file_context.register}")) @@ -99,6 +111,14 @@ async def translate_file_async( print(colors.dim(f" Output: {output_path}")) +def _prev_tail( + batches: list[list[SubtitleBlock]], idx: int, overlap: int, +) -> list[SubtitleBlock]: + if idx <= 0 or overlap <= 0: + return [] + return batches[idx - 1][-overlap:] + + async def _run_batches( batches: list[list[SubtitleBlock]], cfg: TranslationConfig, @@ -147,9 +167,11 @@ async def run_one(idx: int) -> None: if failure: return batch_start = time.time() + prev_tail = _prev_tail(batches, idx, cfg.context_overlap) 
try: results[idx] = await translate_batch_with_retry( client, idx, batches[idx], cfg, file_context, + prev_tail=prev_tail, ) except FileTranslationError as e: failure = e diff --git a/cli/tests/test_context_pass.py b/cli/tests/test_context_pass.py index 50fefa2..9fb07e4 100644 --- a/cli/tests/test_context_pass.py +++ b/cli/tests/test_context_pass.py @@ -2,12 +2,16 @@ FileContext, CharacterHint, TermHint, + SceneHint, + _needs_attribution, + enrich_scenes_with_block_text, parse_context_response, serialize_for_scan, - _SCAN_CHAR_BUDGET, ) from core.srt_parser import SubtitleBlock +_TEST_BUDGET = 24_000 + def _block(n: int, text: str) -> SubtitleBlock: return SubtitleBlock(number=n, timestamp="00:00:00,000 --> 00:00:01,000", text=text) @@ -77,6 +81,31 @@ def test_is_empty_considers_register(): assert not FileContext(register="Target language").is_empty() +def test_parse_tolerates_missing_closing_tag(): + # Real scan models sometimes drop the closing tag before the + # next section. The body should still parse up to the next opening tag. 
+ raw = """ + +Target variant + + +Alice => آليس | female + + + + +1-5 => Alice speaks +6-10 => Alice continues + +- tone note + +""" + ctx = parse_context_response(raw) + assert ctx.register == "Target variant" + assert len(ctx.scenes) == 2 + assert ctx.notes == ["tone note"] + + def test_parse_tolerates_missing_sections_and_bullets(): raw = """ @@ -134,20 +163,317 @@ def test_render_word_boundary_does_not_match_substrings(): assert "Alice" not in ctx.render_for_batch(batch) +def test_parse_scenes(): + raw = """ + + + + + + + +97-117 => Alice and Carol discuss a concern +279-284 => Dave talks about his daughters +42 => Bob monologues + + + +""" + ctx = parse_context_response(raw) + assert [(s.start, s.end, s.description) for s in ctx.scenes] == [ + (97, 117, "Alice and Carol discuss a concern"), + (279, 284, "Dave talks about his daughters"), + (42, 42, "Bob monologues"), + ] + # No characters section, so no participants should be detected. + for s in ctx.scenes: + assert s.participants == [] + + +def test_parse_scenes_detects_participants_from_characters(): + raw = """ + + + +Alice => Alice | female +Carol => Carol | female +Dave => Dave | male + + + + +97-117 => Alice tells Carol her worries +279-284 => Dave complains about his daughters + + + +""" + ctx = parse_context_response(raw) + assert ctx.scenes[0].participants == ["Alice", "Carol"] + assert ctx.scenes[1].participants == ["Dave"] + + +def test_parse_scenes_detects_participants_via_target_name(): + # Scan model wrote the scene description using the target-language form + # of the character's name (common when prompt output slips into the + # target language). We should still resolve it back to the source name. 
+ raw = """ + +Alice => آليس | female +Carol => كارول | female + + +97-117 => آليس تخبر كارول بمخاوفها + +""" + ctx = parse_context_response(raw) + assert ctx.scenes[0].participants == ["Alice", "Carol"] + + +def test_needs_attribution_triggers_on_multi_block_named_scenes(): + g = {"alice": "female", "bob": "male"} + multi_named = SceneHint(start=1, end=5, description="x", participants=["Alice", "Bob"]) + multi_one = SceneHint(start=1, end=5, description="x", participants=["Alice"]) + two_block = SceneHint(start=1, end=2, description="x", participants=["Alice"]) + no_one = SceneHint(start=1, end=5, description="x", participants=[]) + assert _needs_attribution(multi_named, g) is True + assert _needs_attribution(multi_one, g) is True + assert _needs_attribution(two_block, g) is False + assert _needs_attribution(no_one, g) is False + + +def test_render_for_batch_includes_speakers_line_when_attribution_present(): + ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Bob", "Bob", "male"), + ], + scenes=[SceneHint( + start=10, end=12, description="Alice advises Bob", + participants=["Alice", "Bob"], + attribution={10: "Alice", 11: "Alice", 12: "Bob"}, + )], + ) + batch = [_block(10, "x"), _block(11, "y"), _block(12, "z")] + out = ctx.render_for_batch(batch) + assert "speakers: 10=Alice 11=Alice 12=Bob" in out + assert "[Alice (F), Bob (M)]" in out + + +def test_enrich_scenes_pulls_names_from_block_text_when_description_omits_them(): + # Description says nothing about who's speaking, but the block text + # contains a vocative — the classic "the summary is abstract but the + # dialogue names names" case. 
+ ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Dave", "Dave", "male"), + ], + scenes=[SceneHint(start=1, end=3, description="A tense conversation")], + ) + blocks = [ + _block(1, "Alice, I need to talk to you."), + _block(2, "About what?"), + _block(3, "Dave said he's leaving."), + ] + enriched = enrich_scenes_with_block_text(ctx, blocks) + assert enriched.scenes[0].participants == ["Alice", "Dave"] + + +def test_enrich_scenes_preserves_description_order_and_dedups(): + ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Dave", "Dave", "male"), + ], + scenes=[SceneHint( + start=1, end=2, + description="Dave talks to someone", + participants=["Dave"], + )], + ) + blocks = [ + _block(1, "Alice, look at this."), + _block(2, "Dave, calm down."), + ] + enriched = enrich_scenes_with_block_text(ctx, blocks) + # "Dave" kept (grounded in block 2), "Alice" appended (found in block 1). + assert enriched.scenes[0].participants == ["Dave", "Alice"] + + +def test_enrich_drops_description_name_not_in_blocks(): + # Scan hallucinated "Alice" into the description but she never actually + # speaks in these blocks — drop her, keep only Dave who's really there. + ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Dave", "Dave", "male"), + ], + scenes=[SceneHint( + start=1, end=2, + description="Alice and Dave talk", + participants=["Alice", "Dave"], + )], + ) + blocks = [ + _block(1, "Dave, are you okay?"), + _block(2, "I'm fine."), + ] + enriched = enrich_scenes_with_block_text(ctx, blocks) + assert enriched.scenes[0].participants == ["Dave"] + + +def test_parse_scenes_rejects_substring_match_inside_other_words(): + # A 2-char Arabic transliteration like "لو" would substring-match inside + # many Arabic words (e.g. "الوقوف" contains "لو"). We require whole-word + # matching AND a minimum alias length of 3 to avoid these collisions. 
+ raw = """ + +Lou => لو | male +Alice => آليس | female + + +10-20 => نصائح حول الوقوف وتأثيره على الصحة +21-25 => آليس تطمئن + +""" + ctx = parse_context_response(raw) + # "Lou" (2-char target "لو") must NOT match inside "الوقوف" (standing). + assert ctx.scenes[0].participants == [] + assert ctx.scenes[1].participants == ["Alice"] + + +def test_parse_scenes_skips_malformed_lines(): + raw = """ + +- 10-20 => Two characters (M, F) +- no-range => missing range +- 30 40 => bad separator +- 50-60 => +- 70-80 => good one + +""" + ctx = parse_context_response(raw) + assert [(s.start, s.end) for s in ctx.scenes] == [(10, 20), (70, 80)] + + +def test_parse_scenes_swaps_reversed_range(): + raw = """ + +200-100 => Accidentally reversed + +""" + ctx = parse_context_response(raw) + assert ctx.scenes[0].start == 100 + assert ctx.scenes[0].end == 200 + + +def test_render_includes_overlapping_scenes_only(): + ctx = FileContext( + scenes=[ + SceneHint(start=1, end=5, description="Scene A"), + SceneHint(start=10, end=20, description="Scene B"), + SceneHint(start=50, end=60, description="Scene C"), + ], + ) + # Batch covers blocks 15-25 — touches scene B only. + batch = [_block(15, "line"), _block(25, "line")] + rendered = ctx.render_for_batch(batch) + assert "Scene B" in rendered + assert "Scene A" not in rendered + assert "Scene C" not in rendered + assert "Blocks 10-20" in rendered + + +def test_render_scene_boundary_touch_is_match(): + # Batch first-block equals scene end — still overlaps. + ctx = FileContext(scenes=[SceneHint(start=5, end=10, description="Boundary scene")]) + batch = [_block(10, "line"), _block(15, "line")] + assert "Boundary scene" in ctx.render_for_batch(batch) + + +def test_render_includes_scene_participants_even_if_unnamed_in_batch_text(): + # Carol's name isn't vocatively spoken in the batch blocks, but she IS a + # scene participant — the translator still needs to know her gender. 
+ ctx = FileContext( + characters=[ + CharacterHint("Carol", "Carol", "female"), + CharacterHint("Dave", "Dave", "male"), + ], + scenes=[SceneHint( + start=1, end=2, description="A conversation", + participants=["Carol"], + )], + ) + batch = [_block(1, "Drink water."), _block(2, "Oh, right.")] + out = ctx.render_for_batch(batch) + assert "Carol => Carol (female)" in out + # Dave isn't a participant and isn't in the text — must NOT be listed. + assert "Dave" not in out + + +def test_render_scene_tags_participants_with_gender(): + ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Bob", "Bob", "male"), + ], + scenes=[ + SceneHint( + start=10, end=20, + description="Alice gives Bob an update", + participants=["Alice", "Bob"], + ), + ], + ) + batch = [_block(10, "x"), _block(20, "y")] + rendered = ctx.render_for_batch(batch) + assert "Alice (F)" in rendered + assert "Bob (M)" in rendered + assert "Alice gives Bob an update" in rendered + + +def test_render_scene_without_participants_falls_back_to_description(): + ctx = FileContext( + scenes=[SceneHint(start=1, end=5, description="Crowd murmurs")], + ) + batch = [_block(1, "x")] + rendered = ctx.render_for_batch(batch) + assert "Crowd murmurs" in rendered + # No square-bracket prefix when no participants were detected. + assert "[" not in rendered.split("Crowd murmurs")[0].split("Blocks 1-5:")[-1] + + +def test_is_empty_considers_scenes(): + ctx = FileContext(scenes=[SceneHint(start=1, end=2, description="x")]) + assert not ctx.is_empty() + + def test_serialize_for_scan_returns_all_text_when_under_budget(): blocks = [_block(i, f"Line {i}.") for i in range(1, 6)] - out = serialize_for_scan(blocks) + out = serialize_for_scan(blocks, _TEST_BUDGET) for i in range(1, 6): assert f"Line {i}." in out + assert f"[{i}]" in out def test_serialize_for_scan_samples_large_files_under_budget(): # Build a file that clearly exceeds the scan budget. 
long_line = "x" * 500 blocks = [_block(i, f"{long_line}-{i}") for i in range(1, 500)] - out = serialize_for_scan(blocks) - assert len(out) <= _SCAN_CHAR_BUDGET * 1.1 # small slack for newlines + out = serialize_for_scan(blocks, _TEST_BUDGET) + assert len(out) <= _TEST_BUDGET * 1.1 # small slack for newlines # Sampled output must include blocks from across the whole file, # not just the first N. assert any(f"-{i}" in out for i in range(1, 20)) assert any(f"-{i}" in out for i in range(450, 500)) + + +def test_serialize_for_scan_joins_multiline_block_text(): + # Multi-line text must be joined onto the [N] line so the prefix stays + # usable for scene-range references. + blocks = [_block(1, "First line\nSecond line")] + out = serialize_for_scan(blocks, _TEST_BUDGET) + assert out.splitlines()[0].startswith("[1] ") + assert "First line" in out + assert "Second line" in out diff --git a/cli/tests/test_review_pass.py b/cli/tests/test_review_pass.py new file mode 100644 index 0000000..6132e37 --- /dev/null +++ b/cli/tests/test_review_pass.py @@ -0,0 +1,91 @@ +"""Tests for the post-edit review pass in batch_runner.""" +from __future__ import annotations + +import asyncio +from dataclasses import dataclass + +import core.batch_runner as br +from core.context_pass import CharacterHint, FileContext +from core.srt_parser import SubtitleBlock + + +def _block(n: int, text: str) -> SubtitleBlock: + return SubtitleBlock(number=n, timestamp="00:00:00,000 --> 00:00:01,000", text=text) + + +@dataclass +class _StubCfg: + source_lang: str = "English" + target_lang: str = "French" + model: str | None = None + api_url: str = "" + api_key: str = "" + review: bool = True + + def warn(self, msg: str) -> None: + pass + + +def _run(coro): + return asyncio.new_event_loop().run_until_complete(coro) + + +def _ctx_with_char() -> FileContext: + return FileContext( + characters=[CharacterHint("Alice", "Alice", "female")], + ) + + +def 
test_review_keeps_first_pass_when_block_count_mismatches(monkeypatch): + batch = [_block(1, "Hello Alice."), _block(2, "World.")] + first_pass = [_block(1, "Bonjour."), _block(2, "Monde.")] + + async def fake_call(*a, **k): + return "1\nsingle" # only 1 block; mismatched count + + monkeypatch.setattr(br, "call_chat_api", fake_call) + out = _run(br._review_pass(None, batch, first_pass, _StubCfg(), _ctx_with_char())) + assert out is first_pass + + +def test_review_accepts_valid_revision(monkeypatch): + batch = [_block(1, "Hello Alice."), _block(2, "World.")] + first_pass = [_block(1, "Bonjour."), _block(2, "Monde.")] + + async def fake_call(*a, **k): + return "1\nSalut.\n\n2\nMonde." + + monkeypatch.setattr(br, "call_chat_api", fake_call) + out = _run(br._review_pass(None, batch, first_pass, _StubCfg(), _ctx_with_char())) + assert [b.text for b in out] == ["Salut.", "Monde."] + assert [b.number for b in out] == [1, 2] + assert [b.timestamp for b in out] == [batch[0].timestamp, batch[1].timestamp] + + +def test_review_skips_when_no_glossary_and_never_calls_api(monkeypatch): + # Without scene/character guidance, there's no principled reason to touch + # the first-pass — the review must not fire at all. + batch = [_block(1, "Hi.")] + first_pass = [_block(1, "Salut.")] + calls = {"n": 0} + + async def fake_call(*a, **k): + calls["n"] += 1 + return "1\nX." 
+ + monkeypatch.setattr(br, "call_chat_api", fake_call) + out = _run(br._review_pass(None, batch, first_pass, _StubCfg(), None)) + assert out is first_pass + assert calls["n"] == 0 + + +def test_review_keeps_first_pass_on_api_error(monkeypatch): + batch = [_block(1, "Hi Alice.")] + first_pass = [_block(1, "Salut.")] + + async def boom(*a, **k): + raise RuntimeError("network down") + + monkeypatch.setattr(br, "call_chat_api", boom) + out = _run(br._review_pass(None, batch, first_pass, _StubCfg(), _ctx_with_char())) + assert out is first_pass diff --git a/cli/translora.py b/cli/translora.py index 96a30ec..df801c9 100644 --- a/cli/translora.py +++ b/cli/translora.py @@ -8,7 +8,8 @@ from dataclasses import dataclass from pathlib import Path -from core.config import DEFAULT_MAX_RETRIES, TranslationConfig, _stderr_warn +from core.config import TranslationConfig, _stderr_warn +from core.constants import DEFAULT_MAX_RETRIES from core.batch_runner import FileTranslationError from core.time_tracker import format_duration from core.lang_codes import lang_code @@ -76,6 +77,23 @@ def _build_parser() -> argparse.ArgumentParser: help="Show retry/validation warnings (hidden by default)") p.add_argument("--output", "-o", type=Path, default=None, help="Output file path (single file only)") + p.add_argument("--scan-budget", type=int, default=24_000, metavar="CHARS", + help="Character budget for the prepass scan (default: 24000). " + "Tuned for best-quality scans on typical TV episodes; " + "lower on tight-context local models (~8k window), " + "raise on large-context cloud models for full-file scans.") + p.add_argument("--context-overlap", type=int, default=2, metavar="N", + help="Source blocks from the previous batch shown as read-only " + "context (default: 2). Helps maintain speaker continuity " + "across batch boundaries. 
Set to 0 to disable.") + p.add_argument("--no-refine-attribution", dest="refine_attribution", + action="store_false", default=True, + help="Disable per-block speaker attribution for mixed-gender " + "scenes (saves one small LLM call per ambiguous scene).") + p.add_argument("--no-review", dest="review", + action="store_false", default=True, + help="Disable the post-edit review pass (one extra call per " + "batch that fixes gender/number/consistency slips).") return p @@ -148,6 +166,10 @@ async def _translate_all(args, jobs: list[Job]) -> tuple[int, list[tuple[Path, s batch_size=args.batch_size, concurrency=args.concurrency, max_retries=args.max_retries, + scan_char_budget=args.scan_budget, + context_overlap=args.context_overlap, + refine_attribution=args.refine_attribution, + review=args.review, quiet=multi_file, verbose=args.verbose, ) diff --git a/web/src/app/app.component.html b/web/src/app/app.component.html index ff48ae2..326d572 100644 --- a/web/src/app/app.component.html +++ b/web/src/app/app.component.html @@ -239,6 +239,34 @@

Advanced

Attempts before failing.

+ +
+ + +

Chars sent to the prepass. Lower for tight-context local models, raise for full-file scans on large-context cloud models.

+
+ +
+ + +

Previous-batch blocks shown as read-only context. 0 disables.

+
+ + +
+ +

One extra LLM call per batch fixing gender/number/consistency. Doubles cost — disable on metered providers.

+
+ +
+ +

One small call per ambiguous scene to fix cross-gender addressee slips.

diff --git a/web/src/app/app.component.scss b/web/src/app/app.component.scss index 9fd2866..94f2f6f 100644 --- a/web/src/app/app.component.scss +++ b/web/src/app/app.component.scss @@ -3,9 +3,6 @@ } .nav { - position: sticky; - top: 0; - z-index: 20; padding: 0.8rem 1rem 0; } @@ -485,6 +482,34 @@ color: var(--text-muted); } +.field-toggle { + display: flex; + flex-direction: column; + gap: 0.25rem; + padding-top: 0.15rem; + + .toggle { + display: inline-flex; + align-items: center; + gap: 0.55rem; + font-size: 0.85rem; + font-weight: 600; + color: var(--text-primary); + cursor: pointer; + } + + .toggle input[type='checkbox'] { + width: 1rem; + height: 1rem; + accent-color: var(--color-primary-strong, var(--border-accent)); + cursor: pointer; + } + + .field-hint { + margin-left: 1.55rem; + } +} + .lang-hints { display: grid; grid-template-columns: minmax(0, 1fr) auto minmax(0, 1fr); diff --git a/web/src/app/app.component.ts b/web/src/app/app.component.ts index a3d8d2c..16eb52a 100644 --- a/web/src/app/app.component.ts +++ b/web/src/app/app.component.ts @@ -4,12 +4,18 @@ import JSZip from 'jszip'; import { TranslationService, ProviderConfig, - DEFAULT_MAX_RETRIES, + TranslationCancelledError, +} from './core/translation.service'; +import { DEFAULT_BATCH_SIZE, DEFAULT_CONCURRENCY, + DEFAULT_CONTEXT_OVERLAP, + DEFAULT_MAX_RETRIES, DEFAULT_PARALLEL_FILES, - TranslationCancelledError, -} from './core/translation.service'; + DEFAULT_REFINE_ATTRIBUTION, + DEFAULT_REVIEW, + DEFAULT_SCAN_BUDGET, +} from './core/constants'; import { parseSubtitle } from './core/subtitle-formats'; import { LANGUAGES } from './core/languages'; import { PROVIDER_PRESETS, PROVIDER_KEYS } from './core/providers'; @@ -28,6 +34,10 @@ const DEFAULTS = { concurrency: DEFAULT_CONCURRENCY, parallelFiles: DEFAULT_PARALLEL_FILES, maxRetries: DEFAULT_MAX_RETRIES, + contextOverlap: DEFAULT_CONTEXT_OVERLAP, + scanBudget: DEFAULT_SCAN_BUDGET, + refineAttribution: DEFAULT_REFINE_ATTRIBUTION, + review: 
DEFAULT_REVIEW, }; @Component({ @@ -58,6 +68,10 @@ export class AppComponent implements OnDestroy { batchSize = signal(DEFAULTS.batchSize); parallelFiles = signal(DEFAULTS.parallelFiles); maxRetries = signal(DEFAULTS.maxRetries); + contextOverlap = signal(DEFAULTS.contextOverlap); + scanBudget = signal(DEFAULTS.scanBudget); + refineAttribution = signal(DEFAULTS.refineAttribution); + review = signal(DEFAULTS.review); theme = signal<'light' | 'dark'>('light'); @@ -330,6 +344,10 @@ export class AppComponent implements OnDestroy { this.concurrency.set(this.currentPreset().defaultConcurrency); this.parallelFiles.set(DEFAULTS.parallelFiles); this.maxRetries.set(DEFAULTS.maxRetries); + this.contextOverlap.set(DEFAULTS.contextOverlap); + this.scanBudget.set(DEFAULTS.scanBudget); + this.refineAttribution.set(DEFAULTS.refineAttribution); + this.review.set(DEFAULTS.review); } swapLanguages() { @@ -493,6 +511,12 @@ export class AppComponent implements OnDestroy { }); }, cancelSignal, + { + contextOverlap: this.contextOverlap(), + scanBudget: this.scanBudget(), + refineAttribution: this.refineAttribution(), + review: this.review(), + }, ); if (cancelSignal.aborted || this.cancelRequested) return; diff --git a/web/src/app/core/constants.ts b/web/src/app/core/constants.ts new file mode 100644 index 0000000..ac851a0 --- /dev/null +++ b/web/src/app/core/constants.ts @@ -0,0 +1,26 @@ +// Public defaults and tuning constants shared across web modules. + +// === Translation defaults === +export const DEFAULT_MAX_RETRIES = 5; +export const DEFAULT_BATCH_SIZE = 10; +export const DEFAULT_CONCURRENCY = 5; +export const DEFAULT_PARALLEL_FILES = 1; +export const DEFAULT_CONTEXT_OVERLAP = 2; +export const DEFAULT_REVIEW = true; +export const DEFAULT_REFINE_ATTRIBUTION = true; +// Sized for full-quality scans on typical TV episodes; lower on tight-context +// local models (~8k window), raise on large-context cloud models. 
+export const DEFAULT_SCAN_BUDGET = 24_000; + +// === Prepass / attribution scan === +export const SCAN_MAX_TOKENS = 3000; +// 2-char names collide with common target-language words. +export const MIN_NAME_LEN = 3; +// Single-block scenes never need per-block speaker attribution. +export const ATTRIB_MIN_BLOCKS = 3; + +// === Batch retry/split === +export const ATTEMPTS_BEFORE_SPLIT = 2; + +// === HTTP === +export const CRED_QUERY_PARAMS = ['key', 'api_key', 'apikey', 'access_token'] as const; diff --git a/web/src/app/core/context-pass.ts b/web/src/app/core/context-pass.ts index 9f31cf5..4525947 100644 --- a/web/src/app/core/context-pass.ts +++ b/web/src/app/core/context-pass.ts @@ -1,36 +1,9 @@ -// One-shot prepass: scans the file once for cast/terms/register so every batch -// shares the same glossary. Fails silently to an empty FileContext. +// One-shot prepass: scans the file once for cast/terms/scenes/register so every +// batch shares the same glossary. Fails silently to an empty FileContext. +import { ATTRIB_MIN_BLOCKS, MIN_NAME_LEN } from './constants'; import { SubtitleBlock } from './srt-parser'; -export const CONTEXT_SYSTEM_PROMPT = `You analyze a subtitle file before it is translated. Return a compact glossary for the translator to use when picking correct pronouns, consistent names, and a single consistent register. - -Your reply MUST start with \`\` and MUST contain all four sections below, in this exact order, with no other text before, between, or after them. No commentary. No code fences. No explanations. Tags only. - - -ONE LINE describing the target-language variant and formality the translator should use for the ENTIRE file. - - -NAME => TARGET_NAME | GENDER - - -SOURCE => TARGET - - -- NOTE - - -Rules: -- The line names the specific target-language variant and formality (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Simplified Mandarin, neutral", "Japanese, polite です/ます form"). 
Pick ONE and commit to it for the whole file. Base the choice on the source's tone; default to the standard written form of the target language unless the source is clearly colloquial. -- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal at all. -- TARGET_NAME is how the character's name should appear in the target language (transliterated or localized). -- Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/tone. -- Leave a section empty (tags only) if nothing qualifies. Never omit a section.`; - -// Sized so small-context models (4k-8k) still have room for prompt + output. -export const SCAN_CHAR_BUDGET = 12_000; -export const SCAN_MAX_TOKENS = 1500; - export type Gender = 'male' | 'female' | 'unknown'; export interface CharacterHint { @@ -44,97 +17,193 @@ export interface TermHint { target: string; } +export interface SceneHint { + start: number; + end: number; + description: string; + participants: string[]; + attribution: Record; // Per-block speaker map (block_number -> character source name). +} + export class FileContext { constructor( public register = '', public characters: CharacterHint[] = [], public terms: TermHint[] = [], + public scenes: SceneHint[] = [], public notes: string[] = [], ) {} isEmpty(): boolean { - return !(this.register || this.characters.length || this.terms.length || this.notes.length); + return !(this.register || this.characters.length || this.terms.length + || this.scenes.length || this.notes.length); } - // Glossary slice scoped to names/terms present in this batch. Register and - // notes are file-wide and always included if set. + // Glossary slice scoped to this batch. Register/notes are file-wide. 
renderForBatch(batch: SubtitleBlock[]): string { const text = batch.map((b) => b.text).join('\n'); - const chars = this.characters.filter((h) => containsWord(text, h.source)); - const terms = this.terms.filter((h) => containsWord(text, h.source)); - if (!this.register && !chars.length && !terms.length && !this.notes.length) { + const scenes = scenesOverlapping(this.scenes, batch); + // Include characters named in the batch AND scene participants — the + // latter covers speakers who address each other as "you" without + // vocatives, so the translator still learns their gender. + const sceneNames = new Set(scenes.flatMap((s) => s.participants)); + const chars = this.characters.filter( + (h) => findWord(text, h.source) >= 0 || sceneNames.has(h.source), + ); + const terms = this.terms.filter((h) => findWord(text, h.source) >= 0); + if (!this.register && !chars.length && !terms.length && !scenes.length && !this.notes.length) { return ''; } + const genderBy = new Map(this.characters.map((h) => [h.source.toLowerCase(), h.gender])); const parts: string[] = []; if (this.register) { parts.push(`Target register: ${this.register} (use consistently across every block)`); } if (chars.length) { - const lines = chars.map((h) => `- ${h.source} => ${h.target} (${h.gender})`); - parts.push('Characters:\n' + lines.join('\n')); + parts.push('Characters:\n' + chars.map((h) => `- ${h.source} => ${h.target} (${h.gender})`).join('\n')); } if (terms.length) { - const lines = terms.map((h) => `- ${h.source} => ${h.target}`); - parts.push('Terms:\n' + lines.join('\n')); + parts.push('Terms:\n' + terms.map((h) => `- ${h.source} => ${h.target}`).join('\n')); + } + if (scenes.length) { + parts.push(renderScenes(scenes, genderBy)); } if (this.notes.length) { - const lines = this.notes.slice(0, 4).map((n) => `- ${n}`); - parts.push('Notes:\n' + lines.join('\n')); + parts.push('Notes:\n' + this.notes.slice(0, 4).map((n) => `- ${n}`).join('\n')); } return parts.join('\n\n'); } } -function 
escapeRegExp(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +function scenesOverlapping(scenes: SceneHint[], batch: SubtitleBlock[]): SceneHint[] { + if (!scenes.length || !batch.length) return []; + const first = batch[0].number; + const last = batch[batch.length - 1].number; + return scenes.filter((s) => s.end >= first && s.start <= last); } -function containsWord(text: string, word: string): boolean { - if (!word) return false; - const re = new RegExp(`(? sum + b.text.length + 1, 0); - if (totalChars <= SCAN_CHAR_BUDGET || blocks.length <= 1) { - return blocks.map((b) => b.text).join('\n'); +function renderScenes(scenes: SceneHint[], genderBy: Map): string { + const lines: string[] = []; + for (const s of scenes) { + const tagged = s.participants.map((n) => { + const mark = genderMark(genderBy.get(n.toLowerCase())); + return mark ? `${n} (${mark})` : n; + }).join(', '); + const prefix = `- Blocks ${s.start}-${s.end}:`; + lines.push(tagged ? `${prefix} [${tagged}] — ${s.description}` : `${prefix} ${s.description}`); + const nums = Object.keys(s.attribution).map(Number).sort((a, b) => a - b); + if (nums.length) { + lines.push(' speakers: ' + nums.map((n) => `${n}=${s.attribution[n]}`).join(' ')); + } } + return ( + "Scene guidance — each entry applies ONLY to its listed block range. " + + "Participants and genders in [brackets]; a 'speakers:' line names the " + + "speaker per block so you pick the right gender for the ADDRESSEE:\n" + + lines.join('\n') + ); +} - const takeN = Math.max(1, Math.floor((blocks.length * SCAN_CHAR_BUDGET) / totalChars)); +// Case-insensitive whole-word search with Unicode-aware boundaries. +// Works for Latin, Arabic, CJK, etc. Returns first match index or -1. 
+function findWord(text: string, word: string): number { + if (!text || !word) return -1; + const haystack = text.toLowerCase(); + const needle = word.toLowerCase(); + const nlen = needle.length; + let i = 0; + while (i <= haystack.length - nlen) { + const j = haystack.indexOf(needle, i); + if (j < 0) return -1; + const before = j > 0 ? text[j - 1] : ''; + const after = j + nlen < text.length ? text[j + nlen] : ''; + if (!extendsWord(before) && !extendsWord(after)) return j; + i = j + 1; + } + return -1; +} + +function extendsWord(ch: string): boolean { + if (!ch) return false; + if (ch === '_') return true; + return /\p{L}|\p{N}/u.test(ch); +} + +function detectParticipants(text: string, characters: CharacterHint[]): string[] { + // Match source AND target forms so descriptions in the target language + // still resolve to the canonical source name. + const aliases: Array<{ alias: string; name: string }> = []; + for (const h of characters) { + if (h.source.length >= MIN_NAME_LEN) aliases.push({ alias: h.source, name: h.source }); + if (h.target !== h.source && h.target.length >= MIN_NAME_LEN) { + aliases.push({ alias: h.target, name: h.source }); + } + } + aliases.sort((a, b) => b.alias.length - a.alias.length); + const firstAt = new Map(); + for (const { alias, name } of aliases) { + if (firstAt.has(name)) continue; + const idx = findWord(text, alias); + if (idx >= 0) firstAt.set(name, idx); + } + return [...firstAt.entries()].sort((a, b) => a[1] - b[1]).map(([n]) => n); +} + +function formatScanLine(b: SubtitleBlock): string { + return `[${b.number}] ${b.text.replace(/\n/g, ' ')}`; +} + +// Stride-samples large files so characters introduced late still land in +// the glossary. 
+export function serializeForScan( + blocks: SubtitleBlock[], + charBudget: number, +): string { + const total = blocks.reduce((sum, b) => sum + formatScanLine(b).length + 1, 0); + if (total <= charBudget || blocks.length <= 1) { + return blocks.map(formatScanLine).join('\n'); + } + const takeN = Math.max(1, Math.floor((blocks.length * charBudget) / total)); const step = blocks.length / takeN; const sampled: SubtitleBlock[] = []; - for (let i = 0; i < takeN; i++) { - sampled.push(blocks[Math.floor(i * step)]); - } - return sampled.map((b) => b.text).join('\n'); + for (let i = 0; i < takeN; i++) sampled.push(blocks[Math.floor(i * step)]); + return sampled.map(formatScanLine).join('\n'); } -const SECTION_RE = /<(register|characters|terms|notes)>\s*([\s\S]*?)\s*<\/\1>/gi; +// Closing tag optional so a truncated reply still parses. +const SECTION_RE = + /<(register|characters|terms|scenes|notes)>\s*([\s\S]*?)\s*(?=<\/\1>|<(?:register|characters|terms|scenes|notes)>|$)/gi; +const SCENE_RANGE_RE = /^(\d+)\s*(?:-\s*(\d+))?$/; +const ATTRIB_LINE_RE = /^\s*(\d+)\s*=\s*(.+?)\s*$/; function stripBullet(line: string): string { return line.trim().replace(/^[-*•]\s*/, '').trim(); } -// Parse tagged response. Tolerates extra whitespace and bullet markers. +function splitOnce(s: string, sep: string): [string, string] { + const i = s.indexOf(sep); + return i < 0 ? [s, ''] : [s.slice(0, i), s.slice(i + sep.length)]; +} + +// Parse the tagged response. Tolerates whitespace and bullet markers. export function parseContextResponse(text: string): FileContext { const sections: Record = {}; - const src = text || ''; SECTION_RE.lastIndex = 0; let m: RegExpExecArray | null; - while ((m = SECTION_RE.exec(src)) !== null) { + while ((m = SECTION_RE.exec(text || '')) !== null) { sections[m[1].toLowerCase()] = m[2]; } - const rawRegister = sections['register'] ?? ''; - const register = stripBullet(rawRegister.split(/\s+/).join(' ')); + const register = stripBullet((sections['register'] ?? 
'').split(/\s+/).join(' ')); const characters: CharacterHint[] = []; - for (const rawLine of (sections['characters'] ?? '').split('\n')) { - const line = stripBullet(rawLine); + for (const raw of (sections['characters'] ?? '').split('\n')) { + const line = stripBullet(raw); if (!line || !line.includes('=>')) continue; const [srcPart, restPart] = splitOnce(line, '=>'); let tgt: string, gender: string; @@ -146,29 +215,42 @@ export function parseContextResponse(text: string): FileContext { tgt = restPart.trim(); gender = 'unknown'; } - const normalizedGender: Gender = - gender === 'male' || gender === 'female' ? gender : 'unknown'; - const src2 = srcPart.trim(); - if (src2 && tgt) { - characters.push({ source: src2, target: tgt, gender: normalizedGender }); - } + const g: Gender = gender === 'male' || gender === 'female' ? gender : 'unknown'; + const src = srcPart.trim(); + if (src && tgt) characters.push({ source: src, target: tgt, gender: g }); } const terms: TermHint[] = []; - for (const rawLine of (sections['terms'] ?? '').split('\n')) { - const line = stripBullet(rawLine); + for (const raw of (sections['terms'] ?? '').split('\n')) { + const line = stripBullet(raw); if (!line || !line.includes('=>')) continue; const [srcPart, tgtPart] = splitOnce(line, '=>'); - const src2 = srcPart.trim(); + const src = srcPart.trim(); const tgt = tgtPart.trim(); - if (src2 && tgt) { - terms.push({ source: src2, target: tgt }); - } + if (src && tgt) terms.push({ source: src, target: tgt }); + } + + const scenes: SceneHint[] = []; + for (const raw of (sections['scenes'] ?? '').split('\n')) { + const line = stripBullet(raw); + if (!line || !line.includes('=>')) continue; + const [rangePart, descPart] = splitOnce(line, '=>'); + const desc = descPart.trim(); + const rm = SCENE_RANGE_RE.exec(rangePart.trim()); + if (!desc || !rm) continue; + let start = parseInt(rm[1], 10); + let end = rm[2] ? 
parseInt(rm[2], 10) : start; + if (end < start) [start, end] = [end, start]; + scenes.push({ + start, end, description: desc, + participants: detectParticipants(desc, characters), + attribution: {}, + }); } const notes: string[] = []; - for (const rawLine of (sections['notes'] ?? '').split('\n')) { - const line = stripBullet(rawLine); + for (const raw of (sections['notes'] ?? '').split('\n')) { + const line = stripBullet(raw); if (line) notes.push(line); } @@ -176,12 +258,59 @@ export function parseContextResponse(text: string): FileContext { register, characters.slice(0, 20), terms.slice(0, 10), + scenes.slice(0, 80), notes.slice(0, 4), ); } -function splitOnce(s: string, sep: string): [string, string] { - const i = s.indexOf(sep); - if (i < 0) return [s, '']; - return [s.slice(0, i), s.slice(i + sep.length)]; +// Reconcile scene participants with what's in the source blocks. Block-text +// names are primary truth: description-named participants are kept only if +// grounded in the text; missed block-text names are appended. 
// Reconcile scene participants with what's in the source blocks. Block-text
// names are primary truth: description-named participants are kept only if
// grounded in the text; missed block-text names are appended.
export function enrichScenesWithBlockText(
  context: FileContext,
  blocks: SubtitleBlock[],
): FileContext {
  if (!context.scenes.length || !context.characters.length) return context;
  // Type arguments restored: without `<number, SubtitleBlock>` strict
  // TypeScript cannot type the tuple array produced by `map` and rejects
  // the Map constructor overload.
  const byNum = new Map<number, SubtitleBlock>(blocks.map((b) => [b.number, b]));
  const enriched = context.scenes.map((scene) => {
    // Gather the text of every block in the scene's range (gaps tolerated).
    const parts: string[] = [];
    for (let n = scene.start; n <= scene.end; n++) {
      const b = byNum.get(n);
      if (b) parts.push(b.text);
    }
    const inText = detectParticipants(parts.join('\n'), context.characters);
    const inTextSet = new Set(inText);
    // Keep only description participants grounded in the block text...
    const kept = scene.participants.filter((p) => inTextSet.has(p));
    const seen = new Set(kept);
    // ...then append the text-detected names the description missed.
    for (const name of inText) {
      if (!seen.has(name)) {
        kept.push(name);
        seen.add(name);
      }
    }
    return { ...scene, participants: kept };
  });
  return new FileContext(
    context.register, context.characters, context.terms, enriched, context.notes,
  );
}

// A scene warrants a dedicated speaker-attribution call only when it is long
// enough and has at least one known participant.
export function needsAttribution(scene: SceneHint): boolean {
  return (scene.end - scene.start + 1) >= ATTRIB_MIN_BLOCKS && scene.participants.length >= 1;
}

// Parses "N=SpeakerName" lines from the attribution reply into a
// block-number -> speaker map. Lines outside the scene's range or naming a
// character not in the roster are dropped; the literal "unknown" is accepted.
export function parseAttributionResponse(
  raw: string, scene: SceneHint, characters: CharacterHint[],
): Record<number, string> {
  const valid = new Set(characters.map((h) => h.source));
  valid.add('unknown');
  // Type arguments restored: bare `Record` does not compile.
  const out: Record<number, string> = {};
  for (const line of (raw || '').split('\n')) {
    const m = ATTRIB_LINE_RE.exec(line);
    if (!m) continue;
    const n = parseInt(m[1], 10);
    // Strip optional surrounding quotes the model sometimes adds.
    const name = m[2].trim().replace(/^["']|["']$/g, '');
    if (n >= scene.start && n <= scene.end && valid.has(name)) out[n] = name;
  }
  return out;
}
diff --git a/web/src/app/core/translation-prompt.ts b/web/src/app/core/translation-prompt.ts
index 181327f..f10fd25 100644
--- a/web/src/app/core/translation-prompt.ts
+++ b/web/src/app/core/translation-prompt.ts
@@ -1,3 +1,8 @@
+// All LLM-facing prompts and user-message builders, kept in one place so they
+// can be iterated on alongside their counterparts.
+ +import { SubtitleBlock, serializeLite } from './srt-parser'; + export const SYSTEM_PROMPT = `You are a subtitle translator. You will receive numbered subtitle blocks (no timestamps) and translate them. Input format for each block: @@ -12,8 +17,11 @@ RULES (violating any = corrupt file): - Translate each block independently — never combine split sentences. - Translate faithfully: profanity, slurs, slang — match the original register. - Conversational tone, concise — must fit the original timing. -- If a glossary is provided, use each character's listed gender when choosing pronouns and verb forms in the target language, and use the listed target-language name consistently. -- Use ONE consistent register and variant of the target language across every block. Do not switch dialects or formality between batches. If the target language has a standard written form (e.g., Modern Standard Arabic), use it by default unless the source is clearly colloquial. +- If a glossary is provided, use each character's listed gender for pronouns/verb forms, and the listed target-language name consistently. +- "Scene guidance" entries apply PER BLOCK RANGE only. Match the addressee's gender (not just the speaker's). For exactly-two referents addressed together, use the target's dual form if it has one. +- A \`speakers:\` line (e.g. \`120=Alice 121=Alice 122=Bob\`) names the speaker per block. The ADDRESSEE is usually the other named participant — use the addressee's gender (from [brackets]) for second-person forms. +- "Previous context" blocks (if shown) are read-only — infer speaker/addressee from them, do NOT translate or include them. +- Use ONE consistent register and variant of the target language across every block. If the target language has a standard written form (e.g. Modern Standard Arabic), use it unless the source is clearly colloquial. 
DO NOT TRANSLATE (copy verbatim): - HTML tags, music symbols, formatting tags (\\N, {\\an8}) @@ -23,17 +31,96 @@ SHORT BLOCKS like "Oh!", "No!", "Hmm." are the #1 cause of missing blocks. Trans Output ONLY the translated .srt blocks. No commentary, no markdown fences.`; +export const REVIEW_SYSTEM_PROMPT = `You are a conservative subtitle translation reviewer. You receive a glossary, source blocks, and a first-pass translation in \`\\ntext\` wire format. + +DEFAULT: output the first-pass UNCHANGED. Only fix clear violations of the glossary: +- Wrong addressee gender (pronouns, verb conjugation, adjective ending, honorific level) when the glossary unambiguously names the addressee's gender. +- Character name spelled differently from the target form in the glossary. +- Dual/plural/singular agreement when the glossary explicitly flags the count. + +If uncertain, keep the block verbatim. Do NOT rephrase, restyle, or "polish". Same number of blocks, same block numbers, same line-count per block. + +Output: same wire format, one blank line between blocks. ALL blocks. No commentary, no fences.`; + +export const CONTEXT_SYSTEM_PROMPT = `You analyze a subtitle file before it is translated. Return a compact glossary for the translator to use when picking correct pronouns, consistent names, and a single consistent register. + +Input blocks are prefixed with their block number as \`[N] text\`. + +Reply with all five sections below in this exact order. No commentary, no fences — tags only. + + +ONE LINE describing the target-language variant and formality. + + +NAME => TARGET_NAME | GENDER + + +SOURCE => TARGET + + +START-END => description that NAMES the characters involved + + +- NOTE + + +Rules: +- : name the exact target variant (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Japanese, polite です/ます form"). Pick one for the whole file. +- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal. 
+- TARGET_NAME is how the character's name should appear in the target language. +- : every ≥3-block stretch of dialogue between named characters. Name the characters explicitly using the names from so the translator can apply the right gender per range. Ranges may touch but must not overlap. +- Example: \`105-119 => Maria reassures Alex about the interview\` (use the actual names from YOUR section). +- Include up to 20 characters, 10 terms, 40 scenes, 4 notes. +- Leave a section empty (tags only) if nothing qualifies. Never omit a section.`; + +export const ATTRIBUTION_SYSTEM_PROMPT = `You identify the speaker of each subtitle line in a short scene. Given a character list and a block-numbered scene excerpt (\`[N] text\`), reply with exactly one line per input block as \`N=SpeakerName\`. SpeakerName MUST be one of the listed characters or the literal "unknown". No commentary, no fences.`; + export function buildUserMessage( sourceLang: string, targetLang: string, srtContent: string, glossary?: string, + prevTail: SubtitleBlock[] = [], ): string { const header = sourceLang ? 
`Translate from ${sourceLang} to ${targetLang}:` : `Translate to ${targetLang}:`; + const sections: string[] = []; if (glossary && glossary.trim()) { - return `Glossary for this scene:\n${glossary}\n\n${header}\n\n${srtContent}`; + sections.push(`Glossary for this scene:\n${glossary}`); } - return `${header}\n\n${srtContent}`; + if (prevTail.length) { + const lines = prevTail + .map((b) => ` [prev #${b.number}] ${b.text.replace(/\n/g, ' ')}`) + .join('\n'); + sections.push('Previous context (read-only, do NOT translate or output):\n' + lines); + } + sections.push(`${header}\n\n${srtContent}`); + return sections.join('\n\n'); +} + +export function buildReviewUserMessage( + batch: SubtitleBlock[], + firstPass: SubtitleBlock[], + glossary: string, +): string { + return ( + `Glossary:\n${glossary}\n\n` + + `Source blocks:\n${serializeLite(batch)}\n\n` + + `First-pass translation:\n${serializeLite(firstPass)}\n\n` + + 'Output the corrected translation (same wire format):' + ); +} + +export function buildScanUserMessage( + sourceLang: string, + targetLang: string, + scanText: string, +): string { + const sourceLine = sourceLang ? 
`Source language: ${sourceLang}\n` : ''; + return `${sourceLine}Target language: ${targetLang}\n\n${scanText}`; +} + +export function buildAttributionUserMessage(roster: string, sceneLines: string[]): string { + return `Characters:\n${roster}\n\nScene:\n${sceneLines.join('\n')}`; } diff --git a/web/src/app/core/translation.service.ts b/web/src/app/core/translation.service.ts index d5e1923..aee6b2e 100644 --- a/web/src/app/core/translation.service.ts +++ b/web/src/app/core/translation.service.ts @@ -1,6 +1,39 @@ import { Injectable } from '@angular/core'; import { HttpClient, HttpErrorResponse } from '@angular/common/http'; import { Subscription } from 'rxjs'; +import { + ATTEMPTS_BEFORE_SPLIT, + CRED_QUERY_PARAMS, + DEFAULT_BATCH_SIZE, + DEFAULT_CONCURRENCY, + DEFAULT_CONTEXT_OVERLAP, + DEFAULT_MAX_RETRIES, + DEFAULT_REFINE_ATTRIBUTION, + DEFAULT_REVIEW, + DEFAULT_SCAN_BUDGET, + SCAN_MAX_TOKENS, +} from './constants'; +import { + FileContext, + enrichScenesWithBlockText, + genderMark, + needsAttribution, + parseAttributionResponse, + parseContextResponse, + serializeForScan, + type CharacterHint, + type SceneHint, +} from './context-pass'; +import { + ATTRIBUTION_SYSTEM_PROMPT, + CONTEXT_SYSTEM_PROMPT, + REVIEW_SYSTEM_PROMPT, + SYSTEM_PROMPT, + buildAttributionUserMessage, + buildReviewUserMessage, + buildScanUserMessage, + buildUserMessage, +} from './translation-prompt'; import { SubtitleBlock, parseLite, @@ -9,14 +42,6 @@ import { validateBatch, } from './srt-parser'; import { SubtitleDocument } from './subtitle-formats/types'; -import { SYSTEM_PROMPT, buildUserMessage } from './translation-prompt'; -import { - CONTEXT_SYSTEM_PROMPT, - FileContext, - SCAN_MAX_TOKENS, - parseContextResponse, - serializeForScan, -} from './context-pass'; export interface ProviderConfig { apiUrl: string; @@ -36,12 +61,12 @@ export class TranslationCancelledError extends Error { } } -export const DEFAULT_MAX_RETRIES = 5; -export const DEFAULT_BATCH_SIZE = 10; -export const 
DEFAULT_CONCURRENCY = 5; -export const DEFAULT_PARALLEL_FILES = 1; - -const ATTEMPTS_BEFORE_SPLIT = 2; +export interface QualityOptions { + contextOverlap?: number; + scanBudget?: number; + refineAttribution?: boolean; + review?: boolean; +} type ChatResponse = { choices: { message: { content: string } }[] }; @@ -59,15 +84,26 @@ export class TranslationService { maxRetries = DEFAULT_MAX_RETRIES, onProgress?: (p: TranslationProgress) => void, cancelSignal?: AbortSignal, + quality: QualityOptions = {}, ): Promise { if (doc.blocks.length === 0) { throw new Error('No subtitle blocks found in file'); } throwIfCancelled(cancelSignal); + const contextOverlap = quality.contextOverlap ?? DEFAULT_CONTEXT_OVERLAP; + const scanBudget = quality.scanBudget ?? DEFAULT_SCAN_BUDGET; + const refineAttribution = quality.refineAttribution ?? DEFAULT_REFINE_ATTRIBUTION; + const review = quality.review ?? DEFAULT_REVIEW; + const fileContext = await this.extractFileContext( - doc.blocks, sourceLang, targetLang, provider, cancelSignal, + doc.blocks, sourceLang, targetLang, provider, scanBudget, cancelSignal, ); + if (refineAttribution && !fileContext.isEmpty()) { + await this.refineSceneAttribution( + fileContext, doc.blocks, provider, concurrency, cancelSignal, + ); + } const batches = splitBatches(doc.blocks, batchSize); const results: SubtitleBlock[][] = new Array(batches.length); @@ -84,8 +120,13 @@ export class TranslationService { throwIfCancelled(cancelSignal); const i = nextIdx++; if (i >= batches.length) return; + const prevTail = + i > 0 && contextOverlap > 0 + ? 
batches[i - 1].slice(-contextOverlap) + : []; results[i] = await this.translateBatch( - batches[i], sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, + batches[i], sourceLang, targetLang, provider, maxRetries, fileContext, + prevTail, contextOverlap, review, cancelSignal, ); completed++; emit(); @@ -106,19 +147,19 @@ export class TranslationService { sourceLang: string, targetLang: string, provider: ProviderConfig, + scanBudget: number, cancelSignal?: AbortSignal, ): Promise { - const sourceLine = sourceLang ? `Source language: ${sourceLang}\n` : ''; - const userMessage = - sourceLine + - `Target language: ${targetLang}\n\n` + - serializeForScan(blocks); - + const userMessage = buildScanUserMessage( + sourceLang, targetLang, serializeForScan(blocks, scanBudget), + ); try { const raw = await this.callChat( CONTEXT_SYSTEM_PROMPT, userMessage, provider, SCAN_MAX_TOKENS, cancelSignal, ); - return parseContextResponse(stripMarkdownFences(raw)); + const ctx = parseContextResponse(stripMarkdownFences(raw)); + if (ctx.isEmpty()) return ctx; + return enrichScenesWithBlockText(ctx, blocks); } catch (err) { if (err instanceof TranslationCancelledError) throw err; console.warn('Context scan failed, proceeding without:', err); @@ -126,6 +167,75 @@ export class TranslationService { } } + private async refineSceneAttribution( + ctx: FileContext, + blocks: SubtitleBlock[], + provider: ProviderConfig, + concurrency: number, + cancelSignal?: AbortSignal, + ): Promise { + const targets = ctx.scenes.filter(needsAttribution); + if (!targets.length) return; + + const byNum = new Map(blocks.map((b) => [b.number, b])); + let nextIdx = 0; + const worker = async () => { + while (true) { + throwIfCancelled(cancelSignal); + const i = nextIdx++; + if (i >= targets.length) return; + const scene = targets[i]; + try { + const userMsg = buildSceneAttributionMessage(scene, byNum, ctx.characters); + const raw = await this.callChat( + ATTRIBUTION_SYSTEM_PROMPT, userMsg, 
provider, + (scene.end - scene.start + 1) * 20 + 100, cancelSignal, + ); + scene.attribution = parseAttributionResponse( + stripMarkdownFences(raw), scene, ctx.characters, + ); + } catch (err) { + if (err instanceof TranslationCancelledError) throw err; + console.warn( + `Attribution failed for blocks ${scene.start}-${scene.end}:`, err, + ); + } + } + }; + + const workerCount = Math.min(concurrency, targets.length); + await Promise.all(Array.from({ length: workerCount }, worker)); + } + + private async reviewBatch( + batch: SubtitleBlock[], + firstPass: SubtitleBlock[], + glossary: string, + provider: ProviderConfig, + cancelSignal?: AbortSignal, + ): Promise { + if (!glossary.trim()) return firstPass; + try { + const raw = await this.callChat( + REVIEW_SYSTEM_PROMPT, + buildReviewUserMessage(batch, firstPass, glossary), + provider, Math.max(batch.length, 1) * 120, cancelSignal, + ); + const parsed = parseLite(stripMarkdownFences(raw)); + if (parsed.length !== batch.length) return firstPass; + const revised = parsed.map((b, i) => ({ + number: batch[i].number, + timestamp: batch[i].timestamp, + text: b.text, + })); + return validateBatch(batch, revised).ok ? 
revised : firstPass; + } catch (err) { + if (err instanceof TranslationCancelledError) throw err; + console.warn('Review failed, keeping first-pass:', err); + return firstPass; + } + } + private async translateBatch( inputBlocks: SubtitleBlock[], sourceLang: string, @@ -133,6 +243,9 @@ export class TranslationService { provider: ProviderConfig, maxRetries: number, fileContext: FileContext, + prevTail: SubtitleBlock[], + contextOverlap: number, + review: boolean, cancelSignal?: AbortSignal, ): Promise { throwIfCancelled(cancelSignal); @@ -145,7 +258,9 @@ export class TranslationService { const batchWire = serializeLite(inputBlocks); const glossary = fileContext.renderForBatch(inputBlocks); - const userMessage = buildUserMessage(sourceLang, targetLang, batchWire, glossary); + const userMessage = buildUserMessage( + sourceLang, targetLang, batchWire, glossary, prevTail, + ); let hitValidationFailure = false; let lastError = ''; @@ -166,7 +281,14 @@ export class TranslationService { })); } const check = validateBatch(inputBlocks, output); - if (check.ok) return output; + if (check.ok) { + if (review) { + output = await this.reviewBatch( + inputBlocks, output, glossary, provider, cancelSignal, + ); + } + return output; + } hitValidationFailure = true; lastError = `validation: ${check.error}`; @@ -213,10 +335,14 @@ export class TranslationService { ); // Sequential: parallel halves would oversubscribe the worker pool slot. const leftResult = await this.translateBatch( - left, sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, + left, sourceLang, targetLang, provider, maxRetries, fileContext, + prevTail, contextOverlap, review, cancelSignal, ); + const rightPrev = + contextOverlap > 0 ? 
left.slice(-contextOverlap) : []; const rightResult = await this.translateBatch( - right, sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, + right, sourceLang, targetLang, provider, maxRetries, fileContext, + rightPrev, contextOverlap, review, cancelSignal, ); return [...leftResult, ...rightResult]; } @@ -313,7 +439,23 @@ export class TranslationService { } -const CRED_QUERY_PARAMS = ['key', 'api_key', 'apikey', 'access_token']; +function buildSceneAttributionMessage( + scene: SceneHint, + byNum: Map, + characters: CharacterHint[], +): string { + const present = new Set(scene.participants); + const roster = characters + .filter((h) => present.has(h.source)) + .map((h) => `- ${h.source} (${genderMark(h.gender) || '?'})`) + .join('\n'); + const sceneLines: string[] = []; + for (let n = scene.start; n <= scene.end; n++) { + const b = byNum.get(n); + if (b) sceneLines.push(`[${n}] ${b.text.replace(/\n/g, ' ')}`); + } + return buildAttributionUserMessage(roster, sceneLines); +} // We authenticate via header, so strip credential query params before sending. function sanitizeApiUrl(url: string): string {