From 977e2a34c289379069c58738fbdd97ce2d9c0e4e Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:21:26 +0300 Subject: [PATCH 01/15] Web: Simplify subtitle parsing process - Added functions to handle subtitle blocks without timestamps for easier parsing. - Updated translation logic to enhance handling of subtitle blocks. - Improved model interaction by eliminating noise from timestamps. - Streamlined serialization of subtitle data, boosting translation accuracy. --- web/src/app/core/srt-parser.ts | 31 +++++++++++++++++++++++++ web/src/app/core/translation-prompt.ts | 7 ++++-- web/src/app/core/translation.service.ts | 18 ++++++++++---- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/web/src/app/core/srt-parser.ts b/web/src/app/core/srt-parser.ts index ef8f424..7d4c2c4 100644 --- a/web/src/app/core/srt-parser.ts +++ b/web/src/app/core/srt-parser.ts @@ -52,6 +52,37 @@ export function serializeSrt(blocks: SubtitleBlock[]): string { ); } +// Wire format sent to the LLM: number + text only. Timestamps are pure noise +// for the model — it echoes them back, and small models sometimes corrupt a +// digit. We strip them before sending and reattach from the original input. +export function serializeLite(blocks: SubtitleBlock[]): string { + return blocks.map((b) => `${b.number}\n${b.text}`).join('\n\n') + '\n'; +} + +/** + * Parse the wire-format response. Timestamps are left empty — callers reattach + * them positionally from the original batch. 
+ */ +export function parseLite(content: string): SubtitleBlock[] { + content = content.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); + if (content.charCodeAt(0) === 0xfeff) content = content.slice(1); + + const rawBlocks = content.trim().split(/\n\n+/); + const blocks: SubtitleBlock[] = []; + + for (const raw of rawBlocks) { + const lines = raw.trim().split('\n'); + if (lines.length < 1) continue; + + const number = parseInt(lines[0].trim(), 10); + if (isNaN(number)) continue; + + const text = lines.slice(1).join('\n'); + blocks.push({ number, timestamp: '', text }); + } + return blocks; +} + /** * Split blocks into batches of the given size. */ diff --git a/web/src/app/core/translation-prompt.ts b/web/src/app/core/translation-prompt.ts index c9cb7bf..94b2b98 100644 --- a/web/src/app/core/translation-prompt.ts +++ b/web/src/app/core/translation-prompt.ts @@ -1,9 +1,12 @@ -export const SYSTEM_PROMPT = `You are an .srt subtitle translator. You will receive subtitle blocks and translate them. +export const SYSTEM_PROMPT = `You are a subtitle translator. You will receive numbered subtitle blocks (no timestamps) and translate them. + +Input format for each block: + + RULES (violating any = corrupt file): - Output the SAME number of blocks as input. No merging, no skipping. - Copy block numbers exactly. -- Copy timestamps exactly — not one character changed. - Keep one blank line between blocks. - Preserve line breaks within each block (same line count). - Translate each block independently — never combine split sentences. 
diff --git a/web/src/app/core/translation.service.ts b/web/src/app/core/translation.service.ts index 52d5e71..1ab7384 100644 --- a/web/src/app/core/translation.service.ts +++ b/web/src/app/core/translation.service.ts @@ -3,8 +3,8 @@ import { HttpClient, HttpErrorResponse } from '@angular/common/http'; import { Subscription } from 'rxjs'; import { SubtitleBlock, - parseSrt, - serializeSrt, + parseLite, + serializeLite, splitBatches, validateBatch, } from './srt-parser'; @@ -108,9 +108,9 @@ export class TranslationService { cancelSignal?: AbortSignal, ): Promise<SubtitleBlock[]> { throwIfCancelled(cancelSignal); - const batchSrt = serializeSrt(inputBlocks); + const batchWire = serializeLite(inputBlocks); const body = this.buildRequestBody( - sourceLang, targetLang, batchSrt, provider.model, inputBlocks.length, + sourceLang, targetLang, batchWire, provider.model, inputBlocks.length, ); const url = sanitizeApiUrl(provider.apiUrl); const headers = buildHeaders(sanitizeApiKey(provider.apiKey)); @@ -121,7 +121,15 @@ export class TranslationService { throwIfCancelled(cancelSignal); try { const resp = await this.postChat(url, body, headers, cancelSignal); - const output = parseSrt(stripMarkdownFences(resp.choices[0].message.content)); + let output = parseLite(stripMarkdownFences(resp.choices[0].message.content)); + // Reattach timestamps from the original input positionally. + if (output.length === inputBlocks.length) { + output = output.map((b, i) => ({ + number: inputBlocks[i].number, + timestamp: inputBlocks[i].timestamp, + text: b.text, + })); + } const check = validateBatch(inputBlocks, output); if (check.ok) return output; From 7553672b0f597ebd800fdf0ee57cc4b826d8b3d9 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:21:40 +0300 Subject: [PATCH 02/15] CLI: Enhance subtitle translation by refining parsing - Updated subtitle serialization to exclude timestamps, improving translation accuracy. - Added logic to reattach timestamps after translation. 
- Simplified input format in translation prompt to align with changes. --- cli/core/batch_runner.py | 16 ++++++++++++---- cli/core/prompt.py | 7 +++++-- cli/core/srt_parser.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/cli/core/batch_runner.py b/cli/core/batch_runner.py index 0e4b90d..b46ca6f 100644 --- a/cli/core/batch_runner.py +++ b/cli/core/batch_runner.py @@ -14,7 +14,7 @@ import httpx -from .srt_parser import SubtitleBlock, parse_srt, serialize_srt, validate_batch +from .srt_parser import SubtitleBlock, parse_lite, serialize_lite, validate_batch from .config import TranslationConfig from .prompt import SYSTEM_PROMPT @@ -116,15 +116,23 @@ async def translate_batch_with_retry( cfg: TranslationConfig, ) -> list[SubtitleBlock]: """Translate one batch; retry on transient errors; raise on exhaustion.""" - batch_srt = serialize_srt(batch) + batch_wire = serialize_lite(batch) label = f"Batch {batch_idx + 1}" first_block = batch[0].number for attempt in range(1, cfg.max_retries + 1): tag = f"attempt {attempt}/{cfg.max_retries}" try: - raw = await call_chat_api(client, batch_srt, cfg, len(batch)) - output = parse_srt(strip_markdown_fences(raw)) + raw = await call_chat_api(client, batch_wire, cfg, len(batch)) + output = parse_lite(strip_markdown_fences(raw)) + # Reattach timestamps from the original input positionally. + if len(output) == len(batch): + output = [ + SubtitleBlock(number=batch[i].number, + timestamp=batch[i].timestamp, + text=output[i].text) + for i in range(len(batch)) + ] check = validate_batch(batch, output) if check.ok: return output diff --git a/cli/core/prompt.py b/cli/core/prompt.py index 5d4e4a6..322e47b 100644 --- a/cli/core/prompt.py +++ b/cli/core/prompt.py @@ -1,12 +1,15 @@ """The translation prompt, kept in one place so it can be iterated on.""" SYSTEM_PROMPT = """\ -You are an .srt subtitle translator. You will receive subtitle blocks and translate them. +You are a subtitle translator. 
You will receive numbered subtitle blocks (no timestamps) and translate them. + +Input format for each block: + + RULES (violating any = corrupt file): - Output the SAME number of blocks as input. No merging, no skipping. - Copy block numbers exactly. -- Copy timestamps exactly — not one character changed. - Keep one blank line between blocks. - Preserve line breaks within each block (same line count). - Translate each block independently — never combine split sentences. diff --git a/cli/core/srt_parser.py b/cli/core/srt_parser.py index 4a2cfd7..f0c76fa 100644 --- a/cli/core/srt_parser.py +++ b/cli/core/srt_parser.py @@ -68,6 +68,35 @@ def serialize_srt(blocks: list[SubtitleBlock]) -> str: return "\n\n".join(parts) + "\n" +# Wire format sent to the LLM: number + text only. Timestamps are pure noise +# for the model — it just echoes them back, and small models sometimes corrupt +# a digit. We strip them before sending and reattach from the original input. +def serialize_lite(blocks: list[SubtitleBlock]) -> str: + return "\n\n".join(f"{b.number}\n{b.text}" for b in blocks) + "\n" + + +def parse_lite(content: str) -> list[SubtitleBlock]: + """Parse the wire-format response. 
Timestamps are left empty — callers + reattach them positionally from the original batch.""" + content = content.replace("\r\n", "\n").replace("\r", "\n") + if content.startswith("\ufeff"): + content = content[1:] + raw_blocks = re.split(r"\n\n+", content.strip()) + + blocks: list[SubtitleBlock] = [] + for raw in raw_blocks: + lines = raw.strip().split("\n") + if not lines: + continue + try: + number = int(lines[0].strip()) + except ValueError: + continue + text = "\n".join(lines[1:]) if len(lines) > 1 else "" + blocks.append(SubtitleBlock(number=number, timestamp="", text=text)) + return blocks + + def split_batches(blocks: list[SubtitleBlock], batch_size: int = 15) -> list[list[SubtitleBlock]]: """Split blocks into batches of the given size.""" return [blocks[i : i + batch_size] for i in range(0, len(blocks), batch_size)] From 97aabaac170e2f17a5a89d18b520105d4c640578 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:22:31 +0300 Subject: [PATCH 03/15] CLI: Add context-aware translation support - Integrates file context scanning for consistent character names and term translations. - Enhances translation accuracy by addressing gendered-pronoun errors. - Introduces a glossary system to improve scene-context translations. - Provides better support for language-specific nuances in subtitle translations. 
--- cli/core/batch_runner.py | 36 +++++-- cli/core/context_pass.py | 183 +++++++++++++++++++++++++++++++++ cli/core/prompt.py | 1 + cli/core/translator.py | 17 ++- cli/tests/test_context_pass.py | 93 +++++++++++++++++ 5 files changed, 320 insertions(+), 10 deletions(-) create mode 100644 cli/core/context_pass.py create mode 100644 cli/tests/test_context_pass.py diff --git a/cli/core/batch_runner.py b/cli/core/batch_runner.py index b46ca6f..5b4a69e 100644 --- a/cli/core/batch_runner.py +++ b/cli/core/batch_runner.py @@ -14,6 +14,7 @@ import httpx +from .context_pass import FileContext from .srt_parser import SubtitleBlock, parse_lite, serialize_lite, validate_batch from .config import TranslationConfig from .prompt import SYSTEM_PROMPT @@ -79,19 +80,19 @@ def is_retryable_http(code: int) -> bool: async def call_chat_api( client: httpx.AsyncClient, - batch_srt: str, + system_prompt: str, + user_message: str, cfg: TranslationConfig, - block_count: int, + max_tokens: int, ) -> str: - """POST one batch to the OpenAI-compatible chat endpoint, return raw text.""" + """POST one chat request to the OpenAI-compatible endpoint, return raw text.""" body: dict = { "messages": [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": - f"Translate from {cfg.source_lang} to {cfg.target_lang}:\n\n{batch_srt}"}, + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_message}, ], "temperature": 0.1, - "max_tokens": max(block_count, 1) * 120, + "max_tokens": max(max_tokens, 1), "stream": False, "cache_prompt": True, } @@ -109,21 +110,40 @@ async def call_chat_api( return resp.json()["choices"][0]["message"]["content"] +def _build_user_message( + cfg: TranslationConfig, + batch_wire: str, + file_context: FileContext | None, + batch: list[SubtitleBlock], +) -> str: + """Assemble the user message, prepending any relevant glossary slice.""" + header = f"Translate from {cfg.source_lang} to {cfg.target_lang}:" + if file_context is not None: 
+ ctx = file_context.render_for_batch(batch) + if ctx: + return f"Glossary for this scene:\n{ctx}\n\n{header}\n\n{batch_wire}" + return f"{header}\n\n{batch_wire}" + + async def translate_batch_with_retry( client: httpx.AsyncClient, batch_idx: int, batch: list[SubtitleBlock], cfg: TranslationConfig, + file_context: FileContext | None = None, ) -> list[SubtitleBlock]: """Translate one batch; retry on transient errors; raise on exhaustion.""" batch_wire = serialize_lite(batch) + user_msg = _build_user_message(cfg, batch_wire, file_context, batch) label = f"Batch {batch_idx + 1}" first_block = batch[0].number for attempt in range(1, cfg.max_retries + 1): tag = f"attempt {attempt}/{cfg.max_retries}" try: - raw = await call_chat_api(client, batch_wire, cfg, len(batch)) + raw = await call_chat_api( + client, SYSTEM_PROMPT, user_msg, cfg, max(len(batch), 1) * 120, + ) output = parse_lite(strip_markdown_fences(raw)) # Reattach timestamps from the original input positionally. if len(output) == len(batch): diff --git a/cli/core/context_pass.py b/cli/core/context_pass.py new file mode 100644 index 0000000..0617262 --- /dev/null +++ b/cli/core/context_pass.py @@ -0,0 +1,183 @@ +"""One-shot prepass: scan the whole file for cast, recurring terms, and tone +notes before batched translation begins. + +The goal is to fix gendered-pronoun errors in languages like Arabic, where +the model must pick masculine/feminine forms but English gives no signal. +We send the full source text once, ask for a compact glossary, and inject +the relevant slice into each batch's prompt. + +If the scan fails for any reason, callers get an empty FileContext and +translation proceeds exactly as it did before. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field + +import httpx + +from .config import TranslationConfig +from .srt_parser import SubtitleBlock + + +CONTEXT_SYSTEM_PROMPT = """\ +You analyze a subtitle file before it is translated. 
Output a compact glossary +the translator will use to pick correct pronouns and consistent names. + +Output ONLY this exact tagged format — no commentary, no code fences: + +NAME => TARGET_NAME | GENDER + + +SOURCE => TARGET + + +- NOTE + + +Rules: +- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal at all. +- TARGET_NAME is how the character's name should appear in the target language (transliterated or localized). +- Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/register/tone. +- Leave a section with just its tags if nothing qualifies.\ +""" + + +_SECTION_RE = re.compile( + r"<(?Pcharacters|terms|notes)>\s*(?P.*?)\s*", + re.I | re.S, +) + + +@dataclass +class CharacterHint: + source: str + target: str + gender: str # "male" | "female" | "unknown" + + +@dataclass +class TermHint: + source: str + target: str + + +@dataclass +class FileContext: + characters: list[CharacterHint] = field(default_factory=list) + terms: list[TermHint] = field(default_factory=list) + notes: list[str] = field(default_factory=list) + + def is_empty(self) -> bool: + return not (self.characters or self.terms or self.notes) + + def render_for_batch(self, batch: list[SubtitleBlock]) -> str: + """Return only the glossary slice relevant to this batch. + + Characters and terms are included only if their source form appears + in the batch text. Notes are always included (short, file-wide). + Returns an empty string when there is nothing worth injecting. 
+ """ + text = "\n".join(b.text for b in batch) + chars = [h for h in self.characters if _contains_word(text, h.source)] + terms = [h for h in self.terms if _contains_word(text, h.source)] + if not chars and not terms and not self.notes: + return "" + + parts: list[str] = [] + if chars: + lines = [f"- {h.source} => {h.target} ({h.gender})" for h in chars] + parts.append("Characters:\n" + "\n".join(lines)) + if terms: + lines = [f"- {h.source} => {h.target}" for h in terms] + parts.append("Terms:\n" + "\n".join(lines)) + if self.notes: + lines = [f"- {n}" for n in self.notes[:4]] + parts.append("Notes:\n" + "\n".join(lines)) + return "\n\n".join(parts) + + +def _contains_word(text: str, word: str) -> bool: + if not word: + return False + return re.search(rf"(? str: + """Serialize subtitle text for the scan pass — no numbers, no timestamps.""" + return "\n".join(b.text for b in blocks) + + +def parse_context_response(text: str) -> FileContext: + """Parse the tagged response. Tolerates extra whitespace and bullet markers.""" + sections = { + m.group("tag").lower(): m.group("body") + for m in _SECTION_RE.finditer(text or "") + } + + characters: list[CharacterHint] = [] + for line in sections.get("characters", "").splitlines(): + line = line.strip().lstrip("-*• ").strip() + if not line or "=>" not in line: + continue + src, rest = line.split("=>", 1) + if "|" in rest: + tgt, gender = rest.rsplit("|", 1) + tgt, gender = tgt.strip(), gender.strip().lower() + else: + tgt, gender = rest.strip(), "unknown" + if gender not in ("male", "female", "unknown"): + gender = "unknown" + src = src.strip() + if src and tgt: + characters.append(CharacterHint(src, tgt, gender)) + + terms: list[TermHint] = [] + for line in sections.get("terms", "").splitlines(): + line = line.strip().lstrip("-*• ").strip() + if not line or "=>" not in line: + continue + src, tgt = line.split("=>", 1) + src, tgt = src.strip(), tgt.strip() + if src and tgt: + terms.append(TermHint(src, tgt)) + + notes: 
list[str] = [] + for line in sections.get("notes", "").splitlines(): + line = line.strip().lstrip("-*• ").strip() + if line: + notes.append(line) + + return FileContext( + characters=characters[:20], + terms=terms[:10], + notes=notes[:4], + ) + + +async def extract_file_context( + client: httpx.AsyncClient, + blocks: list[SubtitleBlock], + cfg: TranslationConfig, +) -> FileContext: + """Run one scan call and return a FileContext. Empty on any failure.""" + from .batch_runner import call_chat_api, strip_markdown_fences + + user_message = ( + f"Source language: {cfg.source_lang}\n" + f"Target language: {cfg.target_lang}\n\n" + f"{serialize_for_scan(blocks)}" + ) + try: + raw = await call_chat_api( + client, + CONTEXT_SYSTEM_PROMPT, + user_message, + cfg, + max_tokens=800, + ) + return parse_context_response(strip_markdown_fences(raw)) + except Exception as e: + cfg.warn(f" Context scan failed, proceeding without: {e}") + return FileContext() diff --git a/cli/core/prompt.py b/cli/core/prompt.py index 322e47b..7a80e3c 100644 --- a/cli/core/prompt.py +++ b/cli/core/prompt.py @@ -15,6 +15,7 @@ - Translate each block independently — never combine split sentences. - Translate faithfully: profanity, slurs, slang — match the original register. - Conversational tone, concise — must fit the original timing. +- If a glossary is provided, use each character's listed gender when choosing pronouns and verb forms in the target language, and use the listed target-language name consistently. 
DO NOT TRANSLATE (copy verbatim): - HTML tags, music symbols, formatting tags (\\N, {\\an8}) diff --git a/cli/core/translator.py b/cli/core/translator.py index 2170288..0e6ec31 100644 --- a/cli/core/translator.py +++ b/cli/core/translator.py @@ -18,6 +18,7 @@ from .formats import parse_subtitle from .config import DEFAULT_MAX_RETRIES, TranslationConfig from .batch_runner import FileTranslationError, translate_batch_with_retry +from .context_pass import FileContext, extract_file_context from .time_tracker import EtaEstimator, format_duration from .live_status import Colors, LiveLine, Ticker @@ -67,7 +68,18 @@ async def translate_file_async( print(colors.dim(f"Concurrency: {cfg.concurrency}")) started_at = time.time() - translated_batches = await _run_batches(batches, cfg, colors, started_at) + async with httpx.AsyncClient() as scan_client: + if not cfg.quiet: + print(colors.dim(" Scanning for cast and context...")) + file_context = await extract_file_context(scan_client, doc.blocks, cfg) + if not cfg.quiet and not file_context.is_empty(): + chars = len(file_context.characters) + terms = len(file_context.terms) + print(colors.dim(f" Glossary: {chars} character(s), {terms} term(s)")) + + translated_batches = await _run_batches( + batches, cfg, colors, started_at, file_context, + ) # Stitch in order (they completed out-of-order but `_run_batches` returns # them indexed by their original position). @@ -91,6 +103,7 @@ async def _run_batches( cfg: TranslationConfig, colors: Colors, started_at: float, + file_context: FileContext | None = None, ) -> list[list[SubtitleBlock]]: """Translate every batch with up to `cfg.concurrency` requests in flight. 
@@ -138,7 +151,7 @@ async def run_one(idx: int) -> None: batch_start = time.time() try: results[idx] = await translate_batch_with_retry( - client, idx, batches[idx], cfg + client, idx, batches[idx], cfg, file_context, ) except FileTranslationError as e: failure = e diff --git a/cli/tests/test_context_pass.py b/cli/tests/test_context_pass.py new file mode 100644 index 0000000..9cb5d9c --- /dev/null +++ b/cli/tests/test_context_pass.py @@ -0,0 +1,93 @@ +from core.context_pass import ( + FileContext, + CharacterHint, + TermHint, + parse_context_response, +) +from core.srt_parser import SubtitleBlock + + +def _block(n: int, text: str) -> SubtitleBlock: + return SubtitleBlock(number=n, timestamp="00:00:00,000 --> 00:00:01,000", text=text) + + +def test_parse_well_formed_response(): + raw = """ + +Amy => إيمي | female +Jake => جيك | male +Stranger => غريب | unknown + + +precinct => قسم الشرطة + + +- Modern police procedural +- Casual register + +""" + ctx = parse_context_response(raw) + assert ctx.characters == [ + CharacterHint("Amy", "إيمي", "female"), + CharacterHint("Jake", "جيك", "male"), + CharacterHint("Stranger", "غريب", "unknown"), + ] + assert ctx.terms == [TermHint("precinct", "قسم الشرطة")] + assert ctx.notes == ["Modern police procedural", "Casual register"] + + +def test_parse_tolerates_missing_sections_and_bullets(): + raw = """ + +- Amy => إيمي | female +* Jake => جيك | MALE + +""" + ctx = parse_context_response(raw) + assert [h.source for h in ctx.characters] == ["Amy", "Jake"] + assert ctx.characters[1].gender == "male" + assert ctx.terms == [] + assert ctx.notes == [] + + +def test_parse_garbage_returns_empty(): + assert parse_context_response("").is_empty() + assert parse_context_response("sorry I cannot help").is_empty() + + +def test_render_for_batch_only_includes_matching_characters(): + ctx = FileContext( + characters=[ + CharacterHint("Amy", "إيمي", "female"), + CharacterHint("Jake", "جيك", "male"), + ], + terms=[TermHint("precinct", "قسم 
الشرطة")], + notes=["Police procedural"], + ) + batch = [_block(1, "Amy, come here."), _block(2, "I'm tired.")] + rendered = ctx.render_for_batch(batch) + assert "Amy" in rendered + assert "Jake" not in rendered + assert "precinct" not in rendered + assert "Police procedural" in rendered + + +def test_render_for_batch_empty_when_nothing_matches_and_no_notes(): + ctx = FileContext( + characters=[CharacterHint("Amy", "إيمي", "female")], + terms=[], + notes=[], + ) + batch = [_block(1, "I'm tired.")] + assert ctx.render_for_batch(batch) == "" + + +def test_render_word_boundary_does_not_match_substrings(): + ctx = FileContext( + characters=[CharacterHint("Amy", "إيمي", "female")], + terms=[], + notes=[], + ) + # "Amy" inside "Amyloid" should not match. + batch = [_block(1, "Amyloid plaques.")] + assert "Amy" not in ctx.render_for_batch(batch) From 173c8ed173bddcf7f2bc8bcfad3b6d008c21650c Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:23:02 +0300 Subject: [PATCH 04/15] CLI: Improve translation reliability - Introduces batch splitting on validation failures to enhance translation accuracy. - Adds register consistency for coherent translations across subtitle files. - Ensures non-empty inputs do not produce empty outputs. - Improves context detection for better glossary generation. - Provides more granular retry logic, benefiting subtitle translation stability. 
--- cli/core/batch_runner.py | 52 +++++++++++++++++++++---- cli/core/context_pass.py | 71 +++++++++++++++++++++++++++------- cli/core/prompt.py | 1 + cli/core/srt_parser.py | 10 +++++ cli/core/translator.py | 16 ++++++-- cli/tests/test_context_pass.py | 60 ++++++++++++++++++++++++++++ cli/tests/test_srt_parser.py | 20 ++++++++++ cli/translora.py | 14 +++---- 8 files changed, 209 insertions(+), 35 deletions(-) diff --git a/cli/core/batch_runner.py b/cli/core/batch_runner.py index 5b4a69e..9923cfa 100644 --- a/cli/core/batch_runner.py +++ b/cli/core/batch_runner.py @@ -125,21 +125,36 @@ def _build_user_message( return f"{header}\n\n{batch_wire}" +_ATTEMPTS_BEFORE_SPLIT = 2 + + async def translate_batch_with_retry( client: httpx.AsyncClient, batch_idx: int, batch: list[SubtitleBlock], cfg: TranslationConfig, file_context: FileContext | None = None, + _split_path: str = "", ) -> list[SubtitleBlock]: - """Translate one batch; retry on transient errors; raise on exhaustion.""" + """Translate one batch; on repeated validation failure split it in half. + + Persistent count mismatches usually mean the model is deterministically + merging two adjacent similar-looking blocks (e.g., repeated reactions + like "Oh." / "Oh!"). Splitting gives the model fewer similar blocks to + confuse and almost always resolves the merge. We keep halving until we + reach single-block batches, which can't have count mismatches. 
+ """ batch_wire = serialize_lite(batch) user_msg = _build_user_message(cfg, batch_wire, file_context, batch) - label = f"Batch {batch_idx + 1}" + label = f"Batch {batch_idx + 1}" + (f".{_split_path}" if _split_path else "") first_block = batch[0].number - for attempt in range(1, cfg.max_retries + 1): - tag = f"attempt {attempt}/{cfg.max_retries}" + can_split = len(batch) > 1 + attempts = _ATTEMPTS_BEFORE_SPLIT if can_split else cfg.max_retries + hit_validation_failure = False + + for attempt in range(1, attempts + 1): + tag = f"attempt {attempt}/{attempts}" try: raw = await call_chat_api( client, SYSTEM_PROMPT, user_msg, cfg, max(len(batch), 1) * 120, @@ -156,6 +171,7 @@ async def translate_batch_with_retry( check = validate_batch(batch, output) if check.ok: return output + hit_validation_failure = True cfg.warn(f" {label} validation failed ({tag}): {check.error}") except httpx.HTTPStatusError as e: @@ -167,9 +183,9 @@ async def translate_batch_with_retry( raise FileTranslationError( f"{label} (block {first_block}) HTTP {code}: {snippet}" ) - if code == 429 and attempt < cfg.max_retries: + if code == 429 and attempt < attempts: delay = 2 ** attempt - cfg.warn(f" Rate limited — waiting {delay}s...") + cfg.warn(f" Rate limited - waiting {delay}s...") await asyncio.sleep(delay) continue @@ -177,9 +193,29 @@ async def translate_batch_with_retry( cfg.warn(f" {label} request failed ({tag}): {e}") # Small back-off before the next attempt (1s, 2s, 3s cap). - if attempt < cfg.max_retries: + if attempt < attempts: await asyncio.sleep(min(attempt, 3)) + # All attempts exhausted. If we hit validation errors and can still split, + # cut the batch in half and retry each half independently. Otherwise fail. 
+ if hit_validation_failure and can_split: + mid = len(batch) // 2 + left, right = batch[:mid], batch[mid:] + cfg.warn( + f" {label} splitting {len(batch)} -> {len(left)} + {len(right)} blocks" + ) + left_path = (_split_path + "L") if _split_path else "L" + right_path = (_split_path + "R") if _split_path else "R" + # Sequential: parallel halves would oversubscribe the outer semaphore's + # per-batch slot and starve other batches. + left_result = await translate_batch_with_retry( + client, batch_idx, left, cfg, file_context, left_path, + ) + right_result = await translate_batch_with_retry( + client, batch_idx, right, cfg, file_context, right_path, + ) + return left_result + right_result + raise FileTranslationError( - f"{label} (block {first_block}) failed all {cfg.max_retries} retries" + f"{label} (block {first_block}) failed all {attempts} retries" ) diff --git a/cli/core/context_pass.py b/cli/core/context_pass.py index 0617262..ca3e7c3 100644 --- a/cli/core/context_pass.py +++ b/cli/core/context_pass.py @@ -22,10 +22,17 @@ CONTEXT_SYSTEM_PROMPT = """\ -You analyze a subtitle file before it is translated. Output a compact glossary -the translator will use to pick correct pronouns and consistent names. +You analyze a subtitle file before it is translated. Return a compact glossary +for the translator to use when picking correct pronouns, consistent names, and +a single consistent register. -Output ONLY this exact tagged format — no commentary, no code fences: +Your reply MUST start with `` and MUST contain all four sections +below, in this exact order, with no other text before, between, or after them. +No commentary. No code fences. No explanations. Tags only. + + +ONE LINE describing the target-language variant and formality the translator should use for the ENTIRE file. + NAME => TARGET_NAME | GENDER @@ -37,15 +44,22 @@ Rules: +- The line names the specific target-language variant and formality (e.g. 
"Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Simplified Mandarin, neutral", "Japanese, polite です/ます form"). Pick ONE and commit to it for the whole file. Base the choice on the source's tone; default to the standard written form of the target language unless the source is clearly colloquial. - GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal at all. - TARGET_NAME is how the character's name should appear in the target language (transliterated or localized). -- Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/register/tone. -- Leave a section with just its tags if nothing qualifies.\ +- Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/tone. +- Leave a section empty (tags only) if nothing qualifies. Never omit a section.\ """ +# Rough cap on source text sent to the scan. Tuned so small-context models +# (4k-8k total) still have room for the system prompt, the output, and a +# safety margin. ~4 chars ≈ 1 token on Latin text. +_SCAN_CHAR_BUDGET = 12_000 +_SCAN_MAX_TOKENS = 1500 + _SECTION_RE = re.compile( - r"<(?Pcharacters|terms|notes)>\s*(?P.*?)\s*", + r"<(?Pregister|characters|terms|notes)>\s*(?P.*?)\s*", re.I | re.S, ) @@ -65,27 +79,30 @@ class TermHint: @dataclass class FileContext: + register: str = "" characters: list[CharacterHint] = field(default_factory=list) terms: list[TermHint] = field(default_factory=list) notes: list[str] = field(default_factory=list) def is_empty(self) -> bool: - return not (self.characters or self.terms or self.notes) + return not (self.register or self.characters or self.terms or self.notes) def render_for_batch(self, batch: list[SubtitleBlock]) -> str: """Return only the glossary slice relevant to this batch. - Characters and terms are included only if their source form appears - in the batch text. Notes are always included (short, file-wide). 
+ Register and notes are always included (short, file-wide). Characters + and terms are included only if their source form appears in the batch. Returns an empty string when there is nothing worth injecting. """ text = "\n".join(b.text for b in batch) chars = [h for h in self.characters if _contains_word(text, h.source)] terms = [h for h in self.terms if _contains_word(text, h.source)] - if not chars and not terms and not self.notes: + if not self.register and not chars and not terms and not self.notes: return "" parts: list[str] = [] + if self.register: + parts.append(f"Target register: {self.register} (use consistently across every block)") if chars: lines = [f"- {h.source} => {h.target} ({h.gender})" for h in chars] parts.append("Characters:\n" + "\n".join(lines)) @@ -105,8 +122,22 @@ def _contains_word(text: str, word: str) -> bool: def serialize_for_scan(blocks: list[SubtitleBlock]) -> str: - """Serialize subtitle text for the scan pass — no numbers, no timestamps.""" - return "\n".join(b.text for b in blocks) + """Serialize subtitle text for the scan pass — no numbers, no timestamps. + + For large files we can't fit every block in the scan call, so we stride- + sample evenly across the whole file. Sampling preserves each character's + chance of appearing at least once in the glossary, regardless of where + they're first introduced. + """ + total_chars = sum(len(b.text) + 1 for b in blocks) + if total_chars <= _SCAN_CHAR_BUDGET or len(blocks) <= 1: + return "\n".join(b.text for b in blocks) + + # Estimate how many blocks fit in the budget, then sample evenly. 
+ take_n = max(1, int(len(blocks) * _SCAN_CHAR_BUDGET / total_chars)) + step = len(blocks) / take_n + sampled = [blocks[int(i * step)] for i in range(take_n)] + return "\n".join(b.text for b in sampled) def parse_context_response(text: str) -> FileContext: @@ -116,6 +147,9 @@ def parse_context_response(text: str) -> FileContext: for m in _SECTION_RE.finditer(text or "") } + # Register is free-form but must be a single line; collapse any whitespace. + register = " ".join(sections.get("register", "").split()).strip().lstrip("-*• ").strip() + characters: list[CharacterHint] = [] for line in sections.get("characters", "").splitlines(): line = line.strip().lstrip("-*• ").strip() @@ -150,6 +184,7 @@ def parse_context_response(text: str) -> FileContext: notes.append(line) return FileContext( + register=register, characters=characters[:20], terms=terms[:10], notes=notes[:4], @@ -175,9 +210,17 @@ async def extract_file_context( CONTEXT_SYSTEM_PROMPT, user_message, cfg, - max_tokens=800, + max_tokens=_SCAN_MAX_TOKENS, ) - return parse_context_response(strip_markdown_fences(raw)) except Exception as e: cfg.warn(f" Context scan failed, proceeding without: {e}") return FileContext() + + context = parse_context_response(strip_markdown_fences(raw)) + if context.is_empty(): + # Diagnostic: show a short snippet of what the model actually returned + # so it's obvious whether it ignored the tagged format, truncated, or + # refused. Truncated hard to keep noise down. + snippet = (raw or "").strip().replace("\n", " ")[:240] + cfg.warn(f" Context scan returned empty glossary. Raw start: {snippet!r}") + return context diff --git a/cli/core/prompt.py b/cli/core/prompt.py index 7a80e3c..45763af 100644 --- a/cli/core/prompt.py +++ b/cli/core/prompt.py @@ -16,6 +16,7 @@ - Translate faithfully: profanity, slurs, slang — match the original register. - Conversational tone, concise — must fit the original timing. 
- If a glossary is provided, use each character's listed gender when choosing pronouns and verb forms in the target language, and use the listed target-language name consistently. +- Use ONE consistent register and variant of the target language across every block. Do not switch dialects or formality between batches. If the target language has a standard written form (e.g., Modern Standard Arabic), use it by default unless the source is clearly colloquial. DO NOT TRANSLATE (copy verbatim): - HTML tags, music symbols, formatting tags (\\N, {\\an8}) diff --git a/cli/core/srt_parser.py b/cli/core/srt_parser.py index f0c76fa..8c22c44 100644 --- a/cli/core/srt_parser.py +++ b/cli/core/srt_parser.py @@ -138,4 +138,14 @@ def validate_batch( error=f"Timestamp modified at block {inp.number}: expected '{inp.timestamp}', got '{out.timestamp}'", ) + # 4. Non-empty input must produce non-empty output. Catches the silent + # data-loss case where the model shifts blocks and leaves a tail block blank + # while preserving count/numbers/timestamps. 
+ for inp, out in zip(input_blocks, output_blocks): + if inp.text.strip() and not out.text.strip(): + return ValidationResult( + ok=False, + error=f"Empty output at block {inp.number} (input was non-empty)", + ) + return ValidationResult(ok=True) diff --git a/cli/core/translator.py b/cli/core/translator.py index 0e6ec31..4ebef13 100644 --- a/cli/core/translator.py +++ b/cli/core/translator.py @@ -72,10 +72,18 @@ async def translate_file_async( if not cfg.quiet: print(colors.dim(" Scanning for cast and context...")) file_context = await extract_file_context(scan_client, doc.blocks, cfg) - if not cfg.quiet and not file_context.is_empty(): - chars = len(file_context.characters) - terms = len(file_context.terms) - print(colors.dim(f" Glossary: {chars} character(s), {terms} term(s)")) + if not cfg.quiet: + if file_context.is_empty(): + print(colors.dim(" Glossary: empty (proceeding without context hints)")) + else: + chars = len(file_context.characters) + terms = len(file_context.terms) + notes = len(file_context.notes) + print(colors.dim( + f" Glossary: {chars} character(s), {terms} term(s), {notes} note(s)" + )) + if file_context.register: + print(colors.dim(f" Register: {file_context.register}")) translated_batches = await _run_batches( batches, cfg, colors, started_at, file_context, diff --git a/cli/tests/test_context_pass.py b/cli/tests/test_context_pass.py index 9cb5d9c..7742d45 100644 --- a/cli/tests/test_context_pass.py +++ b/cli/tests/test_context_pass.py @@ -3,6 +3,8 @@ CharacterHint, TermHint, parse_context_response, + serialize_for_scan, + _SCAN_CHAR_BUDGET, ) from core.srt_parser import SubtitleBlock @@ -13,6 +15,9 @@ def _block(n: int, text: str) -> SubtitleBlock: def test_parse_well_formed_response(): raw = """ + +Modern Standard Arabic, neutral + Amy => إيمي | female Jake => جيك | male @@ -27,6 +32,7 @@ def test_parse_well_formed_response(): """ ctx = parse_context_response(raw) + assert ctx.register == "Modern Standard Arabic, neutral" assert 
ctx.characters == [ CharacterHint("Amy", "إيمي", "female"), CharacterHint("Jake", "جيك", "male"), @@ -36,6 +42,41 @@ def test_parse_well_formed_response(): assert ctx.notes == ["Modern police procedural", "Casual register"] +def test_parse_register_collapses_whitespace_and_bullet(): + raw = """ + + - Brazilian Portuguese, + casual + + + + + + + +""" + ctx = parse_context_response(raw) + assert ctx.register == "Brazilian Portuguese, casual" + + +def test_render_includes_register_line_even_when_no_matches(): + ctx = FileContext( + register="Modern Standard Arabic, neutral", + characters=[CharacterHint("Amy", "إيمي", "female")], + terms=[], + notes=[], + ) + batch = [_block(1, "Nobody named here.")] + rendered = ctx.render_for_batch(batch) + assert "Target register: Modern Standard Arabic, neutral" in rendered + assert "Amy" not in rendered + + +def test_is_empty_considers_register(): + assert FileContext().is_empty() + assert not FileContext(register="MSA").is_empty() + + def test_parse_tolerates_missing_sections_and_bullets(): raw = """ @@ -91,3 +132,22 @@ def test_render_word_boundary_does_not_match_substrings(): # "Amy" inside "Amyloid" should not match. batch = [_block(1, "Amyloid plaques.")] assert "Amy" not in ctx.render_for_batch(batch) + + +def test_serialize_for_scan_returns_all_text_when_under_budget(): + blocks = [_block(i, f"Line {i}.") for i in range(1, 6)] + out = serialize_for_scan(blocks) + for i in range(1, 6): + assert f"Line {i}." in out + + +def test_serialize_for_scan_samples_large_files_under_budget(): + # Build a file that clearly exceeds the scan budget. + long_line = "x" * 500 + blocks = [_block(i, f"{long_line}-{i}") for i in range(1, 500)] + out = serialize_for_scan(blocks) + assert len(out) <= _SCAN_CHAR_BUDGET * 1.1 # small slack for newlines + # Sampled output must include blocks from across the whole file, + # not just the first N. 
+ assert any(f"-{i}" in out for i in range(1, 20)) + assert any(f"-{i}" in out for i in range(450, 500)) diff --git a/cli/tests/test_srt_parser.py b/cli/tests/test_srt_parser.py index bb4b858..bd82c76 100644 --- a/cli/tests/test_srt_parser.py +++ b/cli/tests/test_srt_parser.py @@ -85,3 +85,23 @@ def test_validate_batch_timestamp_modified() -> None: result = validate_batch(a, b) assert not result.ok assert "timestamp" in result.error.lower() + + +def test_validate_batch_rejects_empty_output_for_nonempty_input() -> None: + a = [ + SubtitleBlock(1, "00:00:01,000 --> 00:00:02,000", "hi"), + SubtitleBlock(2, "00:00:03,000 --> 00:00:04,000", "there"), + ] + b = [ + SubtitleBlock(1, "00:00:01,000 --> 00:00:02,000", "hola"), + SubtitleBlock(2, "00:00:03,000 --> 00:00:04,000", ""), + ] + result = validate_batch(a, b) + assert not result.ok + assert "empty" in result.error.lower() + + +def test_validate_batch_allows_empty_output_for_empty_input() -> None: + a = [SubtitleBlock(1, "00:00:01,000 --> 00:00:02,000", "")] + b = [SubtitleBlock(1, "00:00:01,000 --> 00:00:02,000", "")] + assert validate_batch(a, b).ok diff --git a/cli/translora.py b/cli/translora.py index 1ca6722..7e9943a 100644 --- a/cli/translora.py +++ b/cli/translora.py @@ -21,25 +21,21 @@ EPILOG = """\ examples: - # Local llama-server (no key needed) + # Local OpenAI-compatible server (no key usually needed) python translora.py movie.srt -s English -t Arabic \\ --api-url http://127.0.0.1:8080/v1/chat/completions # Cloud provider (any OpenAI-compatible endpoint) python translora.py movie.srt -s English -t Arabic \\ - --api-url https://api.openai.com/v1/chat/completions \\ - --api-key sk-... --model gpt-4.1-mini -c 10 + --api-url https:///v1/chat/completions \\ + --api-key --model -c 10 # Translate a whole folder in parallel python translora.py ./subs/ -s English -t Arabic \\ --api-url ... --api-key ... --model ... 
-c 5 -pf 3 -provider endpoints (all OpenAI-compatible): - Local: http://127.0.0.1:8080/v1/chat/completions - OpenAI: https://api.openai.com/v1/chat/completions - Groq: https://api.groq.com/openai/v1/chat/completions - DeepSeek: https://api.deepseek.com/v1/chat/completions - OpenRouter: https://openrouter.ai/api/v1/chat/completions +Any OpenAI-compatible /v1/chat/completions endpoint works. The port and +path for local servers vary by tool — check your server's documentation. """ From 859399971736df38c602164e1437ac2bd4bb9717 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:24:16 +0300 Subject: [PATCH 05/15] Web: Enhance translation consistency - Introduces context scanning for gendered-pronoun accuracy. - Ensures consistent language register and terminology. - Improves batch handling with retry logic and batch splitting. - Enhances translation fidelity for non-Latin languages. --- web/src/app/core/context-pass.ts | 212 ++++++++++++++++++++++++ web/src/app/core/translation-prompt.ts | 11 +- web/src/app/core/translation.service.ts | 154 ++++++++++++----- 3 files changed, 338 insertions(+), 39 deletions(-) create mode 100644 web/src/app/core/context-pass.ts diff --git a/web/src/app/core/context-pass.ts b/web/src/app/core/context-pass.ts new file mode 100644 index 0000000..2ca2b1e --- /dev/null +++ b/web/src/app/core/context-pass.ts @@ -0,0 +1,212 @@ +/** + * One-shot prepass: scan the whole file for cast, recurring terms, and tone + * notes before batched translation begins. + * + * Goal is to fix gendered-pronoun errors in languages like Arabic, where the + * model must pick masculine/feminine forms but English gives no signal. We + * send the full source text once, ask for a compact glossary, and inject the + * relevant slice into each batch's prompt. + * + * If the scan fails for any reason, callers get an empty FileContext and + * translation proceeds exactly as it did before. 
+ */ + +import { SubtitleBlock } from './srt-parser'; + +export const CONTEXT_SYSTEM_PROMPT = `You analyze a subtitle file before it is translated. Return a compact glossary for the translator to use when picking correct pronouns, consistent names, and a single consistent register. + +Your reply MUST start with \`\` and MUST contain all four sections below, in this exact order, with no other text before, between, or after them. No commentary. No code fences. No explanations. Tags only. + + +ONE LINE describing the target-language variant and formality the translator should use for the ENTIRE file. + + +NAME => TARGET_NAME | GENDER + + +SOURCE => TARGET + + +- NOTE + + +Rules: +- The line names the specific target-language variant and formality (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Simplified Mandarin, neutral", "Japanese, polite です/ます form"). Pick ONE and commit to it for the whole file. Base the choice on the source's tone; default to the standard written form of the target language unless the source is clearly colloquial. +- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal at all. +- TARGET_NAME is how the character's name should appear in the target language (transliterated or localized). +- Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/tone. +- Leave a section empty (tags only) if nothing qualifies. Never omit a section.`; + +// Rough cap on source text sent to the scan. Tuned so small-context models +// (4k-8k total) still have room for the system prompt, the output, and a +// safety margin. ~4 chars ≈ 1 token on Latin text. 
+export const SCAN_CHAR_BUDGET = 12_000; +export const SCAN_MAX_TOKENS = 1500; + +export type Gender = 'male' | 'female' | 'unknown'; + +export interface CharacterHint { + source: string; + target: string; + gender: Gender; +} + +export interface TermHint { + source: string; + target: string; +} + +export class FileContext { + constructor( + public register = '', + public characters: CharacterHint[] = [], + public terms: TermHint[] = [], + public notes: string[] = [], + ) {} + + isEmpty(): boolean { + return !(this.register || this.characters.length || this.terms.length || this.notes.length); + } + + /** + * Return only the glossary slice relevant to this batch. + * + * Register and notes are always included (short, file-wide). Characters and + * terms are included only if their source form appears in the batch. + * Returns an empty string when there is nothing worth injecting. + */ + renderForBatch(batch: SubtitleBlock[]): string { + const text = batch.map((b) => b.text).join('\n'); + const chars = this.characters.filter((h) => containsWord(text, h.source)); + const terms = this.terms.filter((h) => containsWord(text, h.source)); + if (!this.register && !chars.length && !terms.length && !this.notes.length) { + return ''; + } + + const parts: string[] = []; + if (this.register) { + parts.push(`Target register: ${this.register} (use consistently across every block)`); + } + if (chars.length) { + const lines = chars.map((h) => `- ${h.source} => ${h.target} (${h.gender})`); + parts.push('Characters:\n' + lines.join('\n')); + } + if (terms.length) { + const lines = terms.map((h) => `- ${h.source} => ${h.target}`); + parts.push('Terms:\n' + lines.join('\n')); + } + if (this.notes.length) { + const lines = this.notes.slice(0, 4).map((n) => `- ${n}`); + parts.push('Notes:\n' + lines.join('\n')); + } + return parts.join('\n\n'); + } +} + +function escapeRegExp(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function containsWord(text: string, 
word: string): boolean { + if (!word) return false; + const re = new RegExp(`(? sum + b.text.length + 1, 0); + if (totalChars <= SCAN_CHAR_BUDGET || blocks.length <= 1) { + return blocks.map((b) => b.text).join('\n'); + } + + // Estimate how many blocks fit in the budget, then sample evenly. + const takeN = Math.max(1, Math.floor((blocks.length * SCAN_CHAR_BUDGET) / totalChars)); + const step = blocks.length / takeN; + const sampled: SubtitleBlock[] = []; + for (let i = 0; i < takeN; i++) { + sampled.push(blocks[Math.floor(i * step)]); + } + return sampled.map((b) => b.text).join('\n'); +} + +const SECTION_RE = /<(register|characters|terms|notes)>\s*([\s\S]*?)\s*<\/\1>/gi; + +function stripBullet(line: string): string { + return line.trim().replace(/^[-*•]\s*/, '').trim(); +} + +/** Parse the tagged response. Tolerates extra whitespace and bullet markers. */ +export function parseContextResponse(text: string): FileContext { + const sections: Record = {}; + const src = text || ''; + SECTION_RE.lastIndex = 0; + let m: RegExpExecArray | null; + while ((m = SECTION_RE.exec(src)) !== null) { + sections[m[1].toLowerCase()] = m[2]; + } + + // Register is free-form but must be a single line; collapse any whitespace. + const rawRegister = sections['register'] ?? ''; + const register = stripBullet(rawRegister.split(/\s+/).join(' ')); + + const characters: CharacterHint[] = []; + for (const rawLine of (sections['characters'] ?? '').split('\n')) { + const line = stripBullet(rawLine); + if (!line || !line.includes('=>')) continue; + const [srcPart, restPart] = splitOnce(line, '=>'); + let tgt: string, gender: string; + if (restPart.includes('|')) { + const idx = restPart.lastIndexOf('|'); + tgt = restPart.slice(0, idx).trim(); + gender = restPart.slice(idx + 1).trim().toLowerCase(); + } else { + tgt = restPart.trim(); + gender = 'unknown'; + } + const normalizedGender: Gender = + gender === 'male' || gender === 'female' ? 
gender : 'unknown'; + const src2 = srcPart.trim(); + if (src2 && tgt) { + characters.push({ source: src2, target: tgt, gender: normalizedGender }); + } + } + + const terms: TermHint[] = []; + for (const rawLine of (sections['terms'] ?? '').split('\n')) { + const line = stripBullet(rawLine); + if (!line || !line.includes('=>')) continue; + const [srcPart, tgtPart] = splitOnce(line, '=>'); + const src2 = srcPart.trim(); + const tgt = tgtPart.trim(); + if (src2 && tgt) { + terms.push({ source: src2, target: tgt }); + } + } + + const notes: string[] = []; + for (const rawLine of (sections['notes'] ?? '').split('\n')) { + const line = stripBullet(rawLine); + if (line) notes.push(line); + } + + return new FileContext( + register, + characters.slice(0, 20), + terms.slice(0, 10), + notes.slice(0, 4), + ); +} + +function splitOnce(s: string, sep: string): [string, string] { + const i = s.indexOf(sep); + if (i < 0) return [s, '']; + return [s.slice(0, i), s.slice(i + sep.length)]; +} diff --git a/web/src/app/core/translation-prompt.ts b/web/src/app/core/translation-prompt.ts index 94b2b98..35a94d5 100644 --- a/web/src/app/core/translation-prompt.ts +++ b/web/src/app/core/translation-prompt.ts @@ -12,6 +12,8 @@ RULES (violating any = corrupt file): - Translate each block independently — never combine split sentences. - Translate faithfully: profanity, slurs, slang — match the original register. - Conversational tone, concise — must fit the original timing. +- If a glossary is provided, use each character's listed gender when choosing pronouns and verb forms in the target language, and use the listed target-language name consistently. +- Use ONE consistent register and variant of the target language across every block. Do not switch dialects or formality between batches. If the target language has a standard written form (e.g., Modern Standard Arabic), use it by default unless the source is clearly colloquial. 
DO NOT TRANSLATE (copy verbatim): - HTML tags, music symbols, formatting tags (\\N, {\\an8}) @@ -24,7 +26,12 @@ Output ONLY the translated .srt blocks. No commentary, no markdown fences.`; export function buildUserMessage( sourceLang: string, targetLang: string, - srtContent: string + srtContent: string, + glossary?: string, ): string { - return `Translate from ${sourceLang} to ${targetLang}:\n\n${srtContent}`; + const header = `Translate from ${sourceLang} to ${targetLang}:`; + if (glossary && glossary.trim()) { + return `Glossary for this scene:\n${glossary}\n\n${header}\n\n${srtContent}`; + } + return `${header}\n\n${srtContent}`; } diff --git a/web/src/app/core/translation.service.ts b/web/src/app/core/translation.service.ts index 1ab7384..9420cfa 100644 --- a/web/src/app/core/translation.service.ts +++ b/web/src/app/core/translation.service.ts @@ -10,6 +10,13 @@ import { } from './srt-parser'; import { SubtitleDocument } from './subtitle-formats/types'; import { SYSTEM_PROMPT, buildUserMessage } from './translation-prompt'; +import { + CONTEXT_SYSTEM_PROMPT, + FileContext, + SCAN_MAX_TOKENS, + parseContextResponse, + serializeForScan, +} from './context-pass'; export interface ProviderConfig { apiUrl: string; @@ -35,6 +42,8 @@ export const DEFAULT_BATCH_SIZE = 5; export const DEFAULT_CONCURRENCY = 5; export const DEFAULT_PARALLEL_FILES = 1; +const ATTEMPTS_BEFORE_SPLIT = 2; + type ChatResponse = { choices: { message: { content: string } }[] }; @Injectable({ providedIn: 'root' }) @@ -62,6 +71,10 @@ export class TranslationService { } throwIfCancelled(cancelSignal); + const fileContext = await this.extractFileContext( + doc.blocks, sourceLang, targetLang, provider, cancelSignal, + ); + const batches = splitBatches(doc.blocks, batchSize); const results: SubtitleBlock[][] = new Array(batches.length); @@ -79,7 +92,7 @@ export class TranslationService { const i = nextIdx++; if (i >= batches.length) return; results[i] = await this.translateBatch( - batches[i], 
sourceLang, targetLang, provider, maxRetries, cancelSignal, + batches[i], sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, ); completed++; emit(); @@ -96,7 +109,35 @@ export class TranslationService { } // --------------------------------------------------------------------- - // Per-batch translation with retry + // Prepass: one scan call for cast, recurring terms, and tone notes. + // --------------------------------------------------------------------- + + private async extractFileContext( + blocks: SubtitleBlock[], + sourceLang: string, + targetLang: string, + provider: ProviderConfig, + cancelSignal?: AbortSignal, + ): Promise { + const userMessage = + `Source language: ${sourceLang}\n` + + `Target language: ${targetLang}\n\n` + + serializeForScan(blocks); + + try { + const raw = await this.callChat( + CONTEXT_SYSTEM_PROMPT, userMessage, provider, SCAN_MAX_TOKENS, cancelSignal, + ); + return parseContextResponse(stripMarkdownFences(raw)); + } catch (err) { + if (err instanceof TranslationCancelledError) throw err; + console.warn('Context scan failed, proceeding without:', err); + return new FileContext(); + } + } + + // --------------------------------------------------------------------- + // Per-batch translation with retry + recursive split on validation failure // --------------------------------------------------------------------- private async translateBatch( @@ -105,23 +146,32 @@ export class TranslationService { targetLang: string, provider: ProviderConfig, maxRetries: number, + fileContext: FileContext, cancelSignal?: AbortSignal, ): Promise { throwIfCancelled(cancelSignal); - const batchWire = serializeLite(inputBlocks); - const body = this.buildRequestBody( - sourceLang, targetLang, batchWire, provider.model, inputBlocks.length, - ); - const url = sanitizeApiUrl(provider.apiUrl); - const headers = buildHeaders(sanitizeApiKey(provider.apiKey)); + + const canSplit = inputBlocks.length > 1; + // Give splittable batches fewer 
retries before halving — persistent count + // mismatches almost always resolve when we hand the model fewer similar + // blocks. Single-block batches can't be split, so let them exhaust. + const attempts = canSplit ? ATTEMPTS_BEFORE_SPLIT : maxRetries; const firstBlockNum = inputBlocks[0].number; + + const batchWire = serializeLite(inputBlocks); + const glossary = fileContext.renderForBatch(inputBlocks); + const userMessage = buildUserMessage(sourceLang, targetLang, batchWire, glossary); + let hitValidationFailure = false; let lastError = ''; - for (let attempt = 1; attempt <= maxRetries; attempt++) { + for (let attempt = 1; attempt <= attempts; attempt++) { throwIfCancelled(cancelSignal); try { - const resp = await this.postChat(url, body, headers, cancelSignal); - let output = parseLite(stripMarkdownFences(resp.choices[0].message.content)); + const raw = await this.callChat( + SYSTEM_PROMPT, userMessage, provider, + Math.max(inputBlocks.length, 1) * 120, cancelSignal, + ); + let output = parseLite(stripMarkdownFences(raw)); // Reattach timestamps from the original input positionally. 
if (output.length === inputBlocks.length) { output = output.map((b, i) => ({ @@ -133,8 +183,9 @@ export class TranslationService { const check = validateBatch(inputBlocks, output); if (check.ok) return output; + hitValidationFailure = true; lastError = `validation: ${check.error}`; - console.warn(`Batch validation failed (${attempt}/${maxRetries}):`, check.error); + console.warn(`Batch validation failed (${attempt}/${attempts}):`, check.error); } catch (err: unknown) { if (err instanceof TranslationCancelledError) { @@ -145,7 +196,7 @@ export class TranslationService { lastError = this.extractServerMessage(err) || (err as Error)?.message || String(err); console.warn( - `Batch request failed (${attempt}/${maxRetries}) [HTTP ${status}]:`, + `Batch request failed (${attempt}/${attempts}) [HTTP ${status}]:`, lastError, ); @@ -155,7 +206,7 @@ export class TranslationService { } // Rate-limited: exponential backoff before retrying. - if (status === 429 && attempt < maxRetries) { + if (status === 429 && attempt < attempts) { const delay = 2 ** attempt * 1000; console.warn(`Rate limited — waiting ${delay / 1000}s...`); await sleep(delay, cancelSignal); @@ -164,16 +215,66 @@ export class TranslationService { } // Small linear backoff between other retries (1s, 2s, 3s cap). - if (attempt < maxRetries) { + if (attempt < attempts) { await sleep(Math.min(attempt, 3) * 1000, cancelSignal); } } + // Attempts exhausted. If we hit validation errors and can still split, + // halve and retry each half independently. Recurse until single-block + // batches, which can't have count mismatches. + if (hitValidationFailure && canSplit) { + const mid = Math.floor(inputBlocks.length / 2); + const left = inputBlocks.slice(0, mid); + const right = inputBlocks.slice(mid); + console.warn( + `Batch splitting ${inputBlocks.length} -> ${left.length} + ${right.length} blocks`, + ); + // Sequential: parallel halves would oversubscribe the worker pool slot + // and starve other batches. 
+ const leftResult = await this.translateBatch( + left, sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, + ); + const rightResult = await this.translateBatch( + right, sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, + ); + return [...leftResult, ...rightResult]; + } + throw new Error( - `Batch failed all ${maxRetries} retries (block ${firstBlockNum}): ${lastError}`, + `Batch failed all ${attempts} retries (block ${firstBlockNum}): ${lastError}`, ); } + // --------------------------------------------------------------------- + // Chat HTTP call (shared by scan + translation) + // --------------------------------------------------------------------- + + private async callChat( + systemPrompt: string, + userMessage: string, + provider: ProviderConfig, + maxTokens: number, + cancelSignal?: AbortSignal, + ): Promise { + const body: Record = { + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userMessage }, + ], + temperature: 0.1, + max_tokens: Math.max(maxTokens, 1), + stream: false, + cache_prompt: true, + }; + if (provider.model) body['model'] = provider.model; + + const url = sanitizeApiUrl(provider.apiUrl); + const headers = buildHeaders(sanitizeApiKey(provider.apiKey)); + const resp = await this.postChat(url, body, headers, cancelSignal); + return resp.choices[0].message.content; + } + private postChat( url: string, body: Record, @@ -218,27 +319,6 @@ export class TranslationService { }); } - private buildRequestBody( - sourceLang: string, - targetLang: string, - batchSrt: string, - model: string, - blockCount: number, - ): Record { - const body: Record = { - messages: [ - { role: 'system', content: SYSTEM_PROMPT }, - { role: 'user', content: buildUserMessage(sourceLang, targetLang, batchSrt) }, - ], - temperature: 0.1, - max_tokens: Math.max(blockCount, 1) * 120, - stream: false, - cache_prompt: true, - }; - if (model) body['model'] = model; - return body; - } - /** Pull a 
human-readable message out of whatever shape the provider returned. */ private extractServerMessage(err: unknown): string { if (!(err instanceof HttpErrorResponse) || !err.error) return ''; From 1e03b853c3752ec1f59cd66b29fb64721543c2f7 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:24:48 +0300 Subject: [PATCH 06/15] Web/CLI: Add auto-detect for source language - Enables automatic detection of source language for translations. - Enhances usability for mixed-language subtitle batches. - Updates UI to include an "Auto-detect" option in language selection. - Improves translation workflow by eliminating the need to specify a source language. --- cli/core/batch_runner.py | 5 ++++- cli/core/config.py | 2 +- cli/core/context_pass.py | 3 ++- cli/core/translator.py | 3 ++- cli/translora.py | 5 +++-- web/src/app/app.component.html | 1 + web/src/app/app.component.ts | 4 +++- web/src/app/core/languages.ts | 2 +- web/src/app/core/translation-prompt.ts | 4 +++- web/src/app/core/translation.service.ts | 3 ++- 10 files changed, 22 insertions(+), 10 deletions(-) diff --git a/cli/core/batch_runner.py b/cli/core/batch_runner.py index 9923cfa..6d01a90 100644 --- a/cli/core/batch_runner.py +++ b/cli/core/batch_runner.py @@ -117,7 +117,10 @@ def _build_user_message( batch: list[SubtitleBlock], ) -> str: """Assemble the user message, prepending any relevant glossary slice.""" - header = f"Translate from {cfg.source_lang} to {cfg.target_lang}:" + if cfg.source_lang: + header = f"Translate from {cfg.source_lang} to {cfg.target_lang}:" + else: + header = f"Translate to {cfg.target_lang}:" if file_context is not None: ctx = file_context.render_for_batch(batch) if ctx: diff --git a/cli/core/config.py b/cli/core/config.py index 61f3ce5..43c69b5 100644 --- a/cli/core/config.py +++ b/cli/core/config.py @@ -20,7 +20,7 @@ class TranslationConfig: `warn` lets callers intercept retry/validation messages so they can be routed around a live progress line instead of 
clobbering it. """ - source_lang: str + source_lang: str # "" means auto-detect from the text target_lang: str api_url: str api_key: str diff --git a/cli/core/context_pass.py b/cli/core/context_pass.py index ca3e7c3..2a65008 100644 --- a/cli/core/context_pass.py +++ b/cli/core/context_pass.py @@ -199,8 +199,9 @@ async def extract_file_context( """Run one scan call and return a FileContext. Empty on any failure.""" from .batch_runner import call_chat_api, strip_markdown_fences + source_line = f"Source language: {cfg.source_lang}\n" if cfg.source_lang else "" user_message = ( - f"Source language: {cfg.source_lang}\n" + f"{source_line}" f"Target language: {cfg.target_lang}\n\n" f"{serialize_for_scan(blocks)}" ) diff --git a/cli/core/translator.py b/cli/core/translator.py index 4ebef13..d4df242 100644 --- a/cli/core/translator.py +++ b/cli/core/translator.py @@ -59,10 +59,11 @@ async def translate_file_async( colors = Colors() if not cfg.quiet: + src_label = cfg.source_lang or "auto" print( f"{colors.bold('Translating')} {colors.cyan(str(len(doc.blocks)))} blocks " f"in {colors.cyan(str(total))} batches " - f"{colors.dim(f'({cfg.source_lang} → {cfg.target_lang}, {doc.format})')}" + f"{colors.dim(f'({src_label} → {cfg.target_lang}, {doc.format})')}" ) if cfg.concurrency > 1: print(colors.dim(f"Concurrency: {cfg.concurrency}")) diff --git a/cli/translora.py b/cli/translora.py index 7e9943a..8035757 100644 --- a/cli/translora.py +++ b/cli/translora.py @@ -57,8 +57,9 @@ def _build_parser() -> argparse.ArgumentParser: version=f"TransLora CLI {__version__}") p.add_argument("files", nargs="+", type=Path, help="subtitle files or directories (.srt, .vtt, .ass, ...)") - p.add_argument("--source", "-s", required=True, - help="Source language (e.g. English, French, Japanese)") + p.add_argument("--source", "-s", default="", + help="Source language (e.g. English, French). 
" + "Omit to auto-detect — useful for mixed-language batches.") p.add_argument("--target", "-t", required=True, help="Target language (e.g. Arabic, Spanish, Korean)") p.add_argument("--api-url", required=True, help="LLM API endpoint URL") diff --git a/web/src/app/app.component.html b/web/src/app/app.component.html index 6f0379a..e5be4e6 100644 --- a/web/src/app/app.component.html +++ b/web/src/app/app.component.html @@ -250,6 +250,7 @@

Advanced

+

Blocks per request.

+

Parallel requests per file.

+

Files running at the same time.

+

Attempts before failing.

@@ -275,6 +283,11 @@

Advanced

+
+

Auto-detect works for almost every file. Pick one to force the source language.

+ +

Subtitles will be translated into this language.

+
@@ -307,6 +320,8 @@

Advanced

/> @if (apiKeyWarning()) {

{{ apiKeyWarning() }}

+ } @else { +

Paste the key from your provider's dashboard. Stays in your browser — never uploaded.

} } @@ -322,6 +337,7 @@

Advanced

(ngModelChange)="apiUrl.set($event)" placeholder="http://127.0.0.1:8080/v1/chat/completions" /> +

The chat-completions endpoint of your server or service.

} @else {
@@ -342,6 +358,7 @@

Advanced

} } +

Pick one of the provider's models or type a model ID your account supports.

} diff --git a/web/src/app/app.component.scss b/web/src/app/app.component.scss index f12a814..9fd2866 100644 --- a/web/src/app/app.component.scss +++ b/web/src/app/app.component.scss @@ -373,6 +373,11 @@ display: grid; grid-template-columns: repeat(2, minmax(0, 1fr)); gap: 0.7rem; + align-items: start; + + .field + .field { + margin-top: 0; + } } .label { @@ -473,6 +478,28 @@ color: var(--color-danger); } +.field-hint { + margin: 0.3rem 0 0; + font-size: 0.7rem; + line-height: 1.35; + color: var(--text-muted); +} + +.lang-hints { + display: grid; + grid-template-columns: minmax(0, 1fr) auto minmax(0, 1fr); + gap: 0 0.65rem; + margin-top: 0.1rem; + + .field-hint { + margin-top: 0; + } +} + +.lang-hints-spacer { + width: 2.2rem; +} + .dropzone { padding: 1rem; margin-top: 0.85rem; From 529011ea647aa4bf61af6aa51436e1818e8f4357 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:26:32 +0300 Subject: [PATCH 09/15] Web/CLI: Adjust batch size for improved performance - Decreased default batch size from 15 to 10 for better handling. - Updated help text to reflect batch size adjustment. - Aligns batch size across CLI and web for consistency. --- cli/core/config.py | 2 +- cli/translora.py | 6 +++--- web/src/app/core/translation.service.ts | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/core/config.py b/cli/core/config.py index 4a148e3..acd9e56 100644 --- a/cli/core/config.py +++ b/cli/core/config.py @@ -30,7 +30,7 @@ class TranslationConfig: api_url: str api_key: str model: str | None = None - batch_size: int = 15 + batch_size: int = 10 concurrency: int = 1 max_retries: int = DEFAULT_MAX_RETRIES quiet: bool = False diff --git a/cli/translora.py b/cli/translora.py index 7b4be70..61b5baf 100644 --- a/cli/translora.py +++ b/cli/translora.py @@ -67,10 +67,10 @@ def _build_parser() -> argparse.ArgumentParser: help="API key (default: none — for local servers)") p.add_argument("--model", default=None, help="Model name (e.g. 
gpt-4.1-mini, deepseek-chat)") - p.add_argument("--batch-size", type=int, default=15, - help="Subtitle blocks per batch (default: 15)") + p.add_argument("--batch-size", type=int, default=10, + help="Subtitle blocks per batch (default: 10)") p.add_argument("--concurrency", "-c", type=int, default=1, - help="Parallel batches per file (default: 1)") + help="Parallel batches per file (default: 1, raise for cloud providers)") p.add_argument("--parallel-files", "-pf", type=int, default=1, help="Translate this many files at once (default: 1)") p.add_argument("--max-retries", type=int, default=DEFAULT_MAX_RETRIES, diff --git a/web/src/app/core/translation.service.ts b/web/src/app/core/translation.service.ts index a34ac66..cf33d38 100644 --- a/web/src/app/core/translation.service.ts +++ b/web/src/app/core/translation.service.ts @@ -38,7 +38,7 @@ export class TranslationCancelledError extends Error { } export const DEFAULT_MAX_RETRIES = 5; -export const DEFAULT_BATCH_SIZE = 5; +export const DEFAULT_BATCH_SIZE = 10; export const DEFAULT_CONCURRENCY = 5; export const DEFAULT_PARALLEL_FILES = 1; From acdd885ce5073054e1750610a34b5ee2f3ab3d00 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:27:24 +0300 Subject: [PATCH 10/15] Doc: Update README --- README.md | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 4c8cd40..a417254 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ --- -Works with any OpenAI-compatible endpoint — local llama.cpp servers, OpenAI, Groq, DeepSeek, OpenRouter, and more. +Works with any OpenAI-compatible endpoint — local servers, OpenAI, Groq, DeepSeek, OpenRouter, and more. Two interfaces share the same pipeline: @@ -26,8 +26,10 @@ Two interfaces share the same pipeline: ## Highlights -- **Batched translation** — sends ~15 subtitle blocks at a time so small models don't drift, skip short lines, or merge split sentences. 
-- **Strict validation** — every batch is checked for block count, numbering, and unchanged timestamps; failures retry with back-off. +- **Batched translation** — sends ~10 subtitle blocks at a time so small models don't drift, skip short lines, or merge split sentences. +- **Cast & register prepass** — a pre-scan extracts characters, recurring terms, and the written register so every batch translates names and formality consistently. +- **Strict validation** — every batch is checked for block count, numbering, and unchanged timestamps; failures retry with back-off and recursively split on repeated failure. +- **Auto-detect source language** — omit the source and the model infers it from the text, so mixed-language batches translate to a single target cleanly. - **Any OpenAI-compatible provider** — local or cloud, no vendor lock-in. - **Parallelism** — translate many batches per file and many files at once. - **Live progress** — per-file progress bars in the web app, an in-place status line (elapsed / ETA / throughput) in the CLI. @@ -40,7 +42,7 @@ npm install ng serve ``` -Open http://localhost:4200, drop in one or more subtitle files, pick source/target languages and a provider, and download translated files individually or as a ZIP. +Open http://localhost:4200, drop in one or more subtitle files, pick a target language (source defaults to Auto-detect) and a provider, and download translated files individually or as a ZIP. 
## Command line @@ -49,16 +51,20 @@ cd cli # Option A — pip pip install -r requirements.txt -python translora.py movie.srt -s English -t Arabic \ +python translora.py movie.srt -t Arabic \ --api-url http://127.0.0.1:8080/v1/chat/completions # Option B — uv (faster, auto-manages the venv) uv sync -uv run translora.py movie.srt -s English -t Arabic \ +uv run translora.py movie.srt -t Arabic \ + --api-url http://127.0.0.1:8080/v1/chat/completions + +# Explicit source language (skip auto-detect) +python translora.py movie.srt -s English -t Arabic \ --api-url http://127.0.0.1:8080/v1/chat/completions -# Cloud provider, whole folder in parallel -python translora.py ./subs/ -s English -t Arabic \ +# Cloud provider, whole folder in parallel (source auto-detected per file) +python translora.py ./subs/ -t Arabic \ --api-url https://api.openai.com/v1/chat/completions \ --api-key sk-... --model gpt-4.1-mini -c 10 -pf 3 ``` @@ -67,28 +73,31 @@ Frequently used flags: | Flag | Description | | --- | --- | -| `-s, --source` / `-t, --target` | Source and target language names | +| `-t, --target` | Target language name (required) | +| `-s, --source` | Source language (optional; omit to auto-detect — useful for mixed-language batches) | | `--api-url` | OpenAI-compatible `/v1/chat/completions` endpoint | | `--api-key` | API key; use `none` for local servers | | `--model` | Model name (optional for local) | -| `--batch-size` | Subtitle blocks per batch (default **15**) | -| `-c, --concurrency` | Parallel batches per file (default **1**) | +| `--batch-size` | Subtitle blocks per batch (default **10**) | +| `-c, --concurrency` | Parallel batches per file (default **1** — raise for cloud providers) | | `-pf, --parallel-files` | Files translated in parallel (default **1**) | | `--max-retries` | Retries per batch (default **5**) | | `--force` | Re-translate even if the output exists | +| `-v, --verbose` | Show retry/validation warnings (hidden by default) | | `-o, --output` | Output path 
(single file only) | Set `NO_COLOR=1` to disable ANSI colors; output auto-falls back to plain lines when piped. ## How it works -Small and medium LLMs have known failure modes on long subtitle files: skipping one-word blocks (`"Oh!"`, `"Hmm."`), merging sentences split across two blocks for timing, and drifting mid-file. TransLora defends against that with a five-step pipeline: +Small and medium LLMs have known failure modes on long subtitle files: skipping one-word blocks (`"Oh!"`, `"Hmm."`), merging sentences split across two blocks for timing, drifting mid-file, and switching dialect or formality between batches. TransLora defends against that with a six-step pipeline: 1. Parse the subtitle file into numbered blocks with timestamps (SRT, VTT, ASS, SSA, SBV, SUB). -2. Split blocks into batches small enough that the model can't drift. -3. Send each batch with a structure-preserving system prompt. -4. Validate the response: block count in = out, numbers and timestamps untouched. -5. Retry failed batches up to `--max-retries` before flagging the file, then stitch the validated batches back in order. +2. Pre-scan the file with one extra LLM call to extract the cast, recurring terms, and the written register (e.g. Modern Standard Arabic, peninsular Spanish, polite Japanese). The relevant slice is attached to each batch so names and formality stay consistent across the whole file. +3. Split blocks into batches small enough that the model can't drift. +4. Send each batch with a structure-preserving system prompt. +5. Validate the response: block count in = out, numbers and timestamps untouched. Repeated failures recursively split the batch down to singletons before giving up. +6. Retry failed batches up to `--max-retries` before flagging the file, then stitch the validated batches back in order. 
## Providers @@ -128,7 +137,6 @@ Anything else that speaks the OpenAI chat-completions protocol will work the sam ## Roadmap - Side-by-side preview and per-block editing in the web app -- Translation memory for character-voice consistency across a file - General document/text translation beyond subtitles ## License From 05a3af5a0865c0c8401f7503b838e98ee252c4dc Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 18:27:36 +0300 Subject: [PATCH 11/15] Web: Remove unneeded favicon. --- web/public/favicon.ico | Bin 15086 -> 0 bytes web/src/index.html | 1 - 2 files changed, 1 deletion(-) delete mode 100644 web/public/favicon.ico diff --git a/web/public/favicon.ico b/web/public/favicon.ico deleted file mode 100644 index 57614f9c967596fad0a3989bec2b1deff33034f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15086 zcmd^G33O9Omi+`8$@{|M-I6TH3wzF-p5CV8o}7f~KxR60LK+ApEFB<$bcciv%@SmA zV{n>g85YMFFeU*Uvl=i4v)C*qgnb;$GQ=3XTe9{Y%c`mO%su)noNCCQ*@t1WXn|B(hQ7i~ zrUK8|pUkD6#lNo!bt$6)jR!&C?`P5G(`e((P($RaLeq+o0Vd~f11;qB05kdbAOm?r zXv~GYr_sibQO9NGTCdT;+G(!{4Xs@4fPak8#L8PjgJwcs-Mm#nR_Z0s&u?nDX5^~@ z+A6?}g0|=4e_LoE69pPFO`yCD@BCjgKpzMH0O4Xs{Ahc?K3HC5;l=f zg>}alhBXX&);z$E-wai+9TTRtBX-bWYY@cl$@YN#gMd~tM_5lj6W%8ah4;uZ;jP@Q zVbuel1rPA?2@x9Y+u?e`l{Z4ngfG5q5BLH5QsEu4GVpt{KIp1?U)=3+KQ;%7ec8l* zdV=zZgN5>O3G(3L2fqj3;oBbZZw$Ij@`Juz@?+yy#OPw)>#wsTewVgTK9BGt5AbZ&?K&B3GVF&yu?@(Xj3fR3n+ZP0%+wo)D9_xp>Z$`A4 zfV>}NWjO#3lqumR0`gvnffd9Ka}JJMuHS&|55-*mCD#8e^anA<+sFZVaJe7{=p*oX zE_Uv?1>e~ga=seYzh{9P+n5<+7&9}&(kwqSaz;1aD|YM3HBiy<))4~QJSIryyqp| z8nGc(8>3(_nEI4n)n7j(&d4idW1tVLjZ7QbNLXg;LB ziHsS5pXHEjGJZb59KcvS~wv;uZR-+4qEqow`;JCfB*+b^UL^3!?;-^F%yt=VjU|v z39SSqKcRu_NVvz!zJzL0CceJaS6%!(eMshPv_0U5G`~!a#I$qI5Ic(>IONej@aH=f z)($TAT#1I{iCS4f{D2+ApS=$3E7}5=+y(rA9mM#;Cky%b*Gi0KfFA`ofKTzu`AV-9 znW|y@19rrZ*!N2AvDi<_ZeR3O2R{#dh1#3-d%$k${Rx42h+i&GZo5!C^dSL34*AKp z27mTd>k>?V&X;Nl%GZ(>0s`1UN~Hfyj>KPjtnc|)xM@{H_B9rNr~LuH`Gr5_am&Ep 
zTjZA8hljNj5H1Ipm-uD9rC}U{-vR!eay5&6x6FkfupdpT*84MVwGpdd(}ib)zZ3Ky z7C$pnjc82(W_y_F{PhYj?o!@3__UUvpX)v69aBSzYj3 zdi}YQkKs^SyXyFG2LTRz9{(w}y~!`{EuAaUr6G1M{*%c+kP1olW9z23dSH!G4_HSK zzae-DF$OGR{ofP*!$a(r^5Go>I3SObVI6FLY)N@o<*gl0&kLo-OT{Tl*7nCz>Iq=? zcigIDHtj|H;6sR?or8Wd_a4996GI*CXGU}o;D9`^FM!AT1pBY~?|4h^61BY#_yIfO zKO?E0 zJ{Pc`9rVEI&$xxXu`<5E)&+m(7zX^v0rqofLs&bnQT(1baQkAr^kEsk)15vlzAZ-l z@OO9RF<+IiJ*O@HE256gCt!bF=NM*vh|WVWmjVawcNoksRTMvR03H{p@cjwKh(CL4 z7_PB(dM=kO)!s4fW!1p0f93YN@?ZSG` z$B!JaAJCtW$B97}HNO9(x-t30&E}Mo1UPi@Av%uHj~?T|!4JLwV;KCx8xO#b9IlUW zI6+{a@Wj|<2Y=U;a@vXbxqZNngH8^}LleE_4*0&O7#3iGxfJ%Id>+sb;7{L=aIic8 z|EW|{{S)J-wr@;3PmlxRXU8!e2gm_%s|ReH!reFcY8%$Hl4M5>;6^UDUUae?kOy#h zk~6Ee_@ZAn48Bab__^bNmQ~+k=02jz)e0d9Z3>G?RGG!65?d1>9}7iG17?P*=GUV-#SbLRw)Hu{zx*azHxWkGNTWl@HeWjA?39Ia|sCi{e;!^`1Oec zb>Z|b65OM*;eC=ZLSy?_fg$&^2xI>qSLA2G*$nA3GEnp3$N-)46`|36m*sc#4%C|h zBN<2U;7k>&G_wL4=Ve5z`ubVD&*Hxi)r@{4RCDw7U_D`lbC(9&pG5C*z#W>8>HU)h z!h3g?2UL&sS!oY5$3?VlA0Me9W5e~V;2jds*fz^updz#AJ%G8w2V}AEE?E^=MK%Xt z__Bx1cr7+DQmuHmzn*|hh%~eEc9@m05@clWfpEFcr+06%0&dZJH&@8^&@*$qR@}o3 z@Tuuh2FsLz^zH+dN&T&?0G3I?MpmYJ;GP$J!EzjeM#YLJ!W$}MVNb0^HfOA>5Fe~UNn%Zk(PT@~9}1dt)1UQ zU*B5K?Dl#G74qmg|2>^>0WtLX#Jz{lO4NT`NYB*(L#D|5IpXr9v&7a@YsGp3vLR7L zHYGHZg7{ie6n~2p$6Yz>=^cEg7tEgk-1YRl%-s7^cbqFb(U7&Dp78+&ut5!Tn(hER z|Gp4Ed@CnOPeAe|N>U(dB;SZ?NU^AzoD^UAH_vamp6Ws}{|mSq`^+VP1g~2B{%N-!mWz<`)G)>V-<`9`L4?3dM%Qh6<@kba+m`JS{Ya@9Fq*m6$$ zA1%Ogc~VRH33|S9l%CNb4zM%k^EIpqY}@h{w(aBcJ9c05oiZx#SK9t->5lSI`=&l~ z+-Ic)a{FbBhXV$Xt!WRd`R#Jk-$+_Z52rS>?Vpt2IK<84|E-SBEoIw>cs=a{BlQ7O z-?{Fy_M&84&9|KM5wt~)*!~i~E=(6m8(uCO)I=)M?)&sRbzH$9Rovzd?ZEY}GqX+~ zFbEbLz`BZ49=2Yh-|<`waK-_4!7`ro@zlC|r&I4fc4oyb+m=|c8)8%tZ-z5FwhzDt zL5kB@u53`d@%nHl0Sp)Dw`(QU&>vujEn?GPEXUW!Wi<+4e%BORl&BIH+SwRcbS}X@ z01Pk|vA%OdJKAs17zSXtO55k!;%m9>1eW9LnyAX4uj7@${O6cfii`49qTNItzny5J zH&Gj`e}o}?xjQ}r?LrI%FjUd@xflT3|7LA|ka%Q3i}a8gVm<`HIWoJGH=$EGClX^C0lysQJ>UO(q&;`T#8txuoQ_{l^kEV9CAdXuU1Ghg8 zN_6hHFuy&1x24q5-(Z7;!poYdt*`UTdrQOIQ!2O7_+AHV2hgXaEz7)>$LEdG 
z<8vE^Tw$|YwZHZDPM!SNOAWG$?J)MdmEk{U!!$M#fp7*Wo}jJ$Q(=8>R`Ats?e|VU?Zt7Cdh%AdnfyN3MBWw{ z$OnREvPf7%z6`#2##_7id|H%Y{vV^vWXb?5d5?a_y&t3@p9t$ncHj-NBdo&X{wrfJ zamN)VMYROYh_SvjJ=Xd!Ga?PY_$;*L=SxFte!4O6%0HEh%iZ4=gvns7IWIyJHa|hT z2;1+e)`TvbNb3-0z&DD_)Jomsg-7p_Uh`wjGnU1urmv1_oVqRg#=C?e?!7DgtqojU zWoAB($&53;TsXu^@2;8M`#z{=rPy?JqgYM0CDf4v@z=ZD|ItJ&8%_7A#K?S{wjxgd z?xA6JdJojrWpB7fr2p_MSsU4(R7=XGS0+Eg#xR=j>`H@R9{XjwBmqAiOxOL` zt?XK-iTEOWV}f>Pz3H-s*>W z4~8C&Xq25UQ^xH6H9kY_RM1$ch+%YLF72AA7^b{~VNTG}Tj#qZltz5Q=qxR`&oIlW Nr__JTFzvMr^FKp4S3v*( diff --git a/web/src/index.html b/web/src/index.html index c627cdf..a32b061 100644 --- a/web/src/index.html +++ b/web/src/index.html @@ -6,7 +6,6 @@ - From d262658e019c261f63974a5465bc40245624d2ce Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 21:04:36 +0300 Subject: [PATCH 12/15] CLI: Remove unneeded comments. --- cli/core/batch_runner.py | 43 ++++---------------- cli/core/config.py | 11 ++---- cli/core/context_pass.py | 40 ++++--------------- cli/core/formats/__init__.py | 10 +---- cli/core/formats/types.py | 14 +------ cli/core/live_status.py | 43 ++++---------------- cli/core/srt_parser.py | 41 +++---------------- cli/core/time_tracker.py | 9 +---- cli/core/translator.py | 32 +++++---------- cli/tests/test_context_pass.py | 72 +++++++++++++++++----------------- cli/translora.py | 37 ++++------------- 11 files changed, 91 insertions(+), 261 deletions(-) diff --git a/cli/core/batch_runner.py b/cli/core/batch_runner.py index 6d01a90..bb9dbb5 100644 --- a/cli/core/batch_runner.py +++ b/cli/core/batch_runner.py @@ -1,10 +1,4 @@ -"""Per-batch HTTP call, response sanitizing, and retry loop. - -This is the "send one batch, get it back validated" layer. It knows how -to talk to an OpenAI-compatible chat endpoint and how to recover from -transient failures. Everything above this layer (translator.py) just -asks for batches and stitches them together. 
-""" +"""Per-batch HTTP call, response sanitizing, and retry loop.""" from __future__ import annotations @@ -26,16 +20,11 @@ class FileTranslationError(Exception): - """A batch used up all its retries — the whole file is considered failed.""" - + """A batch exhausted its retries; the whole file is considered failed.""" -# --------------------------------------------------------------------------- -# Input sanitization — users paste URLs/keys in all kinds of shapes. -# --------------------------------------------------------------------------- def sanitize_api_url(url: str) -> str: - """Drop credential query params like `?key=...` so we don't authenticate - twice when the user pastes a pre-keyed URL.""" + """Drop credential query params so we don't authenticate twice.""" url = (url or "").strip() if not url: return url @@ -50,7 +39,6 @@ def sanitize_api_url(url: str) -> str: def sanitize_api_key(key: str) -> str: - """Strip whitespace, surrounding quotes, and any `Bearer ` prefix.""" k = (key or "").strip() if (k.startswith('"') and k.endswith('"')) or \ (k.startswith("'") and k.endswith("'")): @@ -61,7 +49,6 @@ def sanitize_api_key(key: str) -> str: def strip_markdown_fences(text: str) -> str: - """LLMs sometimes wrap output in ```...``` despite being told not to.""" text = text.strip() if text.startswith("```"): text = re.sub(r"^```[a-zA-Z]*\n?", "", text) @@ -70,14 +57,9 @@ def strip_markdown_fences(text: str) -> str: def is_retryable_http(code: int) -> bool: - """Retry on timeout / rate-limit / server errors. 
Everything else is fatal.""" return code in (408, 429) or code >= 500 -# --------------------------------------------------------------------------- -# HTTP call + retry -# --------------------------------------------------------------------------- - async def call_chat_api( client: httpx.AsyncClient, system_prompt: str, @@ -85,7 +67,6 @@ async def call_chat_api( cfg: TranslationConfig, max_tokens: int, ) -> str: - """POST one chat request to the OpenAI-compatible endpoint, return raw text.""" body: dict = { "messages": [ {"role": "system", "content": system_prompt}, @@ -116,7 +97,6 @@ def _build_user_message( file_context: FileContext | None, batch: list[SubtitleBlock], ) -> str: - """Assemble the user message, prepending any relevant glossary slice.""" if cfg.source_lang: header = f"Translate from {cfg.source_lang} to {cfg.target_lang}:" else: @@ -139,13 +119,11 @@ async def translate_batch_with_retry( file_context: FileContext | None = None, _split_path: str = "", ) -> list[SubtitleBlock]: - """Translate one batch; on repeated validation failure split it in half. + """Translate one batch; on repeated validation failure, halve and recurse. Persistent count mismatches usually mean the model is deterministically - merging two adjacent similar-looking blocks (e.g., repeated reactions - like "Oh." / "Oh!"). Splitting gives the model fewer similar blocks to - confuse and almost always resolves the merge. We keep halving until we - reach single-block batches, which can't have count mismatches. + merging two adjacent similar-looking blocks. Halving keeps terminating + because at N=1 a count mismatch is impossible. """ batch_wire = serialize_lite(batch) user_msg = _build_user_message(cfg, batch_wire, file_context, batch) @@ -163,7 +141,6 @@ async def translate_batch_with_retry( client, SYSTEM_PROMPT, user_msg, cfg, max(len(batch), 1) * 120, ) output = parse_lite(strip_markdown_fences(raw)) - # Reattach timestamps from the original input positionally. 
if len(output) == len(batch): output = [ SubtitleBlock(number=batch[i].number, @@ -192,15 +169,12 @@ async def translate_batch_with_retry( await asyncio.sleep(delay) continue - except Exception as e: # network error, JSON decode error, etc. + except Exception as e: cfg.warn(f" {label} request failed ({tag}): {e}") - # Small back-off before the next attempt (1s, 2s, 3s cap). if attempt < attempts: await asyncio.sleep(min(attempt, 3)) - # All attempts exhausted. If we hit validation errors and can still split, - # cut the batch in half and retry each half independently. Otherwise fail. if hit_validation_failure and can_split: mid = len(batch) // 2 left, right = batch[:mid], batch[mid:] @@ -209,8 +183,7 @@ async def translate_batch_with_retry( ) left_path = (_split_path + "L") if _split_path else "L" right_path = (_split_path + "R") if _split_path else "R" - # Sequential: parallel halves would oversubscribe the outer semaphore's - # per-batch slot and starve other batches. + # Sequential: parallel halves would oversubscribe the outer semaphore. left_result = await translate_batch_with_retry( client, batch_idx, left, cfg, file_context, left_path, ) diff --git a/cli/core/config.py b/cli/core/config.py index acd9e56..67b98bf 100644 --- a/cli/core/config.py +++ b/cli/core/config.py @@ -18,14 +18,9 @@ def _stderr_warn(msg: str) -> None: @dataclass class TranslationConfig: - """Everything a translation run needs beyond the file paths. - - Bundled so we aren't threading 8+ arguments through every helper. - `warn` lets callers intercept retry/validation messages so they can be - routed around a live progress line instead of clobbering it. Default is - silent — pass --verbose on the CLI to surface retry/validation chatter. - """ - source_lang: str # "" means auto-detect from the text + """Per-run config. 
`warn` is the retry/validation sink — silent by default, + rebindable by callers so it can route around a live progress line.""" + source_lang: str # "" means auto-detect target_lang: str api_url: str api_key: str diff --git a/cli/core/context_pass.py b/cli/core/context_pass.py index 2a65008..8b3dedf 100644 --- a/cli/core/context_pass.py +++ b/cli/core/context_pass.py @@ -1,14 +1,5 @@ -"""One-shot prepass: scan the whole file for cast, recurring terms, and tone -notes before batched translation begins. - -The goal is to fix gendered-pronoun errors in languages like Arabic, where -the model must pick masculine/feminine forms but English gives no signal. -We send the full source text once, ask for a compact glossary, and inject -the relevant slice into each batch's prompt. - -If the scan fails for any reason, callers get an empty FileContext and -translation proceeds exactly as it did before. -""" +"""Prepass scan: extract cast, terms, and register from the whole file once +so every batch shares the same glossary. Fails silently to an empty context.""" from __future__ import annotations @@ -51,9 +42,7 @@ - Leave a section empty (tags only) if nothing qualifies. Never omit a section.\ """ -# Rough cap on source text sent to the scan. Tuned so small-context models -# (4k-8k total) still have room for the system prompt, the output, and a -# safety margin. ~4 chars ≈ 1 token on Latin text. +# Sized so small-context models (4k-8k) still have room for prompt + output. _SCAN_CHAR_BUDGET = 12_000 _SCAN_MAX_TOKENS = 1500 @@ -88,12 +77,8 @@ def is_empty(self) -> bool: return not (self.register or self.characters or self.terms or self.notes) def render_for_batch(self, batch: list[SubtitleBlock]) -> str: - """Return only the glossary slice relevant to this batch. - - Register and notes are always included (short, file-wide). Characters - and terms are included only if their source form appears in the batch. - Returns an empty string when there is nothing worth injecting. 
- """ + """Return a glossary slice scoped to names/terms present in this batch. + Register and notes are file-wide and always included if set.""" text = "\n".join(b.text for b in batch) chars = [h for h in self.characters if _contains_word(text, h.source)] terms = [h for h in self.terms if _contains_word(text, h.source)] @@ -122,18 +107,12 @@ def _contains_word(text: str, word: str) -> bool: def serialize_for_scan(blocks: list[SubtitleBlock]) -> str: - """Serialize subtitle text for the scan pass — no numbers, no timestamps. - - For large files we can't fit every block in the scan call, so we stride- - sample evenly across the whole file. Sampling preserves each character's - chance of appearing at least once in the glossary, regardless of where - they're first introduced. - """ + """Text for the scan pass. Stride-samples large files so characters + introduced late still have a chance to land in the glossary.""" total_chars = sum(len(b.text) + 1 for b in blocks) if total_chars <= _SCAN_CHAR_BUDGET or len(blocks) <= 1: return "\n".join(b.text for b in blocks) - # Estimate how many blocks fit in the budget, then sample evenly. take_n = max(1, int(len(blocks) * _SCAN_CHAR_BUDGET / total_chars)) step = len(blocks) / take_n sampled = [blocks[int(i * step)] for i in range(take_n)] @@ -147,7 +126,6 @@ def parse_context_response(text: str) -> FileContext: for m in _SECTION_RE.finditer(text or "") } - # Register is free-form but must be a single line; collapse any whitespace. register = " ".join(sections.get("register", "").split()).strip().lstrip("-*• ").strip() characters: list[CharacterHint] = [] @@ -219,9 +197,7 @@ async def extract_file_context( context = parse_context_response(strip_markdown_fences(raw)) if context.is_empty(): - # Diagnostic: show a short snippet of what the model actually returned - # so it's obvious whether it ignored the tagged format, truncated, or - # refused. Truncated hard to keep noise down. 
+ # Diagnostic snippet: helps tell whether the model ignored tags, truncated, or refused. snippet = (raw or "").strip().replace("\n", " ")[:240] cfg.warn(f" Context scan returned empty glossary. Raw start: {snippet!r}") return context diff --git a/cli/core/formats/__init__.py b/cli/core/formats/__init__.py index 16eaf81..92d852d 100644 --- a/cli/core/formats/__init__.py +++ b/cli/core/formats/__init__.py @@ -1,11 +1,6 @@ """Multi-format subtitle parsing via pysubs2 (with a small SBV fallback). - -Every supported format is normalized to SRT-shape blocks (sequential numbers -starting at 1 and ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timestamps) so the LLM -always sees the same structure. Rebuild delegates back to the underlying -library for each source format, preserving headers, styles, and per-cue -metadata. -""" +Every format is normalized to SRT-shape blocks so the LLM sees one structure; +rebuild delegates back to pysubs2 to preserve headers, styles, and per-cue metadata.""" from __future__ import annotations @@ -27,7 +22,6 @@ def parse_subtitle(file_name: str, content: str) -> SubtitleDocument: - """Dispatch to the right parser based on the filename's extension.""" ext = Path(file_name).suffix.lower().lstrip(".") if ext == "sbv": return parse_sbv(content) diff --git a/cli/core/formats/types.py b/cli/core/formats/types.py index 3c51221..2673451 100644 --- a/cli/core/formats/types.py +++ b/cli/core/formats/types.py @@ -12,24 +12,14 @@ @dataclass class SubtitleDocument: - """A parsed subtitle file. - - `blocks` are always normalized into SRT-style shape (sequential numbers - starting at 1 and ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timestamps) so the LLM - always sees the same structure. `rebuild` takes the translated blocks and - returns the file serialized back into the original format. - """ + """Parsed file. 
`blocks` use normalized SRT shape; `rebuild` serializes back + to the original format.""" format: SubtitleFormat blocks: list[SubtitleBlock] rebuild: Callable[[list[SubtitleBlock]], str] -# --------------------------------------------------------------------------- # -# Helpers -# --------------------------------------------------------------------------- # - - def strip_bom(s: str) -> str: return s[1:] if s.startswith("\ufeff") else s diff --git a/cli/core/live_status.py b/cli/core/live_status.py index ebb7570..234c6b5 100644 --- a/cli/core/live_status.py +++ b/cli/core/live_status.py @@ -1,8 +1,6 @@ -"""Terminal-friendly live status rendering: ANSI colors, in-place line -updates, and a background ticker for multi-file progress. +"""Terminal-friendly live status: ANSI colors, in-place line updates, ticker. -All output is inert (plain text) when stdout isn't a TTY or when the -user sets `NO_COLOR=1` — this way logs piped to a file stay clean. +Output is plain text when stdout isn't a TTY or when NO_COLOR=1. """ from __future__ import annotations @@ -13,13 +11,7 @@ from typing import Callable -# --------------------------------------------------------------------------- -# Colors -# --------------------------------------------------------------------------- - class Colors: - """ANSI color helpers. Becomes a no-op when colors are disabled.""" - def __init__(self, enabled: bool | None = None) -> None: if enabled is None: enabled = ( @@ -41,19 +33,10 @@ def dim(self, t: str) -> str: return self._wrap("2", t) def bold(self, t: str) -> str: return self._wrap("1", t) -# --------------------------------------------------------------------------- -# In-place single-line updates -# --------------------------------------------------------------------------- - class LiveLine: - """A single terminal line you can rewrite over and over with `update()`. - - Use `println()` to print something *above* the live line (e.g. an error - or a completion message) without clobbering it. 
When you're done, call - `finalize()` to move the cursor to a fresh line. - - When the output isn't a TTY, `update()` just prints each message as its - own line so logs stay readable. + """A single terminal line rewritten via update(). Use println() to emit + text above the live line without clobbering it. Falls back to plain lines + on non-TTY streams. """ def __init__(self, enabled: bool | None = None, stream=sys.stdout) -> None: @@ -69,7 +52,7 @@ def update(self, text: str) -> None: with self._lock: print(text, file=self.stream, flush=True) return - # Strip ANSI codes when measuring width so padding works correctly. + # Strip ANSI when measuring width so padding clears the full line. visible_len = _visible_len(text) with self._lock: pad = " " * max(0, self._last_len - visible_len) @@ -78,7 +61,6 @@ def update(self, text: str) -> None: self._last_len = visible_len def println(self, text: str, file=None) -> None: - """Print `text` on its own line, above the live status.""" target = file or self.stream with self._lock: if self.enabled and self._last_len: @@ -88,7 +70,6 @@ def println(self, text: str, file=None) -> None: self._last_len = 0 def finalize(self) -> None: - """Drop a newline after the live line so normal prints resume below.""" with self._lock: if self.enabled and self._last_len: self.stream.write("\n") @@ -97,12 +78,10 @@ def finalize(self) -> None: def _visible_len(text: str) -> int: - """Length of `text` ignoring ANSI escape sequences.""" out = 0 i = 0 while i < len(text): if text[i] == "\033" and i + 1 < len(text) and text[i + 1] == "[": - # Skip until the final 'm'. j = text.find("m", i + 2) if j == -1: break @@ -113,15 +92,9 @@ def _visible_len(text: str) -> int: return out -# --------------------------------------------------------------------------- -# Background ticker -# --------------------------------------------------------------------------- - class Ticker: - """Calls `render_fn` from a background thread every `interval` seconds. 
- - Swallows exceptions inside the callback so a transient render error can't - kill the whole translation run. + """Calls render_fn from a background thread every interval seconds. + Swallows callback exceptions so a transient render error can't kill the run. """ def __init__(self, render_fn: Callable[[], None], interval: float = 1.0) -> None: diff --git a/cli/core/srt_parser.py b/cli/core/srt_parser.py index 8c22c44..5811893 100644 --- a/cli/core/srt_parser.py +++ b/cli/core/srt_parser.py @@ -12,72 +12,49 @@ @dataclass class SubtitleBlock: - """A single .srt subtitle block.""" - number: int timestamp: str - text: str # may contain multiple lines separated by \n + text: str def parse_srt(content: str) -> list[SubtitleBlock]: - """Parse raw .srt content into a list of SubtitleBlock objects. - - Handles various line-ending styles and tolerates minor formatting issues. - """ - # Normalize line endings content = content.replace("\r\n", "\n").replace("\r", "\n") - - # Strip BOM if present if content.startswith("\ufeff"): content = content[1:] - - # Split on double newlines (one or more blank lines between blocks) raw_blocks = re.split(r"\n\n+", content.strip()) blocks: list[SubtitleBlock] = [] - for raw in raw_blocks: lines = raw.strip().split("\n") if len(lines) < 2: continue - - # First line: block number try: number = int(lines[0].strip()) except ValueError: continue - - # Second line: timestamp timestamp = lines[1].strip() if not TIMESTAMP_RE.match(timestamp): continue - - # Remaining lines: subtitle text text = "\n".join(lines[2:]) if len(lines) > 2 else "" - blocks.append(SubtitleBlock(number=number, timestamp=timestamp, text=text)) return blocks def serialize_srt(blocks: list[SubtitleBlock]) -> str: - """Serialize SubtitleBlock list back into .srt file content.""" parts: list[str] = [] for block in blocks: parts.append(f"{block.number}\n{block.timestamp}\n{block.text}") return "\n\n".join(parts) + "\n" -# Wire format sent to the LLM: number + text only. 
Timestamps are pure noise -# for the model — it just echoes them back, and small models sometimes corrupt -# a digit. We strip them before sending and reattach from the original input. +# Wire format: number + text only. Timestamps are stripped before sending +# because small models sometimes corrupt them; callers reattach positionally. def serialize_lite(blocks: list[SubtitleBlock]) -> str: return "\n\n".join(f"{b.number}\n{b.text}" for b in blocks) + "\n" def parse_lite(content: str) -> list[SubtitleBlock]: - """Parse the wire-format response. Timestamps are left empty — callers - reattach them positionally from the original batch.""" content = content.replace("\r\n", "\n").replace("\r", "\n") if content.startswith("\ufeff"): content = content[1:] @@ -98,14 +75,11 @@ def parse_lite(content: str) -> list[SubtitleBlock]: def split_batches(blocks: list[SubtitleBlock], batch_size: int = 15) -> list[list[SubtitleBlock]]: - """Split blocks into batches of the given size.""" return [blocks[i : i + batch_size] for i in range(0, len(blocks), batch_size)] @dataclass class ValidationResult: - """Result of batch validation.""" - ok: bool error: str = "" @@ -114,15 +88,12 @@ def validate_batch( input_blocks: list[SubtitleBlock], output_blocks: list[SubtitleBlock], ) -> ValidationResult: - """Validate that the translated batch matches the input structure.""" - # 1. Block count if len(input_blocks) != len(output_blocks): return ValidationResult( ok=False, error=f"Block count mismatch: expected {len(input_blocks)}, got {len(output_blocks)}", ) - # 2. Block number sequence for i, (inp, out) in enumerate(zip(input_blocks, output_blocks)): if inp.number != out.number: return ValidationResult( @@ -130,7 +101,6 @@ def validate_batch( error=f"Block number mismatch at index {i}: expected {inp.number}, got {out.number}", ) - # 3. 
Timestamps unchanged for i, (inp, out) in enumerate(zip(input_blocks, output_blocks)): if inp.timestamp != out.timestamp: return ValidationResult( @@ -138,9 +108,8 @@ def validate_batch( error=f"Timestamp modified at block {inp.number}: expected '{inp.timestamp}', got '{out.timestamp}'", ) - # 4. Non-empty input must produce non-empty output. Catches the silent - # data-loss case where the model shifts blocks and leaves a tail block blank - # while preserving count/numbers/timestamps. + # Catches silent data-loss where the model shifts blocks and leaves a + # tail block blank while preserving count/numbers/timestamps. for inp, out in zip(input_blocks, output_blocks): if inp.text.strip() and not out.text.strip(): return ValidationResult( diff --git a/cli/core/time_tracker.py b/cli/core/time_tracker.py index 62b3526..b8bbf1e 100644 --- a/cli/core/time_tracker.py +++ b/cli/core/time_tracker.py @@ -4,7 +4,6 @@ def format_duration(seconds: float) -> str: - """'42s', '3m 20s', '1h 5m'.""" total = int(round(seconds)) if total < 60: return f"{total}s" @@ -14,12 +13,8 @@ def format_duration(seconds: float) -> str: class EtaEstimator: - """Estimates remaining time based on how long completed batches took. - - We ignore the first `concurrency` completions because they all finish - roughly at once (they started together) — using them would wildly - underestimate the true rate. - """ + """ETA based on completed batches. Skips the first `concurrency` completions + since they all finish near-simultaneously and would skew the rate.""" def __init__(self, total: int, concurrency: int, start: float) -> None: self.total = total diff --git a/cli/core/translator.py b/cli/core/translator.py index a13b6cf..a4d68a6 100644 --- a/cli/core/translator.py +++ b/cli/core/translator.py @@ -1,9 +1,4 @@ -"""Translate an .srt file end-to-end by sending batches to an LLM chat API. 
- -Pipeline: - read file -> parse blocks -> split into batches -> send each batch in - parallel -> validate response -> stitch translated batches back together. -""" +"""Per-file orchestration: parse, prepass scan, batched translate, stitch.""" from __future__ import annotations @@ -23,8 +18,7 @@ from .live_status import Colors, LiveLine, Ticker -# Re-exports — callers (translora.py) import these names from translator -# so they don't need to know the internal module layout. +# Re-exported so translora.py doesn't need to import from submodules directly. __all__ = [ "DEFAULT_MAX_RETRIES", "FileTranslationError", @@ -39,7 +33,7 @@ async def translate_file_async( output_path: Path, cfg: TranslationConfig, ) -> None: - """Translate one .srt file end-to-end. + """Translate one subtitle file end-to-end. Raises FileTranslationError on any batch that exhausts retries. """ @@ -90,8 +84,6 @@ async def translate_file_async( batches, cfg, colors, started_at, file_context, ) - # Stitch in order (they completed out-of-order but `_run_batches` returns - # them indexed by their original position). translated: list[SubtitleBlock] = [] for r in translated_batches: translated.extend(r) @@ -114,31 +106,27 @@ async def _run_batches( started_at: float, file_context: FileContext | None = None, ) -> list[list[SubtitleBlock]]: - """Translate every batch with up to `cfg.concurrency` requests in flight. + """Translate every batch with up to cfg.concurrency requests in flight. - Returns results in original batch order. Raises FileTranslationError - and cancels remaining work the moment any batch fails fatally. + Results are returned in original batch order. A fatal batch failure cancels + remaining work. 
""" total = len(batches) results: list[list[SubtitleBlock] | None] = [None] * total eta = EtaEstimator(total, cfg.concurrency, started_at) semaphore = asyncio.Semaphore(cfg.concurrency) - # Shared cancellation flag — as soon as any batch fails fatally we stop - # scheduling new work rather than wasting retries on doomed batches. failure: FileTranslationError | None = None live = LiveLine() if not cfg.quiet else None - # Route batch-level retry/error messages above the live line so they don't - # get clobbered by the progress refresh. Only when verbose — otherwise warn - # is a no-op and the routing wrapper would undo that. + # Route verbose warnings above the live line so the progress refresh + # doesn't clobber them. In non-verbose mode warn is a no-op, so leave it. original_warn = cfg.warn if live is not None and cfg.verbose: cfg.warn = lambda msg: live.println(colors.yellow(msg), file=sys.stderr) - # Shared with the ticker so the "batch" column keeps showing the last - # completed batch's time between completions. + # Held between completions so the ticker keeps showing the last batch time. last_batch_elapsed = 0.0 def render() -> None: @@ -182,7 +170,6 @@ async def run_one(idx: int) -> None: if failure: raise failure - # All slots must be filled now — `failure` would have been raised otherwise. 
return [r for r in results if r is not None] @@ -193,7 +180,6 @@ def _render_status( batch_elapsed: float, eta: EtaEstimator, ) -> None: - """Draw the single in-place progress line for one batch completion.""" done = eta.done pct = int(100 * done / total) if total else 0 elapsed = time.time() - eta.start diff --git a/cli/tests/test_context_pass.py b/cli/tests/test_context_pass.py index 7742d45..50fefa2 100644 --- a/cli/tests/test_context_pass.py +++ b/cli/tests/test_context_pass.py @@ -16,36 +16,36 @@ def _block(n: int, text: str) -> SubtitleBlock: def test_parse_well_formed_response(): raw = """ -Modern Standard Arabic, neutral +Target language, neutral register -Amy => إيمي | female -Jake => جيك | male -Stranger => غريب | unknown +Alice => TargetAlice | female +Bob => TargetBob | male +Stranger => TargetStranger | unknown -precinct => قسم الشرطة +headquarters => TargetHQ -- Modern police procedural +- Workplace drama - Casual register """ ctx = parse_context_response(raw) - assert ctx.register == "Modern Standard Arabic, neutral" + assert ctx.register == "Target language, neutral register" assert ctx.characters == [ - CharacterHint("Amy", "إيمي", "female"), - CharacterHint("Jake", "جيك", "male"), - CharacterHint("Stranger", "غريب", "unknown"), + CharacterHint("Alice", "TargetAlice", "female"), + CharacterHint("Bob", "TargetBob", "male"), + CharacterHint("Stranger", "TargetStranger", "unknown"), ] - assert ctx.terms == [TermHint("precinct", "قسم الشرطة")] - assert ctx.notes == ["Modern police procedural", "Casual register"] + assert ctx.terms == [TermHint("headquarters", "TargetHQ")] + assert ctx.notes == ["Workplace drama", "Casual register"] def test_parse_register_collapses_whitespace_and_bullet(): raw = """ - - Brazilian Portuguese, + - Target language, casual @@ -56,36 +56,36 @@ def test_parse_register_collapses_whitespace_and_bullet(): """ ctx = parse_context_response(raw) - assert ctx.register == "Brazilian Portuguese, casual" + assert ctx.register == 
"Target language, casual" def test_render_includes_register_line_even_when_no_matches(): ctx = FileContext( - register="Modern Standard Arabic, neutral", - characters=[CharacterHint("Amy", "إيمي", "female")], + register="Target language, neutral", + characters=[CharacterHint("Alice", "TargetAlice", "female")], terms=[], notes=[], ) batch = [_block(1, "Nobody named here.")] rendered = ctx.render_for_batch(batch) - assert "Target register: Modern Standard Arabic, neutral" in rendered - assert "Amy" not in rendered + assert "Target register: Target language, neutral" in rendered + assert "Alice" not in rendered def test_is_empty_considers_register(): assert FileContext().is_empty() - assert not FileContext(register="MSA").is_empty() + assert not FileContext(register="Target language").is_empty() def test_parse_tolerates_missing_sections_and_bullets(): raw = """ -- Amy => إيمي | female -* Jake => جيك | MALE +- Alice => TargetAlice | female +* Bob => TargetBob | MALE """ ctx = parse_context_response(raw) - assert [h.source for h in ctx.characters] == ["Amy", "Jake"] + assert [h.source for h in ctx.characters] == ["Alice", "Bob"] assert ctx.characters[1].gender == "male" assert ctx.terms == [] assert ctx.notes == [] @@ -99,23 +99,23 @@ def test_parse_garbage_returns_empty(): def test_render_for_batch_only_includes_matching_characters(): ctx = FileContext( characters=[ - CharacterHint("Amy", "إيمي", "female"), - CharacterHint("Jake", "جيك", "male"), + CharacterHint("Alice", "TargetAlice", "female"), + CharacterHint("Bob", "TargetBob", "male"), ], - terms=[TermHint("precinct", "قسم الشرطة")], - notes=["Police procedural"], + terms=[TermHint("headquarters", "TargetHQ")], + notes=["Workplace drama"], ) - batch = [_block(1, "Amy, come here."), _block(2, "I'm tired.")] + batch = [_block(1, "Alice, come here."), _block(2, "I'm tired.")] rendered = ctx.render_for_batch(batch) - assert "Amy" in rendered - assert "Jake" not in rendered - assert "precinct" not in rendered - assert 
"Police procedural" in rendered + assert "Alice" in rendered + assert "Bob" not in rendered + assert "headquarters" not in rendered + assert "Workplace drama" in rendered def test_render_for_batch_empty_when_nothing_matches_and_no_notes(): ctx = FileContext( - characters=[CharacterHint("Amy", "إيمي", "female")], + characters=[CharacterHint("Alice", "TargetAlice", "female")], terms=[], notes=[], ) @@ -125,13 +125,13 @@ def test_render_for_batch_empty_when_nothing_matches_and_no_notes(): def test_render_word_boundary_does_not_match_substrings(): ctx = FileContext( - characters=[CharacterHint("Amy", "إيمي", "female")], + characters=[CharacterHint("Alice", "TargetAlice", "female")], terms=[], notes=[], ) - # "Amy" inside "Amyloid" should not match. - batch = [_block(1, "Amyloid plaques.")] - assert "Amy" not in ctx.render_for_batch(batch) + # "Alice" as a substring of a longer word must not trigger a match. + batch = [_block(1, "Alicebot is online.")] + assert "Alice" not in ctx.render_for_batch(batch) def test_serialize_for_scan_returns_all_text_when_under_budget(): diff --git a/cli/translora.py b/cli/translora.py index 61b5baf..8758df9 100644 --- a/cli/translora.py +++ b/cli/translora.py @@ -39,14 +39,9 @@ """ -# Module-level palette — colors auto-disable on non-TTY / NO_COLOR. 
C = Colors() -# --------------------------------------------------------------------------- -# Argument parsing -# --------------------------------------------------------------------------- - def _build_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( description="TransLora — translate subtitle files using LLMs", @@ -84,13 +79,8 @@ def _build_parser() -> argparse.ArgumentParser: return p -# --------------------------------------------------------------------------- -# File discovery & output naming -# --------------------------------------------------------------------------- - - def _collect_files(paths: list[Path]) -> list[Path]: - """Expand user-supplied paths into a flat list of subtitle files.""" + """Expand paths into a flat list of subtitle files.""" files: list[Path] = [] for p in paths: if p.is_dir(): @@ -119,7 +109,7 @@ class Job: def _plan_jobs(args, srt_files: list[Path]) -> tuple[list[Job], int]: - """Decide which files still need translating. Returns (jobs, skipped).""" + """Return (jobs to run, skipped count) based on existing outputs.""" jobs: list[Job] = [] skipped = 0 total = len(srt_files) @@ -143,16 +133,11 @@ def _plan_jobs(args, srt_files: list[Path]) -> tuple[list[Job], int]: return jobs, skipped -# --------------------------------------------------------------------------- -# Parallel execution -# --------------------------------------------------------------------------- - async def _translate_all(args, jobs: list[Job]) -> tuple[int, list[tuple[Path, str]]]: - """Run all translation jobs with the configured parallelism.""" parallel = max(1, args.parallel_files) total_jobs = len(jobs) - # With 2+ jobs in flight, per-file live progress can't share the terminal. - # Switch translator into quiet mode and drive an overall ticker instead. + # Multi-file mode: per-file live progress can't share the terminal, so + # suppress per-file output and drive a run-wide ticker instead. 
multi_file = total_jobs > 1 cfg = TranslationConfig( source_lang=args.source, @@ -174,13 +159,11 @@ async def _translate_all(args, jobs: list[Job]) -> tuple[int, list[tuple[Path, s completed = 0 failed: list[tuple[Path, str]] = [] - # State used by both the ticker thread and the coroutines. Integer/list - # reads are atomic under the GIL — stale ticker data is just cosmetic. + # Shared with the ticker thread — atomic reads under the GIL, stale data + # is cosmetic. live = LiveLine() if multi_file else None use_ticker = live is not None and live.enabled - # Route any batch-level warnings above the ticker line — only when the - # user asked for verbose output; otherwise warn stays silent. if live is not None and cfg.verbose: cfg.warn = lambda msg: live.println(C.yellow(msg), file=sys.stderr) @@ -216,8 +199,8 @@ async def run_job(job: Job) -> None: elapsed = time.time() - start file_times.append(elapsed) completed += 1 - # In single-file mode the translator already printed its - # completion banner — avoid duplicating it. + # Single-file mode already prints a completion banner from + # the translator itself — don't duplicate it here. if live is not None: done = completed + len(failed) line = ( @@ -254,10 +237,6 @@ async def run_job(job: Job) -> None: return completed, failed -# --------------------------------------------------------------------------- -# Entry point -# --------------------------------------------------------------------------- - def _print_header(jobs_count: int, total_files: int, parallel: int, concurrency: int, skipped: int) -> None: print( From 06e92cba0f245cab579897b0c782643ede25a00a Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 21:11:16 +0300 Subject: [PATCH 13/15] Web: Remove unneeded comments. 
--- web/src/app/app.component.ts | 39 +++--------------- web/src/app/core/context-pass.ts | 41 ++++-------------- web/src/app/core/file-types.ts | 4 +- web/src/app/core/languages.ts | 8 +--- web/src/app/core/srt-parser.ts | 22 +--------- web/src/app/core/subtitle-formats/index.ts | 7 +--- web/src/app/core/subtitle-formats/types.ts | 12 +----- web/src/app/core/time-tracker.ts | 35 ++++------------ web/src/app/core/translation.service.ts | 48 ++++------------------ 9 files changed, 36 insertions(+), 180 deletions(-) diff --git a/web/src/app/app.component.ts b/web/src/app/app.component.ts index dbb8bc0..a3d8d2c 100644 --- a/web/src/app/app.component.ts +++ b/web/src/app/app.component.ts @@ -21,7 +21,6 @@ import { } from './core/file-types'; import { TimeTracker } from './core/time-tracker'; -// Defaults — kept in one place so "Reset defaults" is trivial const DEFAULTS = { sourceLang: '', targetLang: 'Arabic', @@ -44,15 +43,12 @@ export class AppComponent implements OnDestroy { providerKeys = PROVIDER_KEYS; presets = PROVIDER_PRESETS; - // Languages sourceLang = signal(DEFAULTS.sourceLang); targetLang = signal(DEFAULTS.targetLang); - // Files files = signal([]); dragOver = signal(false); - // Provider providerType = signal('custom'); apiUrl = signal(''); apiKey = signal(''); @@ -63,10 +59,8 @@ export class AppComponent implements OnDestroy { parallelFiles = signal(DEFAULTS.parallelFiles); maxRetries = signal(DEFAULTS.maxRetries); - // Theme theme = signal<'light' | 'dark'>('light'); - // Translation state isTranslating = signal(false); isCancelling = signal(false); fileStatuses = signal([]); @@ -78,7 +72,6 @@ export class AppComponent implements OnDestroy { private runController: AbortController | null = null; private cancelRequested = false; - // Computed currentPreset = computed(() => PROVIDER_PRESETS[this.providerType()]); totalBlocks = computed(() => @@ -113,7 +106,6 @@ export class AppComponent implements OnDestroy { return Math.round((sum / all.length) * 100); 
}); - // Elapsed / avg / ETA timing lives in a dedicated helper. tracker = new TimeTracker( this.doneFiles, this.inProgressFiles, @@ -204,8 +196,6 @@ export class AppComponent implements OnDestroy { this.tracker.destroy(); } - // --- Theme --- - private initTheme() { if (typeof window === 'undefined') { this.setTheme('light'); @@ -227,8 +217,6 @@ export class AppComponent implements OnDestroy { } } - // --- Files --- - onDragOver(event: DragEvent) { event.preventDefault(); event.stopPropagation(); @@ -327,8 +315,6 @@ export class AppComponent implements OnDestroy { }); } - // --- Provider --- - onProviderTypeChange(type: string) { this.providerType.set(type); const preset = PROVIDER_PRESETS[type]; @@ -348,14 +334,11 @@ export class AppComponent implements OnDestroy { swapLanguages() { const source = this.sourceLang(); - // Nothing sensible to swap when source is "auto-detect" — target stays as-is. if (!source) return; this.sourceLang.set(this.targetLang()); this.targetLang.set(source); } - // --- Translation --- - startTranslation() { if (!this.canTranslate()) return; @@ -371,7 +354,6 @@ export class AppComponent implements OnDestroy { status: 'pending' as const, })) ); - this.enqueue(this.files().map((_, i) => i), false); } @@ -381,7 +363,6 @@ export class AppComponent implements OnDestroy { this.errorMessage.set(''); this.isCancelling.set(false); - // Reset failed files to pending and collect their indices. const retryIndices: number[] = []; this.fileStatuses.update((statuses) => statuses.map((s, i) => { @@ -403,12 +384,9 @@ export class AppComponent implements OnDestroy { this.enqueue(retryIndices, true); } - /** - * Push indices onto the shared work queue and make sure enough workers are - * running. Safe to call while a previous run is still in flight — new items - * are picked up by idle workers, or fresh ones are spawned up to - * `parallelFiles()`. - */ + // Pushes indices onto the shared queue and ensures enough workers are running. 
+ // Safe to call mid-run: idle workers pick up new items, or fresh ones spawn + // up to parallelFiles(). private enqueue(indices: number[], isRetry: boolean) { if (indices.length === 0) return; @@ -467,8 +445,8 @@ export class AppComponent implements OnDestroy { if (cancelled) { this.cancelRequested = false; - // Mark anything still pending/translating as failed so the user - // keeps done/failed entries and can retry the interrupted ones. + // Mark still-pending/translating entries as failed so they remain + // retryable from the UI. this.fileStatuses.update((arr) => arr.map((s) => s.status === 'pending' || s.status === 'translating' @@ -548,8 +526,6 @@ export class AppComponent implements OnDestroy { }); } - // --- Downloads --- - downloadFile(f: FileStatus) { if (f.content) this.downloadBlob(f.content, f.outputName); } @@ -596,15 +572,11 @@ export class AppComponent implements OnDestroy { URL.revokeObjectURL(url); } - // --- Reset --- - reset() { this.files.set([]); this.clearRunState(true); } - // --- Helpers --- - private clearRunState(clearError: boolean) { this.workQueue = []; this.fileStatuses.set([]); @@ -620,7 +592,6 @@ export class AppComponent implements OnDestroy { return m ? `${m[1]}.${code}${m[2]}` : `${name}.${code}`; } - /** Look up the 2-letter code for the current target language. */ private targetLangCode(): string { const name = this.targetLang(); return this.languages.find((l) => l.name === name)?.code diff --git a/web/src/app/core/context-pass.ts b/web/src/app/core/context-pass.ts index 2ca2b1e..9f31cf5 100644 --- a/web/src/app/core/context-pass.ts +++ b/web/src/app/core/context-pass.ts @@ -1,15 +1,5 @@ -/** - * One-shot prepass: scan the whole file for cast, recurring terms, and tone - * notes before batched translation begins. - * - * Goal is to fix gendered-pronoun errors in languages like Arabic, where the - * model must pick masculine/feminine forms but English gives no signal. 
We - * send the full source text once, ask for a compact glossary, and inject the - * relevant slice into each batch's prompt. - * - * If the scan fails for any reason, callers get an empty FileContext and - * translation proceeds exactly as it did before. - */ +// One-shot prepass: scans the file once for cast/terms/register so every batch +// shares the same glossary. Fails silently to an empty FileContext. import { SubtitleBlock } from './srt-parser'; @@ -37,9 +27,7 @@ Rules: - Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/tone. - Leave a section empty (tags only) if nothing qualifies. Never omit a section.`; -// Rough cap on source text sent to the scan. Tuned so small-context models -// (4k-8k total) still have room for the system prompt, the output, and a -// safety margin. ~4 chars ≈ 1 token on Latin text. +// Sized so small-context models (4k-8k) still have room for prompt + output. export const SCAN_CHAR_BUDGET = 12_000; export const SCAN_MAX_TOKENS = 1500; @@ -68,13 +56,8 @@ export class FileContext { return !(this.register || this.characters.length || this.terms.length || this.notes.length); } - /** - * Return only the glossary slice relevant to this batch. - * - * Register and notes are always included (short, file-wide). Characters and - * terms are included only if their source form appears in the batch. - * Returns an empty string when there is nothing worth injecting. - */ + // Glossary slice scoped to names/terms present in this batch. Register and + // notes are file-wide and always included if set. renderForBatch(batch: SubtitleBlock[]): string { const text = batch.map((b) => b.text).join('\n'); const chars = this.characters.filter((h) => containsWord(text, h.source)); @@ -113,21 +96,14 @@ function containsWord(text: string, word: string): boolean { return re.test(text); } -/** - * Serialize subtitle text for the scan pass — no numbers, no timestamps. 
- * - * For large files we can't fit every block in the scan call, so we stride- - * sample evenly across the whole file. Sampling preserves each character's - * chance of appearing at least once in the glossary, regardless of where - * they're first introduced. - */ +// Stride-samples large files so characters introduced late still have a +// chance to land in the glossary. export function serializeForScan(blocks: SubtitleBlock[]): string { const totalChars = blocks.reduce((sum, b) => sum + b.text.length + 1, 0); if (totalChars <= SCAN_CHAR_BUDGET || blocks.length <= 1) { return blocks.map((b) => b.text).join('\n'); } - // Estimate how many blocks fit in the budget, then sample evenly. const takeN = Math.max(1, Math.floor((blocks.length * SCAN_CHAR_BUDGET) / totalChars)); const step = blocks.length / takeN; const sampled: SubtitleBlock[] = []; @@ -143,7 +119,7 @@ function stripBullet(line: string): string { return line.trim().replace(/^[-*•]\s*/, '').trim(); } -/** Parse the tagged response. Tolerates extra whitespace and bullet markers. */ +// Parse tagged response. Tolerates extra whitespace and bullet markers. export function parseContextResponse(text: string): FileContext { const sections: Record = {}; const src = text || ''; @@ -153,7 +129,6 @@ export function parseContextResponse(text: string): FileContext { sections[m[1].toLowerCase()] = m[2]; } - // Register is free-form but must be a single line; collapse any whitespace. const rawRegister = sections['register'] ?? 
''; const register = stripBullet(rawRegister.split(/\s+/).join(' ')); diff --git a/web/src/app/core/file-types.ts b/web/src/app/core/file-types.ts index 80a5c5b..08e83d5 100644 --- a/web/src/app/core/file-types.ts +++ b/web/src/app/core/file-types.ts @@ -3,15 +3,13 @@ import { SubtitleDocument } from './subtitle-formats/types'; export const SUBTITLE_EXTS = ['.srt', '.vtt', '.ass', '.ssa', '.sub', '.sbv']; export const SUBTITLE_ACCEPT = SUBTITLE_EXTS.join(','); -/** A file the user has picked but not yet translated. */ export interface UploadedFile { name: string; blockCount: number; - /** Parsed once at upload so we know the format before translation starts. */ + // Parsed at upload time so the format is known before translation starts. doc: SubtitleDocument; } -/** The live state of a single file during and after translation. */ export interface FileStatus { name: string; outputName: string; diff --git a/web/src/app/core/languages.ts b/web/src/app/core/languages.ts index 3baf3dc..34fa20c 100644 --- a/web/src/app/core/languages.ts +++ b/web/src/app/core/languages.ts @@ -1,14 +1,8 @@ export interface LanguageOption { - /** ISO 639-1 two-letter code — used in output filenames (e.g. movie.ar.srt). */ - code: string; - /** Display name shown in the UI. */ + code: string; // ISO 639-1, used in output filenames (e.g. movie.ar.srt) name: string; } -/** - * Subtitle translation languages, alphabetical by English name. - * Codes follow ISO 639-1. Default source is "Auto-detect" (empty), default target is "Arabic". 
- */ export const LANGUAGES: LanguageOption[] = [ { code: 'af', name: 'Afrikaans' }, { code: 'sq', name: 'Albanian' }, diff --git a/web/src/app/core/srt-parser.ts b/web/src/app/core/srt-parser.ts index 7d4c2c4..54d6183 100644 --- a/web/src/app/core/srt-parser.ts +++ b/web/src/app/core/srt-parser.ts @@ -11,11 +11,7 @@ export interface ValidationResult { const TIMESTAMP_RE = /^\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}$/; -/** - * Parse raw .srt content into SubtitleBlock array. - */ export function parseSrt(content: string): SubtitleBlock[] { - // Normalize line endings and strip BOM content = content.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); if (content.charCodeAt(0) === 0xfeff) { content = content.slice(1); @@ -42,9 +38,6 @@ export function parseSrt(content: string): SubtitleBlock[] { return blocks; } -/** - * Serialize SubtitleBlock array back to .srt file content. - */ export function serializeSrt(blocks: SubtitleBlock[]): string { return ( blocks.map((b) => `${b.number}\n${b.timestamp}\n${b.text}`).join('\n\n') + @@ -52,17 +45,12 @@ export function serializeSrt(blocks: SubtitleBlock[]): string { ); } -// Wire format sent to the LLM: number + text only. Timestamps are pure noise -// for the model — it echoes them back, and small models sometimes corrupt a -// digit. We strip them before sending and reattach from the original input. +// Wire format: number + text only. Timestamps are stripped before sending +// because small models sometimes corrupt them; callers reattach positionally. export function serializeLite(blocks: SubtitleBlock[]): string { return blocks.map((b) => `${b.number}\n${b.text}`).join('\n\n') + '\n'; } -/** - * Parse the wire-format response. Timestamps are left empty — callers reattach - * them positionally from the original batch. 
- */ export function parseLite(content: string): SubtitleBlock[] { content = content.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); if (content.charCodeAt(0) === 0xfeff) content = content.slice(1); @@ -83,9 +71,6 @@ export function parseLite(content: string): SubtitleBlock[] { return blocks; } -/** - * Split blocks into batches of the given size. - */ export function splitBatches( blocks: SubtitleBlock[], batchSize: number = 15 @@ -97,9 +82,6 @@ export function splitBatches( return batches; } -/** - * Validate that translated output matches input structure. - */ export function validateBatch( inputBlocks: SubtitleBlock[], outputBlocks: SubtitleBlock[] diff --git a/web/src/app/core/subtitle-formats/index.ts b/web/src/app/core/subtitle-formats/index.ts index 642e019..eb88e1b 100644 --- a/web/src/app/core/subtitle-formats/index.ts +++ b/web/src/app/core/subtitle-formats/index.ts @@ -18,11 +18,8 @@ const EXT_TO_FORMAT: Record = { sbv: 'sbv', }; -/** - * Parse any supported subtitle file via subsrt-ts. All formats normalize to - * SRT-shape blocks for the LLM; rebuild round-trips through subsrt-ts back - * into the original on-disk format. - */ +// Parses via subsrt-ts. All formats normalize to SRT-shape blocks; rebuild +// round-trips back through subsrt-ts into the original on-disk format. export function parseSubtitle(fileName: string, content: string): SubtitleDocument { const ext = fileExt(fileName); const format = EXT_TO_FORMAT[ext]; diff --git a/web/src/app/core/subtitle-formats/types.ts b/web/src/app/core/subtitle-formats/types.ts index a0864ca..a91e671 100644 --- a/web/src/app/core/subtitle-formats/types.ts +++ b/web/src/app/core/subtitle-formats/types.ts @@ -2,16 +2,8 @@ import { SubtitleBlock } from '../srt-parser'; export type SubtitleFormat = 'srt' | 'vtt' | 'ass' | 'ssa' | 'sbv' | 'sub'; -/** - * A file that has been parsed from one of the supported subtitle formats. 
- * - * `blocks` are always normalized into SRT-style shape — sequential numbers - * starting at 1 and `HH:MM:SS,mmm --> HH:MM:SS,mmm` timestamps — so the LLM - * always sees the same structure regardless of the source format. The - * original on-disk structure (headers, styles, per-cue metadata) is kept - * inside `rebuild`, which serializes the translated blocks back into the - * original format. - */ +// Parsed file. `blocks` use normalized SRT shape; `rebuild` serializes back +// to the original format, preserving headers/styles/per-cue metadata. export interface SubtitleDocument { format: SubtitleFormat; blocks: SubtitleBlock[]; diff --git a/web/src/app/core/time-tracker.ts b/web/src/app/core/time-tracker.ts index eb6e60b..7934fc0 100644 --- a/web/src/app/core/time-tracker.ts +++ b/web/src/app/core/time-tracker.ts @@ -9,11 +9,8 @@ interface ProgressItem { totalBatches?: number; } -/** - * Tracks elapsed time, per-file average, and ETA for a translation run. - * Owns its own ticker (updates every 500 ms while running) and exposes - * pre-formatted strings the template can render directly. - */ +// Elapsed / avg / ETA for a translation run. Owns its own 500ms ticker and +// exposes pre-formatted strings for the template. export class TimeTracker { private startMs = signal(0); private nowMs = signal(0); @@ -27,8 +24,6 @@ export class TimeTracker { private pendingCount: Signal, ) {} - // --- Raw numbers ------------------------------------------------------- - elapsedMs = computed(() => { this.nowMs(); // subscribe to ticks return this.running() @@ -42,14 +37,9 @@ export class TimeTracker { return done.reduce((s, f) => s + (f.timeMs ?? 0), 0) / done.length; }); - /** - * Remaining wall-clock time in ms, or null until we have any progress. - * - * Uses fractional file-equivalents: completed files count as 1, in-progress - * files contribute `currentBatch / totalBatches`. 
Dividing elapsed time by - * that fraction gives a rate we can extrapolate from — so ETA starts showing - * as soon as any file reports its first batch, including single-file runs. - */ + // Fractional file-equivalents: in-progress files contribute + // currentBatch/totalBatches, so ETA shows up as soon as any file reports a + // first batch (including single-file runs). etaMs = computed(() => { const done = this.doneFiles(); const inProgress = this.inProgressFiles(); @@ -71,8 +61,6 @@ export class TimeTracker { return (elapsed * (totalFiles - fractionalDone)) / fractionalDone; }); - // --- Formatted for templates ------------------------------------------ - elapsedFormatted = computed(() => this.formatMs(this.elapsedMs())); totalFormatted = computed(() => { @@ -90,9 +78,6 @@ export class TimeTracker { return ms === null || ms <= 0 ? '' : this.formatMs(ms); }); - // --- Control ---------------------------------------------------------- - - /** Start a fresh run: resets totals, captures the start time, starts ticking. */ begin(): void { const now = performance.now(); this.totalMs.set(0); @@ -102,10 +87,8 @@ export class TimeTracker { this.startTicker(); } - /** - * Continue an existing run after idle (e.g. retry-failed). Elapsed time - * picks up where the previous `finish()` left off instead of resetting to 0. - */ + // Continues after idle (e.g. retry-failed) so elapsed time picks up where + // finish() left off instead of resetting. resume(): void { const now = performance.now(); const prev = this.totalMs(); @@ -115,24 +98,20 @@ export class TimeTracker { this.startTicker(); } - /** Freeze the final elapsed time and stop ticking. */ finish(): void { this.totalMs.set(performance.now() - this.startMs()); this.running.set(false); this.stopTicker(); } - /** Clear the frozen total (used when the user resets the UI). */ reset(): void { this.totalMs.set(0); } - /** Stop the ticker — call from ngOnDestroy. 
*/ destroy(): void { this.stopTicker(); } - /** '42s', '3m 20s', '1h 5m'. Public so the template can format one-off values. */ formatMs(ms: number): string { const totalSec = Math.round(ms / 1000); if (totalSec < 60) return `${totalSec}s`; diff --git a/web/src/app/core/translation.service.ts b/web/src/app/core/translation.service.ts index cf33d38..c974289 100644 --- a/web/src/app/core/translation.service.ts +++ b/web/src/app/core/translation.service.ts @@ -24,7 +24,6 @@ export interface ProviderConfig { model: string; } -/** Sent to the caller every time a batch starts or finishes. */ export interface TranslationProgress { currentBatch: number; totalBatches: number; @@ -50,11 +49,6 @@ type ChatResponse = { choices: { message: { content: string } }[] }; export class TranslationService { constructor(private http: HttpClient) {} - /** - * Translate a parsed subtitle document. The document's blocks are translated - * in batches and then stitched back together using the document's own - * `rebuild`, which preserves the source file's original format. - */ async translateDocument( doc: SubtitleDocument, sourceLang: string, @@ -78,7 +72,6 @@ export class TranslationService { const batches = splitBatches(doc.blocks, batchSize); const results: SubtitleBlock[][] = new Array(batches.length); - // Simple worker pool: each worker pulls the next index until none left. let nextIdx = 0; let completed = 0; const emit = () => onProgress?.({ @@ -108,10 +101,6 @@ export class TranslationService { return doc.rebuild(translated); } - // --------------------------------------------------------------------- - // Prepass: one scan call for cast, recurring terms, and tone notes. 
- // --------------------------------------------------------------------- - private async extractFileContext( blocks: SubtitleBlock[], sourceLang: string, @@ -137,10 +126,6 @@ export class TranslationService { } } - // --------------------------------------------------------------------- - // Per-batch translation with retry + recursive split on validation failure - // --------------------------------------------------------------------- - private async translateBatch( inputBlocks: SubtitleBlock[], sourceLang: string, @@ -153,9 +138,8 @@ export class TranslationService { throwIfCancelled(cancelSignal); const canSplit = inputBlocks.length > 1; - // Give splittable batches fewer retries before halving — persistent count - // mismatches almost always resolve when we hand the model fewer similar - // blocks. Single-block batches can't be split, so let them exhaust. + // Splittable batches give up early — halving resolves persistent count + // mismatches faster than more retries on the same payload. const attempts = canSplit ? ATTEMPTS_BEFORE_SPLIT : maxRetries; const firstBlockNum = inputBlocks[0].number; @@ -173,7 +157,7 @@ export class TranslationService { Math.max(inputBlocks.length, 1) * 120, cancelSignal, ); let output = parseLite(stripMarkdownFences(raw)); - // Reattach timestamps from the original input positionally. + // Wire format strips timestamps; reattach positionally. if (output.length === inputBlocks.length) { output = output.map((b, i) => ({ number: inputBlocks[i].number, @@ -201,12 +185,10 @@ export class TranslationService { lastError, ); - // Fail fast on non-retryable errors (bad key, bad request, etc.) if (!isRetryableStatus(status)) { throw new Error(`HTTP ${status}: ${lastError} (block ${firstBlockNum})`); } - // Rate-limited: exponential backoff before retrying. 
if (status === 429 && attempt < attempts) { const delay = 2 ** attempt * 1000; console.warn(`Rate limited — waiting ${delay / 1000}s...`); @@ -215,15 +197,13 @@ export class TranslationService { } } - // Small linear backoff between other retries (1s, 2s, 3s cap). if (attempt < attempts) { await sleep(Math.min(attempt, 3) * 1000, cancelSignal); } } - // Attempts exhausted. If we hit validation errors and can still split, - // halve and retry each half independently. Recurse until single-block - // batches, which can't have count mismatches. + // Recursive split: halve on persistent validation failure. Terminates at + // N=1 where count mismatch is impossible. if (hitValidationFailure && canSplit) { const mid = Math.floor(inputBlocks.length / 2); const left = inputBlocks.slice(0, mid); @@ -231,8 +211,7 @@ export class TranslationService { console.warn( `Batch splitting ${inputBlocks.length} -> ${left.length} + ${right.length} blocks`, ); - // Sequential: parallel halves would oversubscribe the worker pool slot - // and starve other batches. + // Sequential: parallel halves would oversubscribe the worker pool slot. const leftResult = await this.translateBatch( left, sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, ); @@ -247,10 +226,6 @@ export class TranslationService { ); } - // --------------------------------------------------------------------- - // Chat HTTP call (shared by scan + translation) - // --------------------------------------------------------------------- - private async callChat( systemPrompt: string, userMessage: string, @@ -320,7 +295,6 @@ export class TranslationService { }); } - /** Pull a human-readable message out of whatever shape the provider returned. */ private extractServerMessage(err: unknown): string { if (!(err instanceof HttpErrorResponse) || !err.error) return ''; const body = Array.isArray(err.error) ? 
err.error[0] : err.error; @@ -340,13 +314,9 @@ export class TranslationService { } -// --------------------------------------------------------------------------- -// HELPERS -// --------------------------------------------------------------------------- - const CRED_QUERY_PARAMS = ['key', 'api_key', 'apikey', 'access_token']; -/** Drop credential query params like `?key=...` — we authenticate via header. */ +// We authenticate via header, so strip credential query params before sending. function sanitizeApiUrl(url: string): string { const trimmed = (url ?? '').trim(); if (!trimmed) return trimmed; @@ -359,7 +329,6 @@ function sanitizeApiUrl(url: string): string { } } -/** Strip whitespace, surrounding quotes, and any accidental `Bearer ` prefix. */ function sanitizeApiKey(key: string): string { let k = (key ?? '').trim(); if ((k.startsWith('"') && k.endsWith('"')) || (k.startsWith("'") && k.endsWith("'"))) { @@ -375,7 +344,7 @@ function buildHeaders(apiKey: string): Record { return headers; } -/** LLMs sometimes wrap output in ```...``` despite being told not to. */ +// LLMs sometimes wrap output in ```...``` even when told not to. function stripMarkdownFences(text: string): string { let t = text.trim(); if (t.startsWith('```')) { @@ -384,7 +353,6 @@ function stripMarkdownFences(text: string): string { return t; } -/** Retry on timeout, rate-limit, 5xx, or network errors. Everything else is fatal. 
*/ function isRetryableStatus(status: number): boolean { return status === 0 || status === 408 || status === 429 || status >= 500; } From 947aa06908a804972f1a2854610d4845faf3bd9b Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 21:12:16 +0300 Subject: [PATCH 14/15] Web: Bump version to 0.4.0 --- web/package-lock.json | 4 ++-- web/package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/web/package-lock.json b/web/package-lock.json index d235689..ba2e745 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -1,12 +1,12 @@ { "name": "web", - "version": "0.3.0", + "version": "0.4.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "web", - "version": "0.3.0", + "version": "0.4.0", "dependencies": { "@angular/animations": "^19.1.0", "@angular/common": "^19.1.0", diff --git a/web/package.json b/web/package.json index 86e0103..4a1a9e9 100644 --- a/web/package.json +++ b/web/package.json @@ -1,6 +1,6 @@ { "name": "web", - "version": "0.3.0", + "version": "0.4.0", "scripts": { "ng": "ng", "start": "ng serve", From eb20dcee94ccbd0db8942aa4a34939660e2b12cb Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Tue, 21 Apr 2026 21:13:15 +0300 Subject: [PATCH 15/15] CLI: Bump version to 0.4.0 --- cli/pyproject.toml | 2 +- cli/translora.py | 2 +- cli/uv.lock | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/pyproject.toml b/cli/pyproject.toml index 81ab99c..4ecd425 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "translora" -version = "0.3.1" +version = "0.4.0" description = "AI-powered subtitle translator with batched LLM calls and block-level validation." 
readme = "../README.md" requires-python = ">=3.10" diff --git a/cli/translora.py b/cli/translora.py index 8758df9..1e42605 100644 --- a/cli/translora.py +++ b/cli/translora.py @@ -15,7 +15,7 @@ from core.translator import translate_file_async from core.live_status import Colors, LiveLine, Ticker -__version__ = "0.3.1" +__version__ = "0.4.0" SUBTITLE_EXTS = {".srt", ".vtt", ".ass", ".ssa", ".sub", ".sbv"} diff --git a/cli/uv.lock b/cli/uv.lock index acfb38d..aed67ef 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -211,7 +211,7 @@ wheels = [ [[package]] name = "translora" -version = "0.3.1" +version = "0.4.0" source = { virtual = "." } dependencies = [ { name = "httpx" },