From a3553aafd8d25b3c1139cb6fb3d14d90e6d1551e Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Sat, 25 Apr 2026 23:12:47 +0300 Subject: [PATCH 1/6] Web: Enhance translation quality features - Added new options for context overlap, scan budget, review, and speaker attribution. - Improved LLM calls for consistent translation quality with refined context handling. - Introduced UI elements for configuring translation quality settings. - Enabled more accurate speaker attribution to fix gender-specific translation errors. - Implemented a conservative review process for first-pass translations to ensure fidelity. --- web/src/app/app.component.html | 28 +++ web/src/app/app.component.scss | 28 +++ web/src/app/app.component.ts | 22 ++ web/src/app/core/context-pass.ts | 316 +++++++++++++++++++----- web/src/app/core/translation-prompt.ts | 46 +++- web/src/app/core/translation.service.ts | 142 ++++++++++- 6 files changed, 503 insertions(+), 79 deletions(-) diff --git a/web/src/app/app.component.html b/web/src/app/app.component.html index ff48ae2..326d572 100644 --- a/web/src/app/app.component.html +++ b/web/src/app/app.component.html @@ -239,6 +239,34 @@

Advanced

Attempts before failing.

+ +
+ + +

Chars sent to the prepass. Lower for tight-context local models, raise for full-file scans on large-context cloud models.

+
+ +
+ + +

Previous-batch blocks shown as read-only context. 0 disables.

+
+ + +
+ +

One extra LLM call per batch fixing gender/number/consistency. Doubles cost — disable on metered providers.

+
+ +
+ +

One small call per ambiguous scene to fix cross-gender addressee slips.

diff --git a/web/src/app/app.component.scss b/web/src/app/app.component.scss index 9fd2866..2b92182 100644 --- a/web/src/app/app.component.scss +++ b/web/src/app/app.component.scss @@ -485,6 +485,34 @@ color: var(--text-muted); } +.field-toggle { + display: flex; + flex-direction: column; + gap: 0.25rem; + padding-top: 0.15rem; + + .toggle { + display: inline-flex; + align-items: center; + gap: 0.55rem; + font-size: 0.85rem; + font-weight: 600; + color: var(--text-primary); + cursor: pointer; + } + + .toggle input[type='checkbox'] { + width: 1rem; + height: 1rem; + accent-color: var(--color-primary-strong, var(--border-accent)); + cursor: pointer; + } + + .field-hint { + margin-left: 1.55rem; + } +} + .lang-hints { display: grid; grid-template-columns: minmax(0, 1fr) auto minmax(0, 1fr); diff --git a/web/src/app/app.component.ts b/web/src/app/app.component.ts index a3d8d2c..e0e890b 100644 --- a/web/src/app/app.component.ts +++ b/web/src/app/app.component.ts @@ -8,6 +8,10 @@ import { DEFAULT_BATCH_SIZE, DEFAULT_CONCURRENCY, DEFAULT_PARALLEL_FILES, + DEFAULT_CONTEXT_OVERLAP, + DEFAULT_SCAN_BUDGET, + DEFAULT_REFINE_ATTRIBUTION, + DEFAULT_REVIEW, TranslationCancelledError, } from './core/translation.service'; import { parseSubtitle } from './core/subtitle-formats'; @@ -28,6 +32,10 @@ const DEFAULTS = { concurrency: DEFAULT_CONCURRENCY, parallelFiles: DEFAULT_PARALLEL_FILES, maxRetries: DEFAULT_MAX_RETRIES, + contextOverlap: DEFAULT_CONTEXT_OVERLAP, + scanBudget: DEFAULT_SCAN_BUDGET, + refineAttribution: DEFAULT_REFINE_ATTRIBUTION, + review: DEFAULT_REVIEW, }; @Component({ @@ -58,6 +66,10 @@ export class AppComponent implements OnDestroy { batchSize = signal(DEFAULTS.batchSize); parallelFiles = signal(DEFAULTS.parallelFiles); maxRetries = signal(DEFAULTS.maxRetries); + contextOverlap = signal(DEFAULTS.contextOverlap); + scanBudget = signal(DEFAULTS.scanBudget); + refineAttribution = signal(DEFAULTS.refineAttribution); + review = signal(DEFAULTS.review); theme = 
signal<'light' | 'dark'>('light'); @@ -330,6 +342,10 @@ export class AppComponent implements OnDestroy { this.concurrency.set(this.currentPreset().defaultConcurrency); this.parallelFiles.set(DEFAULTS.parallelFiles); this.maxRetries.set(DEFAULTS.maxRetries); + this.contextOverlap.set(DEFAULTS.contextOverlap); + this.scanBudget.set(DEFAULTS.scanBudget); + this.refineAttribution.set(DEFAULTS.refineAttribution); + this.review.set(DEFAULTS.review); } swapLanguages() { @@ -493,6 +509,12 @@ export class AppComponent implements OnDestroy { }); }, cancelSignal, + { + contextOverlap: this.contextOverlap(), + scanBudget: this.scanBudget(), + refineAttribution: this.refineAttribution(), + review: this.review(), + }, ); if (cancelSignal.aborted || this.cancelRequested) return; diff --git a/web/src/app/core/context-pass.ts b/web/src/app/core/context-pass.ts index 9f31cf5..a881012 100644 --- a/web/src/app/core/context-pass.ts +++ b/web/src/app/core/context-pass.ts @@ -1,14 +1,16 @@ -// One-shot prepass: scans the file once for cast/terms/register so every batch -// shares the same glossary. Fails silently to an empty FileContext. +// One-shot prepass: scans the file once for cast/terms/scenes/register so every +// batch shares the same glossary. Fails silently to an empty FileContext. import { SubtitleBlock } from './srt-parser'; export const CONTEXT_SYSTEM_PROMPT = `You analyze a subtitle file before it is translated. Return a compact glossary for the translator to use when picking correct pronouns, consistent names, and a single consistent register. -Your reply MUST start with \`\` and MUST contain all four sections below, in this exact order, with no other text before, between, or after them. No commentary. No code fences. No explanations. Tags only. +Input blocks are prefixed with their block number as \`[N] text\`. + +Reply with all five sections below in this exact order. No commentary, no fences — tags only. 
-ONE LINE describing the target-language variant and formality the translator should use for the ENTIRE file. +ONE LINE describing the target-language variant and formality. NAME => TARGET_NAME | GENDER @@ -16,20 +18,27 @@ NAME => TARGET_NAME | GENDER SOURCE => TARGET + +START-END => description that NAMES the characters involved + - NOTE Rules: -- The line names the specific target-language variant and formality (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Simplified Mandarin, neutral", "Japanese, polite です/ます form"). Pick ONE and commit to it for the whole file. Base the choice on the source's tone; default to the standard written form of the target language unless the source is clearly colloquial. -- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal at all. -- TARGET_NAME is how the character's name should appear in the target language (transliterated or localized). -- Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/tone. +- : name the exact target variant (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Japanese, polite です/ます form"). Pick one for the whole file. +- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal. +- TARGET_NAME is how the character's name should appear in the target language. +- : every ≥3-block stretch of dialogue between named characters. Name the characters explicitly using the names from so the translator can apply the right gender per range. Ranges may touch but must not overlap. +- Example: \`105-119 => Maria reassures Alex about the interview\` (use the actual names from YOUR section). +- Include up to 20 characters, 10 terms, 40 scenes, 4 notes. - Leave a section empty (tags only) if nothing qualifies. Never omit a section.`; -// Sized so small-context models (4k-8k) still have room for prompt + output. 
-export const SCAN_CHAR_BUDGET = 12_000; -export const SCAN_MAX_TOKENS = 1500; +export const ATTRIBUTION_SYSTEM_PROMPT = `You identify the speaker of each subtitle line in a short scene. Given a character list and a block-numbered scene excerpt (\`[N] text\`), reply with exactly one line per input block as \`N=SpeakerName\`. SpeakerName MUST be one of the listed characters or the literal "unknown". No commentary, no fences.`; + +export const SCAN_MAX_TOKENS = 3000; +const MIN_NAME_LEN = 3; +const ATTRIB_MIN_BLOCKS = 3; export type Gender = 'male' | 'female' | 'unknown'; @@ -44,97 +53,193 @@ export interface TermHint { target: string; } +export interface SceneHint { + start: number; + end: number; + description: string; + participants: string[]; + attribution: Record; // Per-block speaker map (block_number -> character source name). +} + export class FileContext { constructor( public register = '', public characters: CharacterHint[] = [], public terms: TermHint[] = [], + public scenes: SceneHint[] = [], public notes: string[] = [], ) {} isEmpty(): boolean { - return !(this.register || this.characters.length || this.terms.length || this.notes.length); + return !(this.register || this.characters.length || this.terms.length + || this.scenes.length || this.notes.length); } - // Glossary slice scoped to names/terms present in this batch. Register and - // notes are file-wide and always included if set. + // Glossary slice scoped to this batch. Register/notes are file-wide. 
renderForBatch(batch: SubtitleBlock[]): string { const text = batch.map((b) => b.text).join('\n'); - const chars = this.characters.filter((h) => containsWord(text, h.source)); - const terms = this.terms.filter((h) => containsWord(text, h.source)); - if (!this.register && !chars.length && !terms.length && !this.notes.length) { + const scenes = scenesOverlapping(this.scenes, batch); + // Include characters named in the batch AND scene participants — the + // latter covers speakers who address each other as "you" without + // vocatives, so the translator still learns their gender. + const sceneNames = new Set(scenes.flatMap((s) => s.participants)); + const chars = this.characters.filter( + (h) => findWord(text, h.source) >= 0 || sceneNames.has(h.source), + ); + const terms = this.terms.filter((h) => findWord(text, h.source) >= 0); + if (!this.register && !chars.length && !terms.length && !scenes.length && !this.notes.length) { return ''; } + const genderBy = new Map(this.characters.map((h) => [h.source.toLowerCase(), h.gender])); const parts: string[] = []; if (this.register) { parts.push(`Target register: ${this.register} (use consistently across every block)`); } if (chars.length) { - const lines = chars.map((h) => `- ${h.source} => ${h.target} (${h.gender})`); - parts.push('Characters:\n' + lines.join('\n')); + parts.push('Characters:\n' + chars.map((h) => `- ${h.source} => ${h.target} (${h.gender})`).join('\n')); } if (terms.length) { - const lines = terms.map((h) => `- ${h.source} => ${h.target}`); - parts.push('Terms:\n' + lines.join('\n')); + parts.push('Terms:\n' + terms.map((h) => `- ${h.source} => ${h.target}`).join('\n')); + } + if (scenes.length) { + parts.push(renderScenes(scenes, genderBy)); } if (this.notes.length) { - const lines = this.notes.slice(0, 4).map((n) => `- ${n}`); - parts.push('Notes:\n' + lines.join('\n')); + parts.push('Notes:\n' + this.notes.slice(0, 4).map((n) => `- ${n}`).join('\n')); } return parts.join('\n\n'); } } -function 
escapeRegExp(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +function scenesOverlapping(scenes: SceneHint[], batch: SubtitleBlock[]): SceneHint[] { + if (!scenes.length || !batch.length) return []; + const first = batch[0].number; + const last = batch[batch.length - 1].number; + return scenes.filter((s) => s.end >= first && s.start <= last); +} + +function genderMark(g: Gender | undefined): string { + return g === 'male' ? 'M' : g === 'female' ? 'F' : ''; } -function containsWord(text: string, word: string): boolean { - if (!word) return false; - const re = new RegExp(`(?): string { + const lines: string[] = []; + for (const s of scenes) { + const tagged = s.participants.map((n) => { + const mark = genderMark(genderBy.get(n.toLowerCase())); + return mark ? `${n} (${mark})` : n; + }).join(', '); + const prefix = `- Blocks ${s.start}-${s.end}:`; + lines.push(tagged ? `${prefix} [${tagged}] — ${s.description}` : `${prefix} ${s.description}`); + const nums = Object.keys(s.attribution).map(Number).sort((a, b) => a - b); + if (nums.length) { + lines.push(' speakers: ' + nums.map((n) => `${n}=${s.attribution[n]}`).join(' ')); + } + } + return ( + "Scene guidance — each entry applies ONLY to its listed block range. " + + "Participants and genders in [brackets]; a 'speakers:' line names the " + + "speaker per block so you pick the right gender for the ADDRESSEE:\n" + + lines.join('\n') + ); } -// Stride-samples large files so characters introduced late still have a -// chance to land in the glossary. -export function serializeForScan(blocks: SubtitleBlock[]): string { - const totalChars = blocks.reduce((sum, b) => sum + b.text.length + 1, 0); - if (totalChars <= SCAN_CHAR_BUDGET || blocks.length <= 1) { - return blocks.map((b) => b.text).join('\n'); +// Case-insensitive whole-word search with Unicode-aware boundaries. +// Works for Latin, Arabic, CJK, etc. Returns first match index or -1. 
+function findWord(text: string, word: string): number { + if (!text || !word) return -1; + const haystack = text.toLowerCase(); + const needle = word.toLowerCase(); + const nlen = needle.length; + let i = 0; + while (i <= haystack.length - nlen) { + const j = haystack.indexOf(needle, i); + if (j < 0) return -1; + const before = j > 0 ? text[j - 1] : ''; + const after = j + nlen < text.length ? text[j + nlen] : ''; + if (!extendsWord(before) && !extendsWord(after)) return j; + i = j + 1; } + return -1; +} - const takeN = Math.max(1, Math.floor((blocks.length * SCAN_CHAR_BUDGET) / totalChars)); +function extendsWord(ch: string): boolean { + if (!ch) return false; + if (ch === '_') return true; + return /\p{L}|\p{N}/u.test(ch); +} + +function detectParticipants(text: string, characters: CharacterHint[]): string[] { + // Match source AND target forms so descriptions in the target language + // still resolve to the canonical source name. + const aliases: Array<{ alias: string; name: string }> = []; + for (const h of characters) { + if (h.source.length >= MIN_NAME_LEN) aliases.push({ alias: h.source, name: h.source }); + if (h.target !== h.source && h.target.length >= MIN_NAME_LEN) { + aliases.push({ alias: h.target, name: h.source }); + } + } + aliases.sort((a, b) => b.alias.length - a.alias.length); + const firstAt = new Map(); + for (const { alias, name } of aliases) { + if (firstAt.has(name)) continue; + const idx = findWord(text, alias); + if (idx >= 0) firstAt.set(name, idx); + } + return [...firstAt.entries()].sort((a, b) => a[1] - b[1]).map(([n]) => n); +} + +function formatScanLine(b: SubtitleBlock): string { + return `[${b.number}] ${b.text.replace(/\n/g, ' ')}`; +} + +// Stride-samples large files so characters introduced late still land in +// the glossary. 
+export function serializeForScan( + blocks: SubtitleBlock[], + charBudget: number, +): string { + const total = blocks.reduce((sum, b) => sum + formatScanLine(b).length + 1, 0); + if (total <= charBudget || blocks.length <= 1) { + return blocks.map(formatScanLine).join('\n'); + } + const takeN = Math.max(1, Math.floor((blocks.length * charBudget) / total)); const step = blocks.length / takeN; const sampled: SubtitleBlock[] = []; - for (let i = 0; i < takeN; i++) { - sampled.push(blocks[Math.floor(i * step)]); - } - return sampled.map((b) => b.text).join('\n'); + for (let i = 0; i < takeN; i++) sampled.push(blocks[Math.floor(i * step)]); + return sampled.map(formatScanLine).join('\n'); } -const SECTION_RE = /<(register|characters|terms|notes)>\s*([\s\S]*?)\s*<\/\1>/gi; +// Closing tag optional so a truncated reply still parses. +const SECTION_RE = + /<(register|characters|terms|scenes|notes)>\s*([\s\S]*?)\s*(?=<\/\1>|<(?:register|characters|terms|scenes|notes)>|$)/gi; +const SCENE_RANGE_RE = /^(\d+)\s*(?:-\s*(\d+))?$/; +const ATTRIB_LINE_RE = /^\s*(\d+)\s*=\s*(.+?)\s*$/; function stripBullet(line: string): string { return line.trim().replace(/^[-*•]\s*/, '').trim(); } -// Parse tagged response. Tolerates extra whitespace and bullet markers. +function splitOnce(s: string, sep: string): [string, string] { + const i = s.indexOf(sep); + return i < 0 ? [s, ''] : [s.slice(0, i), s.slice(i + sep.length)]; +} + +// Parse the tagged response. Tolerates whitespace and bullet markers. export function parseContextResponse(text: string): FileContext { const sections: Record = {}; - const src = text || ''; SECTION_RE.lastIndex = 0; let m: RegExpExecArray | null; - while ((m = SECTION_RE.exec(src)) !== null) { + while ((m = SECTION_RE.exec(text || '')) !== null) { sections[m[1].toLowerCase()] = m[2]; } - const rawRegister = sections['register'] ?? ''; - const register = stripBullet(rawRegister.split(/\s+/).join(' ')); + const register = stripBullet((sections['register'] ?? 
'').split(/\s+/).join(' ')); const characters: CharacterHint[] = []; - for (const rawLine of (sections['characters'] ?? '').split('\n')) { - const line = stripBullet(rawLine); + for (const raw of (sections['characters'] ?? '').split('\n')) { + const line = stripBullet(raw); if (!line || !line.includes('=>')) continue; const [srcPart, restPart] = splitOnce(line, '=>'); let tgt: string, gender: string; @@ -146,29 +251,42 @@ export function parseContextResponse(text: string): FileContext { tgt = restPart.trim(); gender = 'unknown'; } - const normalizedGender: Gender = - gender === 'male' || gender === 'female' ? gender : 'unknown'; - const src2 = srcPart.trim(); - if (src2 && tgt) { - characters.push({ source: src2, target: tgt, gender: normalizedGender }); - } + const g: Gender = gender === 'male' || gender === 'female' ? gender : 'unknown'; + const src = srcPart.trim(); + if (src && tgt) characters.push({ source: src, target: tgt, gender: g }); } const terms: TermHint[] = []; - for (const rawLine of (sections['terms'] ?? '').split('\n')) { - const line = stripBullet(rawLine); + for (const raw of (sections['terms'] ?? '').split('\n')) { + const line = stripBullet(raw); if (!line || !line.includes('=>')) continue; const [srcPart, tgtPart] = splitOnce(line, '=>'); - const src2 = srcPart.trim(); + const src = srcPart.trim(); const tgt = tgtPart.trim(); - if (src2 && tgt) { - terms.push({ source: src2, target: tgt }); - } + if (src && tgt) terms.push({ source: src, target: tgt }); + } + + const scenes: SceneHint[] = []; + for (const raw of (sections['scenes'] ?? '').split('\n')) { + const line = stripBullet(raw); + if (!line || !line.includes('=>')) continue; + const [rangePart, descPart] = splitOnce(line, '=>'); + const desc = descPart.trim(); + const rm = SCENE_RANGE_RE.exec(rangePart.trim()); + if (!desc || !rm) continue; + let start = parseInt(rm[1], 10); + let end = rm[2] ? 
parseInt(rm[2], 10) : start; + if (end < start) [start, end] = [end, start]; + scenes.push({ + start, end, description: desc, + participants: detectParticipants(desc, characters), + attribution: {}, + }); } const notes: string[] = []; - for (const rawLine of (sections['notes'] ?? '').split('\n')) { - const line = stripBullet(rawLine); + for (const raw of (sections['notes'] ?? '').split('\n')) { + const line = stripBullet(raw); if (line) notes.push(line); } @@ -176,12 +294,78 @@ export function parseContextResponse(text: string): FileContext { register, characters.slice(0, 20), terms.slice(0, 10), + scenes.slice(0, 80), notes.slice(0, 4), ); } -function splitOnce(s: string, sep: string): [string, string] { - const i = s.indexOf(sep); - if (i < 0) return [s, '']; - return [s.slice(0, i), s.slice(i + sep.length)]; +// Reconcile scene participants with what's in the source blocks. Block-text +// names are primary truth: description-named participants are kept only if +// grounded in the text; missed block-text names are appended. 
+export function enrichScenesWithBlockText( + context: FileContext, + blocks: SubtitleBlock[], +): FileContext { + if (!context.scenes.length || !context.characters.length) return context; + const byNum = new Map(blocks.map((b) => [b.number, b])); + const enriched = context.scenes.map((scene) => { + const parts: string[] = []; + for (let n = scene.start; n <= scene.end; n++) { + const b = byNum.get(n); + if (b) parts.push(b.text); + } + const inText = detectParticipants(parts.join('\n'), context.characters); + const inTextSet = new Set(inText); + const kept = scene.participants.filter((p) => inTextSet.has(p)); + const seen = new Set(kept); + for (const name of inText) { + if (!seen.has(name)) { + kept.push(name); + seen.add(name); + } + } + return { ...scene, participants: kept }; + }); + return new FileContext( + context.register, context.characters, context.terms, enriched, context.notes, + ); +} + +export function needsAttribution(scene: SceneHint): boolean { + return (scene.end - scene.start + 1) >= ATTRIB_MIN_BLOCKS && scene.participants.length >= 1; +} + +export function buildAttributionUserMessage( + scene: SceneHint, + blocks: SubtitleBlock[], + characters: CharacterHint[], +): string { + const byNum = new Map(blocks.map((b) => [b.number, b])); + const present = new Set(scene.participants); + const roster = characters + .filter((h) => present.has(h.source)) + .map((h) => `- ${h.source} (${genderMark(h.gender) || '?'})`) + .join('\n'); + const sceneLines: string[] = []; + for (let n = scene.start; n <= scene.end; n++) { + const b = byNum.get(n); + if (b) sceneLines.push(`[${n}] ${b.text.replace(/\n/g, ' ')}`); + } + return `Characters:\n${roster}\n\nScene:\n${sceneLines.join('\n')}`; +} + +export function parseAttributionResponse( + raw: string, scene: SceneHint, characters: CharacterHint[], +): Record { + const valid = new Set(characters.map((h) => h.source)); + valid.add('unknown'); + const out: Record = {}; + for (const line of (raw || '').split('\n')) { 
+ const m = ATTRIB_LINE_RE.exec(line); + if (!m) continue; + const n = parseInt(m[1], 10); + const name = m[2].trim().replace(/^["']|["']$/g, ''); + if (n >= scene.start && n <= scene.end && valid.has(name)) out[n] = name; + } + return out; } diff --git a/web/src/app/core/translation-prompt.ts b/web/src/app/core/translation-prompt.ts index 181327f..b762c5f 100644 --- a/web/src/app/core/translation-prompt.ts +++ b/web/src/app/core/translation-prompt.ts @@ -1,3 +1,5 @@ +import { SubtitleBlock, serializeLite } from './srt-parser'; + export const SYSTEM_PROMPT = `You are a subtitle translator. You will receive numbered subtitle blocks (no timestamps) and translate them. Input format for each block: @@ -12,8 +14,11 @@ RULES (violating any = corrupt file): - Translate each block independently — never combine split sentences. - Translate faithfully: profanity, slurs, slang — match the original register. - Conversational tone, concise — must fit the original timing. -- If a glossary is provided, use each character's listed gender when choosing pronouns and verb forms in the target language, and use the listed target-language name consistently. -- Use ONE consistent register and variant of the target language across every block. Do not switch dialects or formality between batches. If the target language has a standard written form (e.g., Modern Standard Arabic), use it by default unless the source is clearly colloquial. +- If a glossary is provided, use each character's listed gender for pronouns/verb forms, and the listed target-language name consistently. +- "Scene guidance" entries apply PER BLOCK RANGE only. Match the addressee's gender (not just the speaker's). For exactly-two referents addressed together, use the target's dual form if it has one. +- A \`speakers:\` line (e.g. \`120=Alice 121=Alice 122=Bob\`) names the speaker per block. The ADDRESSEE is usually the other named participant — use the addressee's gender (from [brackets]) for second-person forms. 
+- "Previous context" blocks (if shown) are read-only — infer speaker/addressee from them, do NOT translate or include them. +- Use ONE consistent register and variant of the target language across every block. If the target language has a standard written form (e.g. Modern Standard Arabic), use it unless the source is clearly colloquial. DO NOT TRANSLATE (copy verbatim): - HTML tags, music symbols, formatting tags (\\N, {\\an8}) @@ -23,17 +28,50 @@ SHORT BLOCKS like "Oh!", "No!", "Hmm." are the #1 cause of missing blocks. Trans Output ONLY the translated .srt blocks. No commentary, no markdown fences.`; +export const REVIEW_SYSTEM_PROMPT = `You are a conservative subtitle translation reviewer. You receive a glossary, source blocks, and a first-pass translation in \`\\ntext\` wire format. + +DEFAULT: output the first-pass UNCHANGED. Only fix clear violations of the glossary: +- Wrong addressee gender (pronouns, verb conjugation, adjective ending, honorific level) when the glossary unambiguously names the addressee's gender. +- Character name spelled differently from the target form in the glossary. +- Dual/plural/singular agreement when the glossary explicitly flags the count. + +If uncertain, keep the block verbatim. Do NOT rephrase, restyle, or "polish". Same number of blocks, same block numbers, same line-count per block. + +Output: same wire format, one blank line between blocks. ALL blocks. No commentary, no fences.`; + export function buildUserMessage( sourceLang: string, targetLang: string, srtContent: string, glossary?: string, + prevTail: SubtitleBlock[] = [], ): string { const header = sourceLang ? 
`Translate from ${sourceLang} to ${targetLang}:` : `Translate to ${targetLang}:`; + const sections: string[] = []; if (glossary && glossary.trim()) { - return `Glossary for this scene:\n${glossary}\n\n${header}\n\n${srtContent}`; + sections.push(`Glossary for this scene:\n${glossary}`); } - return `${header}\n\n${srtContent}`; + if (prevTail.length) { + const lines = prevTail + .map((b) => ` [prev #${b.number}] ${b.text.replace(/\n/g, ' ')}`) + .join('\n'); + sections.push('Previous context (read-only, do NOT translate or output):\n' + lines); + } + sections.push(`${header}\n\n${srtContent}`); + return sections.join('\n\n'); +} + +export function buildReviewUserMessage( + batch: SubtitleBlock[], + firstPass: SubtitleBlock[], + glossary: string, +): string { + return ( + `Glossary:\n${glossary}\n\n` + + `Source blocks:\n${serializeLite(batch)}\n\n` + + `First-pass translation:\n${serializeLite(firstPass)}\n\n` + + 'Output the corrected translation (same wire format):' + ); } diff --git a/web/src/app/core/translation.service.ts b/web/src/app/core/translation.service.ts index d5e1923..70b5d29 100644 --- a/web/src/app/core/translation.service.ts +++ b/web/src/app/core/translation.service.ts @@ -9,11 +9,21 @@ import { validateBatch, } from './srt-parser'; import { SubtitleDocument } from './subtitle-formats/types'; -import { SYSTEM_PROMPT, buildUserMessage } from './translation-prompt'; import { + REVIEW_SYSTEM_PROMPT, + SYSTEM_PROMPT, + buildReviewUserMessage, + buildUserMessage, +} from './translation-prompt'; +import { + ATTRIBUTION_SYSTEM_PROMPT, CONTEXT_SYSTEM_PROMPT, FileContext, SCAN_MAX_TOKENS, + buildAttributionUserMessage, + enrichScenesWithBlockText, + needsAttribution, + parseAttributionResponse, parseContextResponse, serializeForScan, } from './context-pass'; @@ -40,6 +50,17 @@ export const DEFAULT_MAX_RETRIES = 5; export const DEFAULT_BATCH_SIZE = 10; export const DEFAULT_CONCURRENCY = 5; export const DEFAULT_PARALLEL_FILES = 1; +export const 
DEFAULT_CONTEXT_OVERLAP = 2; +export const DEFAULT_REVIEW = true; +export const DEFAULT_REFINE_ATTRIBUTION = true; +export const DEFAULT_SCAN_BUDGET = 24_000; + +export interface QualityOptions { + contextOverlap?: number; + scanBudget?: number; + refineAttribution?: boolean; + review?: boolean; +} const ATTEMPTS_BEFORE_SPLIT = 2; @@ -59,15 +80,26 @@ export class TranslationService { maxRetries = DEFAULT_MAX_RETRIES, onProgress?: (p: TranslationProgress) => void, cancelSignal?: AbortSignal, + quality: QualityOptions = {}, ): Promise { if (doc.blocks.length === 0) { throw new Error('No subtitle blocks found in file'); } throwIfCancelled(cancelSignal); + const contextOverlap = quality.contextOverlap ?? DEFAULT_CONTEXT_OVERLAP; + const scanBudget = quality.scanBudget ?? DEFAULT_SCAN_BUDGET; + const refineAttribution = quality.refineAttribution ?? DEFAULT_REFINE_ATTRIBUTION; + const review = quality.review ?? DEFAULT_REVIEW; + const fileContext = await this.extractFileContext( - doc.blocks, sourceLang, targetLang, provider, cancelSignal, + doc.blocks, sourceLang, targetLang, provider, scanBudget, cancelSignal, ); + if (refineAttribution && !fileContext.isEmpty()) { + await this.refineSceneAttribution( + fileContext, doc.blocks, provider, concurrency, cancelSignal, + ); + } const batches = splitBatches(doc.blocks, batchSize); const results: SubtitleBlock[][] = new Array(batches.length); @@ -84,8 +116,13 @@ export class TranslationService { throwIfCancelled(cancelSignal); const i = nextIdx++; if (i >= batches.length) return; + const prevTail = + i > 0 && contextOverlap > 0 + ? 
batches[i - 1].slice(-contextOverlap) + : []; results[i] = await this.translateBatch( - batches[i], sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, + batches[i], sourceLang, targetLang, provider, maxRetries, fileContext, + prevTail, contextOverlap, review, cancelSignal, ); completed++; emit(); @@ -106,19 +143,22 @@ export class TranslationService { sourceLang: string, targetLang: string, provider: ProviderConfig, + scanBudget: number, cancelSignal?: AbortSignal, ): Promise { const sourceLine = sourceLang ? `Source language: ${sourceLang}\n` : ''; const userMessage = sourceLine + `Target language: ${targetLang}\n\n` + - serializeForScan(blocks); + serializeForScan(blocks, scanBudget); try { const raw = await this.callChat( CONTEXT_SYSTEM_PROMPT, userMessage, provider, SCAN_MAX_TOKENS, cancelSignal, ); - return parseContextResponse(stripMarkdownFences(raw)); + const ctx = parseContextResponse(stripMarkdownFences(raw)); + if (ctx.isEmpty()) return ctx; + return enrichScenesWithBlockText(ctx, blocks); } catch (err) { if (err instanceof TranslationCancelledError) throw err; console.warn('Context scan failed, proceeding without:', err); @@ -126,6 +166,74 @@ export class TranslationService { } } + private async refineSceneAttribution( + ctx: FileContext, + blocks: SubtitleBlock[], + provider: ProviderConfig, + concurrency: number, + cancelSignal?: AbortSignal, + ): Promise { + const targets = ctx.scenes.filter(needsAttribution); + if (!targets.length) return; + + let nextIdx = 0; + const worker = async () => { + while (true) { + throwIfCancelled(cancelSignal); + const i = nextIdx++; + if (i >= targets.length) return; + const scene = targets[i]; + try { + const userMsg = buildAttributionUserMessage(scene, blocks, ctx.characters); + const raw = await this.callChat( + ATTRIBUTION_SYSTEM_PROMPT, userMsg, provider, + (scene.end - scene.start + 1) * 20 + 100, cancelSignal, + ); + scene.attribution = parseAttributionResponse( + stripMarkdownFences(raw), 
scene, ctx.characters, + ); + } catch (err) { + if (err instanceof TranslationCancelledError) throw err; + console.warn( + `Attribution failed for blocks ${scene.start}-${scene.end}:`, err, + ); + } + } + }; + + const workerCount = Math.min(concurrency, targets.length); + await Promise.all(Array.from({ length: workerCount }, worker)); + } + + private async reviewBatch( + batch: SubtitleBlock[], + firstPass: SubtitleBlock[], + glossary: string, + provider: ProviderConfig, + cancelSignal?: AbortSignal, + ): Promise { + if (!glossary.trim()) return firstPass; + try { + const raw = await this.callChat( + REVIEW_SYSTEM_PROMPT, + buildReviewUserMessage(batch, firstPass, glossary), + provider, Math.max(batch.length, 1) * 120, cancelSignal, + ); + const parsed = parseLite(stripMarkdownFences(raw)); + if (parsed.length !== batch.length) return firstPass; + const revised = parsed.map((b, i) => ({ + number: batch[i].number, + timestamp: batch[i].timestamp, + text: b.text, + })); + return validateBatch(batch, revised).ok ? 
revised : firstPass; + } catch (err) { + if (err instanceof TranslationCancelledError) throw err; + console.warn('Review failed, keeping first-pass:', err); + return firstPass; + } + } + private async translateBatch( inputBlocks: SubtitleBlock[], sourceLang: string, @@ -133,6 +241,9 @@ export class TranslationService { provider: ProviderConfig, maxRetries: number, fileContext: FileContext, + prevTail: SubtitleBlock[], + contextOverlap: number, + review: boolean, cancelSignal?: AbortSignal, ): Promise { throwIfCancelled(cancelSignal); @@ -145,7 +256,9 @@ export class TranslationService { const batchWire = serializeLite(inputBlocks); const glossary = fileContext.renderForBatch(inputBlocks); - const userMessage = buildUserMessage(sourceLang, targetLang, batchWire, glossary); + const userMessage = buildUserMessage( + sourceLang, targetLang, batchWire, glossary, prevTail, + ); let hitValidationFailure = false; let lastError = ''; @@ -166,7 +279,14 @@ export class TranslationService { })); } const check = validateBatch(inputBlocks, output); - if (check.ok) return output; + if (check.ok) { + if (review) { + output = await this.reviewBatch( + inputBlocks, output, glossary, provider, cancelSignal, + ); + } + return output; + } hitValidationFailure = true; lastError = `validation: ${check.error}`; @@ -213,10 +333,14 @@ export class TranslationService { ); // Sequential: parallel halves would oversubscribe the worker pool slot. const leftResult = await this.translateBatch( - left, sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, + left, sourceLang, targetLang, provider, maxRetries, fileContext, + prevTail, contextOverlap, review, cancelSignal, ); + const rightPrev = + contextOverlap > 0 ? 
left.slice(-contextOverlap) : []; const rightResult = await this.translateBatch( - right, sourceLang, targetLang, provider, maxRetries, fileContext, cancelSignal, + right, sourceLang, targetLang, provider, maxRetries, fileContext, + rightPrev, contextOverlap, review, cancelSignal, ); return [...leftResult, ...rightResult]; } From 4c710b4e525c092bd389b2782fdbe026f647a1e3 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Sat, 25 Apr 2026 23:15:36 +0300 Subject: [PATCH 2/6] CLI: Boost translation quality with context - Adds options for context overlap, scan budget, and review to boost translation accuracy. - Enhances consistency by refining speaker attribution and glossary handling. - Improves gender-specific translation with conservative review of first-pass outputs. - Supports better handling of context between batches, reducing translation errors. --- cli/core/batch_runner.py | 75 ++++++- cli/core/config.py | 9 + cli/core/context_pass.py | 349 +++++++++++++++++++++++++++------ cli/core/prompt.py | 23 ++- cli/core/translator.py | 27 ++- cli/tests/test_context_pass.py | 334 ++++++++++++++++++++++++++++++- cli/tests/test_review_pass.py | 91 +++++++++ cli/translora.py | 21 ++ 8 files changed, 847 insertions(+), 82 deletions(-) create mode 100644 cli/tests/test_review_pass.py diff --git a/cli/core/batch_runner.py b/cli/core/batch_runner.py index c596f29..b83a590 100644 --- a/cli/core/batch_runner.py +++ b/cli/core/batch_runner.py @@ -11,7 +11,7 @@ from .context_pass import FileContext from .srt_parser import SubtitleBlock, parse_lite, serialize_lite, validate_batch from .config import TranslationConfig -from .prompt import SYSTEM_PROMPT +from .prompt import SYSTEM_PROMPT, REVIEW_SYSTEM_PROMPT REQUEST_TIMEOUT_SECS = 120.0 @@ -95,21 +95,68 @@ def _build_user_message( batch_wire: str, file_context: FileContext | None, batch: list[SubtitleBlock], + prev_tail: list[SubtitleBlock] | None = None, ) -> str: - if cfg.source_lang: - header = f"Translate from 
{cfg.source_lang} to {cfg.target_lang}:" - else: - header = f"Translate to {cfg.target_lang}:" + header = ( + f"Translate from {cfg.source_lang} to {cfg.target_lang}:" + if cfg.source_lang else f"Translate to {cfg.target_lang}:" + ) + sections: list[str] = [] if file_context is not None: - ctx = file_context.render_for_batch(batch) - if ctx: - return f"Glossary for this scene:\n{ctx}\n\n{header}\n\n{batch_wire}" - return f"{header}\n\n{batch_wire}" + glossary = file_context.render_for_batch(batch) + if glossary: + sections.append(f"Glossary for this scene:\n{glossary}") + if prev_tail: + # Non-numbered so the parser can't confuse these with real input blocks. + prev_lines = "\n".join( + f" [prev #{b.number}] {b.text.replace(chr(10), ' ')}" for b in prev_tail + ) + sections.append( + "Previous context (read-only, do NOT translate or output):\n" + prev_lines + ) + sections.append(f"{header}\n\n{batch_wire}") + return "\n\n".join(sections) _ATTEMPTS_BEFORE_SPLIT = 2 +async def _review_pass( + client: httpx.AsyncClient, + batch: list[SubtitleBlock], + first_pass: list[SubtitleBlock], + cfg: TranslationConfig, + file_context: FileContext | None, +) -> list[SubtitleBlock]: + """Re-check first-pass against the glossary; returns first-pass unchanged + if review output fails validation or there's no glossary to check against.""" + glossary = file_context.render_for_batch(batch) if file_context else "" + if not glossary: + return first_pass + user_msg = ( + f"Glossary:\n{glossary}\n\n" + f"Source blocks:\n{serialize_lite(batch)}\n\n" + f"First-pass translation:\n{serialize_lite(first_pass)}\n\n" + "Output the corrected translation (same wire format):" + ) + try: + raw = await call_chat_api( + client, REVIEW_SYSTEM_PROMPT, user_msg, cfg, max(len(batch), 1) * 120) + except Exception as e: + cfg.warn(f" Review failed, keeping first-pass: {e}") + return first_pass + parsed = parse_lite(strip_markdown_fences(raw)) + if len(parsed) != len(batch): + return first_pass + revised = [ 
+ SubtitleBlock(number=batch[i].number, + timestamp=batch[i].timestamp, + text=parsed[i].text) + for i in range(len(batch)) + ] + return revised if validate_batch(batch, revised).ok else first_pass + + async def translate_batch_with_retry( client: httpx.AsyncClient, batch_idx: int, @@ -117,6 +164,7 @@ async def translate_batch_with_retry( cfg: TranslationConfig, file_context: FileContext | None = None, _split_path: str = "", + prev_tail: list[SubtitleBlock] | None = None, ) -> list[SubtitleBlock]: """Translate one batch; on repeated validation failure, halve and recurse. @@ -125,7 +173,7 @@ async def translate_batch_with_retry( because at N=1 a count mismatch is impossible. """ batch_wire = serialize_lite(batch) - user_msg = _build_user_message(cfg, batch_wire, file_context, batch) + user_msg = _build_user_message(cfg, batch_wire, file_context, batch, prev_tail) label = f"Batch {batch_idx + 1}" + (f".{_split_path}" if _split_path else "") first_block = batch[0].number @@ -149,6 +197,10 @@ async def translate_batch_with_retry( ] check = validate_batch(batch, output) if check.ok: + if cfg.review: + output = await _review_pass( + client, batch, output, cfg, file_context, + ) return output hit_validation_failure = True cfg.warn(f" {label} validation failed ({tag}): {check.error}") @@ -185,9 +237,12 @@ async def translate_batch_with_retry( # Sequential: parallel halves would oversubscribe the outer semaphore. 
left_result = await translate_batch_with_retry( client, batch_idx, left, cfg, file_context, left_path, + prev_tail=prev_tail, ) + right_prev = left[-cfg.context_overlap:] if cfg.context_overlap else [] right_result = await translate_batch_with_retry( client, batch_idx, right, cfg, file_context, right_path, + prev_tail=right_prev, ) return left_result + right_result diff --git a/cli/core/config.py b/cli/core/config.py index 67b98bf..93f7305 100644 --- a/cli/core/config.py +++ b/cli/core/config.py @@ -28,6 +28,15 @@ class TranslationConfig: batch_size: int = 10 concurrency: int = 1 max_retries: int = DEFAULT_MAX_RETRIES + # Prepass scan budget (chars). Sized for full-quality scans on typical + # TV episodes; lower on tight-context local models (~8k window). + scan_char_budget: int = 24_000 + # Previous-batch source blocks shown as read-only context; 0 disables. + context_overlap: int = 2 + # One small LLM call per ambiguous scene; fixes cross-gender addressee slips. + refine_attribution: bool = True + # One extra call per batch; fixes gender/number/consistency slips. Doubles cost. + review: bool = True quiet: bool = False verbose: bool = False warn: Callable[[str], None] = field(default=_silent_warn) diff --git a/cli/core/context_pass.py b/cli/core/context_pass.py index 8b3dedf..30b560d 100644 --- a/cli/core/context_pass.py +++ b/cli/core/context_pass.py @@ -1,8 +1,8 @@ -"""Prepass scan: extract cast, terms, and register from the whole file once -so every batch shares the same glossary. Fails silently to an empty context.""" +"""Prepass scan: one call extracts cast, terms, scenes, and register.""" from __future__ import annotations +import asyncio import re from dataclasses import dataclass, field @@ -17,12 +17,13 @@ for the translator to use when picking correct pronouns, consistent names, and a single consistent register. -Your reply MUST start with `` and MUST contain all four sections -below, in this exact order, with no other text before, between, or after them. 
-No commentary. No code fences. No explanations. Tags only. +Input blocks are prefixed with their block number as `[N] text`. + +Reply with all five sections below in this exact order. No commentary, no +fences — tags only. -ONE LINE describing the target-language variant and formality the translator should use for the ENTIRE file. +ONE LINE describing the target-language variant and formality. NAME => TARGET_NAME | GENDER @@ -30,27 +31,42 @@ SOURCE => TARGET + +START-END => description that NAMES the characters involved + - NOTE Rules: -- The line names the specific target-language variant and formality (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Simplified Mandarin, neutral", "Japanese, polite です/ます form"). Pick ONE and commit to it for the whole file. Base the choice on the source's tone; default to the standard written form of the target language unless the source is clearly colloquial. -- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal at all. -- TARGET_NAME is how the character's name should appear in the target language (transliterated or localized). -- Include up to 20 named characters, 10 recurring proper terms or jargon, 4 brief notes on setting/tone. +- : name the exact target variant (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Japanese, polite です/ます form"). Pick one for the whole file. +- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal. +- TARGET_NAME is how the character's name should appear in the target language. +- : every ≥3-block stretch of dialogue between named characters. Name the characters explicitly using the names from so the translator can apply the right gender per range. Ranges may touch but must not overlap. +- Example: `105-119 => Maria reassures Alex about the interview` (use the actual names from YOUR section). +- Include up to 20 characters, 10 terms, 40 scenes, 4 notes. 
- Leave a section empty (tags only) if nothing qualifies. Never omit a section.\ """ -# Sized so small-context models (4k-8k) still have room for prompt + output. -_SCAN_CHAR_BUDGET = 12_000 -_SCAN_MAX_TOKENS = 1500 +_ATTRIBUTION_SYSTEM_PROMPT = """\ +You identify the speaker of each subtitle line in a short scene. Given a +character list and a block-numbered scene excerpt (`[N] text`), reply with +exactly one line per input block as `N=SpeakerName`. SpeakerName MUST be one +of the listed characters or the literal "unknown". No commentary, no fences.\ +""" +_SCAN_MAX_TOKENS = 3000 +_MIN_NAME_LEN = 3 +_ATTRIB_MIN_BLOCKS = 3 _SECTION_RE = re.compile( - r"<(?Pregister|characters|terms|notes)>\s*(?P.*?)\s*", + r"<(?Pregister|characters|terms|scenes|notes)>\s*" + r"(?P.*?)\s*" + r"(?=|<(?:register|characters|terms|scenes|notes)>|\Z)", re.I | re.S, ) +_SCENE_RANGE_RE = re.compile(r"^(\d+)\s*(?:-\s*(\d+))?$") +_ATTRIB_LINE_RE = re.compile(r"^\s*(\d+)\s*=\s*(.+?)\s*$") @dataclass @@ -66,61 +82,164 @@ class TermHint: target: str +@dataclass +class SceneHint: + start: int + end: int + description: str + participants: list[str] = field(default_factory=list) + # Per-block speaker map (block_number -> character source name), filled + # by refine_scene_attribution. + attribution: dict[int, str] = field(default_factory=dict) + + @dataclass class FileContext: register: str = "" characters: list[CharacterHint] = field(default_factory=list) terms: list[TermHint] = field(default_factory=list) + scenes: list[SceneHint] = field(default_factory=list) notes: list[str] = field(default_factory=list) def is_empty(self) -> bool: - return not (self.register or self.characters or self.terms or self.notes) + return not (self.register or self.characters or self.terms + or self.scenes or self.notes) def render_for_batch(self, batch: list[SubtitleBlock]) -> str: - """Return a glossary slice scoped to names/terms present in this batch. 
- Register and notes are file-wide and always included if set.""" + """Glossary slice scoped to this batch. Register/notes are file-wide.""" text = "\n".join(b.text for b in batch) - chars = [h for h in self.characters if _contains_word(text, h.source)] - terms = [h for h in self.terms if _contains_word(text, h.source)] - if not self.register and not chars and not terms and not self.notes: + scenes = _scenes_overlapping(self.scenes, batch) + # Include characters named in the batch AND scene participants — the + # latter covers speakers who address each other as "you" without + # vocatives, so the translator still learns their gender. + scene_names = {p for s in scenes for p in s.participants} + chars = [h for h in self.characters + if _find_word(text, h.source) >= 0 or h.source in scene_names] + terms = [h for h in self.terms if _find_word(text, h.source) >= 0] + if not (self.register or chars or terms or scenes or self.notes): return "" + gender_by = {h.source.casefold(): h.gender for h in self.characters} parts: list[str] = [] if self.register: parts.append(f"Target register: {self.register} (use consistently across every block)") if chars: - lines = [f"- {h.source} => {h.target} ({h.gender})" for h in chars] - parts.append("Characters:\n" + "\n".join(lines)) + parts.append("Characters:\n" + "\n".join( + f"- {h.source} => {h.target} ({h.gender})" for h in chars)) if terms: - lines = [f"- {h.source} => {h.target}" for h in terms] - parts.append("Terms:\n" + "\n".join(lines)) + parts.append("Terms:\n" + "\n".join( + f"- {h.source} => {h.target}" for h in terms)) + if scenes: + parts.append(_render_scenes(scenes, gender_by)) if self.notes: - lines = [f"- {n}" for n in self.notes[:4]] - parts.append("Notes:\n" + "\n".join(lines)) + parts.append("Notes:\n" + "\n".join(f"- {n}" for n in self.notes[:4])) return "\n\n".join(parts) -def _contains_word(text: str, word: str) -> bool: - if not word: - return False - return re.search(rf"(? 
list[SceneHint]: + if not scenes or not batch: + return [] + first, last = batch[0].number, batch[-1].number + return [s for s in scenes if s.end >= first and s.start <= last] -def serialize_for_scan(blocks: list[SubtitleBlock]) -> str: - """Text for the scan pass. Stride-samples large files so characters - introduced late still have a chance to land in the glossary.""" - total_chars = sum(len(b.text) + 1 for b in blocks) - if total_chars <= _SCAN_CHAR_BUDGET or len(blocks) <= 1: - return "\n".join(b.text for b in blocks) +def _gender_mark(g: str | None) -> str: + return "M" if g == "male" else "F" if g == "female" else "" + + +def _render_scenes(scenes: list[SceneHint], gender_by: dict[str, str]) -> str: + lines: list[str] = [] + for s in scenes: + tagged = ", ".join( + f"{n} ({mark})" if (mark := _gender_mark(gender_by.get(n.casefold()))) else n + for n in s.participants + ) + prefix = f"- Blocks {s.start}-{s.end}:" + lines.append( + f"{prefix} [{tagged}] — {s.description}" if tagged + else f"{prefix} {s.description}") + if s.attribution: + speakers = " ".join(f"{n}={s.attribution[n]}" for n in sorted(s.attribution)) + lines.append(f" speakers: {speakers}") + return ( + "Scene guidance — each entry applies ONLY to its listed block range. " + "Participants and genders in [brackets]; a 'speakers:' line names the " + "speaker per block so you pick the right gender for the ADDRESSEE:\n" + + "\n".join(lines) + ) + - take_n = max(1, int(len(blocks) * _SCAN_CHAR_BUDGET / total_chars)) +def _find_word(text: str, word: str) -> int: + """Case-insensitive whole-word search with Unicode-aware boundaries. + Works for Latin, Arabic, CJK, etc. 
Returns first match index or -1.""" + if not text or not word: + return -1 + haystack, needle = text.casefold(), word.casefold() + nlen = len(needle) + i = 0 + while i <= len(haystack) - nlen: + j = haystack.find(needle, i) + if j < 0: + return -1 + before = text[j - 1] if j > 0 else "" + after = text[j + nlen] if j + nlen < len(text) else "" + # isalnum is Unicode-aware. + if not (before.isalnum() or before == "_") and not (after.isalnum() or after == "_"): + return j + i = j + 1 + return -1 + + +def _detect_participants( + text: str, characters: list[CharacterHint], +) -> list[str]: + """Source names whose source OR target form appears in `text` as a whole + word, in order of first appearance. Matches both forms because scan + descriptions often slip into the target language.""" + aliases: list[tuple[str, str]] = [] # (alias, source_name) + for h in characters: + if len(h.source) >= _MIN_NAME_LEN: + aliases.append((h.source, h.source)) + if h.target != h.source and len(h.target) >= _MIN_NAME_LEN: + aliases.append((h.target, h.source)) + aliases.sort(key=lambda a: len(a[0]), reverse=True) + + first_at: dict[str, int] = {} + for alias, name in aliases: + if name in first_at: + continue + idx = _find_word(text, alias) + if idx >= 0: + first_at[name] = idx + return sorted(first_at, key=first_at.__getitem__) + + +def _format_scan_line(b: SubtitleBlock) -> str: + return f"[{b.number}] " + b.text.replace("\n", " ") + + +def serialize_for_scan( + blocks: list[SubtitleBlock], char_budget: int, +) -> str: + """Text for the scan pass. 
Stride-samples large files so characters + introduced late still land in the glossary.""" + total = sum(len(_format_scan_line(b)) + 1 for b in blocks) + if total <= char_budget or len(blocks) <= 1: + return "\n".join(_format_scan_line(b) for b in blocks) + take_n = max(1, int(len(blocks) * char_budget / total)) step = len(blocks) / take_n sampled = [blocks[int(i * step)] for i in range(take_n)] - return "\n".join(b.text for b in sampled) + return "\n".join(_format_scan_line(b) for b in sampled) + + +def _strip_bullet(line: str) -> str: + return line.strip().lstrip("-*• ").strip() def parse_context_response(text: str) -> FileContext: - """Parse the tagged response. Tolerates extra whitespace and bullet markers.""" + """Parse the tagged response. Tolerates whitespace and bullet markers.""" sections = { m.group("tag").lower(): m.group("body") for m in _SECTION_RE.finditer(text or "") @@ -130,7 +249,7 @@ def parse_context_response(text: str) -> FileContext: characters: list[CharacterHint] = [] for line in sections.get("characters", "").splitlines(): - line = line.strip().lstrip("-*• ").strip() + line = _strip_bullet(line) if not line or "=>" not in line: continue src, rest = line.split("=>", 1) @@ -141,30 +260,43 @@ def parse_context_response(text: str) -> FileContext: tgt, gender = rest.strip(), "unknown" if gender not in ("male", "female", "unknown"): gender = "unknown" - src = src.strip() - if src and tgt: - characters.append(CharacterHint(src, tgt, gender)) + if src.strip() and tgt: + characters.append(CharacterHint(src.strip(), tgt, gender)) terms: list[TermHint] = [] for line in sections.get("terms", "").splitlines(): - line = line.strip().lstrip("-*• ").strip() + line = _strip_bullet(line) if not line or "=>" not in line: continue src, tgt = line.split("=>", 1) - src, tgt = src.strip(), tgt.strip() - if src and tgt: - terms.append(TermHint(src, tgt)) + if src.strip() and tgt.strip(): + terms.append(TermHint(src.strip(), tgt.strip())) + + scenes: list[SceneHint] 
= [] + for line in sections.get("scenes", "").splitlines(): + line = _strip_bullet(line) + if not line or "=>" not in line: + continue + rng, desc = line.split("=>", 1) + m = _SCENE_RANGE_RE.match(rng.strip()) + if not m or not desc.strip(): + continue + start = int(m.group(1)) + end = int(m.group(2)) if m.group(2) else start + if end < start: + start, end = end, start + scenes.append(SceneHint( + start=start, end=end, description=desc.strip(), + participants=_detect_participants(desc, characters), + )) - notes: list[str] = [] - for line in sections.get("notes", "").splitlines(): - line = line.strip().lstrip("-*• ").strip() - if line: - notes.append(line) + notes = [_strip_bullet(l) for l in sections.get("notes", "").splitlines() if _strip_bullet(l)] return FileContext( register=register, characters=characters[:20], terms=terms[:10], + scenes=scenes[:80], notes=notes[:4], ) @@ -174,21 +306,17 @@ async def extract_file_context( blocks: list[SubtitleBlock], cfg: TranslationConfig, ) -> FileContext: - """Run one scan call and return a FileContext. Empty on any failure.""" + """Run one scan call. Returns the parsed+enriched context.""" from .batch_runner import call_chat_api, strip_markdown_fences source_line = f"Source language: {cfg.source_lang}\n" if cfg.source_lang else "" - user_message = ( - f"{source_line}" - f"Target language: {cfg.target_lang}\n\n" - f"{serialize_for_scan(blocks)}" + user_msg = ( + f"{source_line}Target language: {cfg.target_lang}\n\n" + f"{serialize_for_scan(blocks, cfg.scan_char_budget)}" ) try: raw = await call_chat_api( - client, - CONTEXT_SYSTEM_PROMPT, - user_message, - cfg, + client, CONTEXT_SYSTEM_PROMPT, user_msg, cfg, max_tokens=_SCAN_MAX_TOKENS, ) except Exception as e: @@ -197,7 +325,104 @@ async def extract_file_context( context = parse_context_response(strip_markdown_fences(raw)) if context.is_empty(): - # Diagnostic snippet: helps tell whether the model ignored tags, truncated, or refused. 
snippet = (raw or "").strip().replace("\n", " ")[:240] cfg.warn(f" Context scan returned empty glossary. Raw start: {snippet!r}") + else: + enrich_scenes_with_block_text(context, blocks) + return context + + +def enrich_scenes_with_block_text( + context: FileContext, blocks: list[SubtitleBlock], +) -> FileContext: + """Reconcile scene participants with what's actually in the source blocks. + Block-text names are primary truth: description-named participants are + kept only if grounded in the text, and any block-text names missed by the + description are appended.""" + if not context.scenes or not context.characters: + return context + by_num = {b.number: b for b in blocks} + for s in context.scenes: + joined = "\n".join( + by_num[n].text for n in range(s.start, s.end + 1) if n in by_num) + in_text = _detect_participants(joined, context.characters) + in_text_set = set(in_text) + kept = [p for p in s.participants if p in in_text_set] + seen = set(kept) + for name in in_text: + if name not in seen: + kept.append(name) + seen.add(name) + s.participants = kept return context + + +def _needs_attribution(scene: SceneHint, gender_by: dict[str, str]) -> bool: + return (scene.end - scene.start + 1 >= _ATTRIB_MIN_BLOCKS + and len(scene.participants) >= 1) + + +async def _attribute_scene( + client: httpx.AsyncClient, + scene: SceneHint, + by_num: dict[int, SubtitleBlock], + cfg: TranslationConfig, + characters: list[CharacterHint], +) -> dict[int, str]: + from .batch_runner import call_chat_api + present = set(scene.participants) + roster = "\n".join( + f"- {h.source} ({_gender_mark(h.gender) or '?'})" + for h in characters if h.source in present + ) + block_lines = [ + f"[{n}] {by_num[n].text.replace(chr(10), ' ')}" + for n in range(scene.start, scene.end + 1) if n in by_num + ] + if not block_lines or not roster: + return {} + user_msg = f"Characters:\n{roster}\n\nScene:\n" + "\n".join(block_lines) + try: + raw = await call_chat_api( + client, _ATTRIBUTION_SYSTEM_PROMPT, 
user_msg, cfg, + max_tokens=len(block_lines) * 20 + 100, + ) + except Exception as e: + cfg.warn(f" Attribution failed for blocks {scene.start}-{scene.end}: {e}") + return {} + out: dict[int, str] = {} + valid = {h.source for h in characters} | {"unknown"} + for line in (raw or "").splitlines(): + m = _ATTRIB_LINE_RE.match(line) + if not m: + continue + n = int(m.group(1)) + name = m.group(2).strip().strip('"\'') + if scene.start <= n <= scene.end and name in valid: + out[n] = name + return out + + +async def refine_scene_attribution( + client: httpx.AsyncClient, + context: FileContext, + blocks: list[SubtitleBlock], + cfg: TranslationConfig, +) -> None: + """Fill `SceneHint.attribution` for multi-block scenes with named + participants. One small LLM call per target scene, bounded by concurrency.""" + if not context.scenes or not context.characters: + return + gender_by = {h.source.casefold(): h.gender for h in context.characters} + targets = [s for s in context.scenes if _needs_attribution(s, gender_by)] + if not targets: + return + by_num = {b.number: b for b in blocks} + sem = asyncio.Semaphore(max(1, cfg.concurrency)) + + async def do(scene: SceneHint) -> None: + async with sem: + scene.attribution = await _attribute_scene( + client, scene, by_num, cfg, context.characters) + + await asyncio.gather(*(do(s) for s in targets)) diff --git a/cli/core/prompt.py b/cli/core/prompt.py index 45763af..4888500 100644 --- a/cli/core/prompt.py +++ b/cli/core/prompt.py @@ -1,4 +1,4 @@ -"""The translation prompt, kept in one place so it can be iterated on.""" +"""The translation prompts, kept in one place so they can be iterated on.""" SYSTEM_PROMPT = """\ You are a subtitle translator. You will receive numbered subtitle blocks (no timestamps) and translate them. @@ -15,8 +15,11 @@ - Translate each block independently — never combine split sentences. - Translate faithfully: profanity, slurs, slang — match the original register. 
- Conversational tone, concise — must fit the original timing. -- If a glossary is provided, use each character's listed gender when choosing pronouns and verb forms in the target language, and use the listed target-language name consistently. -- Use ONE consistent register and variant of the target language across every block. Do not switch dialects or formality between batches. If the target language has a standard written form (e.g., Modern Standard Arabic), use it by default unless the source is clearly colloquial. +- If a glossary is provided, use each character's listed gender for pronouns/verb forms, and the listed target-language name consistently. +- "Scene guidance" entries apply PER BLOCK RANGE only. Match the addressee's gender (not just the speaker's). For exactly-two referents addressed together, use the target's dual form if it has one. +- A `speakers:` line (e.g. `120=Alice 121=Alice 122=Bob`) names the speaker per block. The ADDRESSEE is usually the other named participant — use the addressee's gender (from [brackets]) for second-person forms. +- "Previous context" blocks (if shown) are read-only — infer speaker/addressee from them, do NOT translate or include them. +- Use ONE consistent register and variant of the target language across every block. If the target language has a standard written form (e.g. Modern Standard Arabic), use it unless the source is clearly colloquial. DO NOT TRANSLATE (copy verbatim): - HTML tags, music symbols, formatting tags (\\N, {\\an8}) @@ -26,3 +29,17 @@ Output ONLY the translated .srt blocks. No commentary, no markdown fences.\ """ + + +REVIEW_SYSTEM_PROMPT = """\ +You are a conservative subtitle translation reviewer. You receive a glossary, source blocks, and a first-pass translation in `\\ntext` wire format. + +DEFAULT: output the first-pass UNCHANGED. 
Only fix clear violations of the glossary: +- Wrong addressee gender (pronouns, verb conjugation, adjective ending, honorific level) when the glossary unambiguously names the addressee's gender. +- Character name spelled differently from the target form in the glossary. +- Dual/plural/singular agreement when the glossary explicitly flags the count. + +If uncertain, keep the block verbatim. Do NOT rephrase, restyle, or "polish". Same number of blocks, same block numbers, same line-count per block. + +Output: same wire format, one blank line between blocks. ALL blocks. No commentary, no fences.\ +""" diff --git a/cli/core/translator.py b/cli/core/translator.py index a4d68a6..039c5c4 100644 --- a/cli/core/translator.py +++ b/cli/core/translator.py @@ -13,7 +13,7 @@ from .formats import parse_subtitle from .config import DEFAULT_MAX_RETRIES, TranslationConfig from .batch_runner import FileTranslationError, translate_batch_with_retry -from .context_pass import FileContext, extract_file_context +from .context_pass import FileContext, extract_file_context, refine_scene_attribution from .time_tracker import EtaEstimator, format_duration from .live_status import Colors, LiveLine, Ticker @@ -66,16 +66,27 @@ async def translate_file_async( async with httpx.AsyncClient() as scan_client: if not cfg.quiet: print(colors.dim(" Scanning for cast and context...")) - file_context = await extract_file_context(scan_client, doc.blocks, cfg) + file_context = await extract_file_context( + scan_client, doc.blocks, cfg, + ) + if cfg.refine_attribution and not file_context.is_empty(): + if not cfg.quiet: + print(colors.dim(" Attributing speakers in mixed-gender scenes...")) + await refine_scene_attribution( + scan_client, file_context, doc.blocks, cfg, + ) if not cfg.quiet: if file_context.is_empty(): print(colors.dim(" Glossary: empty (proceeding without context hints)")) else: chars = len(file_context.characters) terms = len(file_context.terms) + scenes = len(file_context.scenes) + attrib 
= sum(1 for s in file_context.scenes if s.attribution) notes = len(file_context.notes) print(colors.dim( - f" Glossary: {chars} character(s), {terms} term(s), {notes} note(s)" + f" Glossary: {chars} character(s), {terms} term(s), " + f"{scenes} scene(s) ({attrib} attributed), {notes} note(s)" )) if file_context.register: print(colors.dim(f" Register: {file_context.register}")) @@ -99,6 +110,14 @@ async def translate_file_async( print(colors.dim(f" Output: {output_path}")) +def _prev_tail( + batches: list[list[SubtitleBlock]], idx: int, overlap: int, +) -> list[SubtitleBlock]: + if idx <= 0 or overlap <= 0: + return [] + return batches[idx - 1][-overlap:] + + async def _run_batches( batches: list[list[SubtitleBlock]], cfg: TranslationConfig, @@ -147,9 +166,11 @@ async def run_one(idx: int) -> None: if failure: return batch_start = time.time() + prev_tail = _prev_tail(batches, idx, cfg.context_overlap) try: results[idx] = await translate_batch_with_retry( client, idx, batches[idx], cfg, file_context, + prev_tail=prev_tail, ) except FileTranslationError as e: failure = e diff --git a/cli/tests/test_context_pass.py b/cli/tests/test_context_pass.py index 50fefa2..9fb07e4 100644 --- a/cli/tests/test_context_pass.py +++ b/cli/tests/test_context_pass.py @@ -2,12 +2,16 @@ FileContext, CharacterHint, TermHint, + SceneHint, + _needs_attribution, + enrich_scenes_with_block_text, parse_context_response, serialize_for_scan, - _SCAN_CHAR_BUDGET, ) from core.srt_parser import SubtitleBlock +_TEST_BUDGET = 24_000 + def _block(n: int, text: str) -> SubtitleBlock: return SubtitleBlock(number=n, timestamp="00:00:00,000 --> 00:00:01,000", text=text) @@ -77,6 +81,31 @@ def test_is_empty_considers_register(): assert not FileContext(register="Target language").is_empty() +def test_parse_tolerates_missing_closing_tag(): + # Real scan models sometimes drop the closing tag before the + # next section. The body should still parse up to the next opening tag. 
+ raw = """ + +Target variant + + +Alice => آليس | female + + + + +1-5 => Alice speaks +6-10 => Alice continues + +- tone note + +""" + ctx = parse_context_response(raw) + assert ctx.register == "Target variant" + assert len(ctx.scenes) == 2 + assert ctx.notes == ["tone note"] + + def test_parse_tolerates_missing_sections_and_bullets(): raw = """ @@ -134,20 +163,317 @@ def test_render_word_boundary_does_not_match_substrings(): assert "Alice" not in ctx.render_for_batch(batch) +def test_parse_scenes(): + raw = """ + + + + + + + +97-117 => Alice and Carol discuss a concern +279-284 => Dave talks about his daughters +42 => Bob monologues + + + +""" + ctx = parse_context_response(raw) + assert [(s.start, s.end, s.description) for s in ctx.scenes] == [ + (97, 117, "Alice and Carol discuss a concern"), + (279, 284, "Dave talks about his daughters"), + (42, 42, "Bob monologues"), + ] + # No characters section, so no participants should be detected. + for s in ctx.scenes: + assert s.participants == [] + + +def test_parse_scenes_detects_participants_from_characters(): + raw = """ + + + +Alice => Alice | female +Carol => Carol | female +Dave => Dave | male + + + + +97-117 => Alice tells Carol her worries +279-284 => Dave complains about his daughters + + + +""" + ctx = parse_context_response(raw) + assert ctx.scenes[0].participants == ["Alice", "Carol"] + assert ctx.scenes[1].participants == ["Dave"] + + +def test_parse_scenes_detects_participants_via_target_name(): + # Scan model wrote the scene description using the target-language form + # of the character's name (common when prompt output slips into the + # target language). We should still resolve it back to the source name. 
+ raw = """ + +Alice => آليس | female +Carol => كارول | female + + +97-117 => آليس تخبر كارول بمخاوفها + +""" + ctx = parse_context_response(raw) + assert ctx.scenes[0].participants == ["Alice", "Carol"] + + +def test_needs_attribution_triggers_on_multi_block_named_scenes(): + g = {"alice": "female", "bob": "male"} + multi_named = SceneHint(start=1, end=5, description="x", participants=["Alice", "Bob"]) + multi_one = SceneHint(start=1, end=5, description="x", participants=["Alice"]) + two_block = SceneHint(start=1, end=2, description="x", participants=["Alice"]) + no_one = SceneHint(start=1, end=5, description="x", participants=[]) + assert _needs_attribution(multi_named, g) is True + assert _needs_attribution(multi_one, g) is True + assert _needs_attribution(two_block, g) is False + assert _needs_attribution(no_one, g) is False + + +def test_render_for_batch_includes_speakers_line_when_attribution_present(): + ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Bob", "Bob", "male"), + ], + scenes=[SceneHint( + start=10, end=12, description="Alice advises Bob", + participants=["Alice", "Bob"], + attribution={10: "Alice", 11: "Alice", 12: "Bob"}, + )], + ) + batch = [_block(10, "x"), _block(11, "y"), _block(12, "z")] + out = ctx.render_for_batch(batch) + assert "speakers: 10=Alice 11=Alice 12=Bob" in out + assert "[Alice (F), Bob (M)]" in out + + +def test_enrich_scenes_pulls_names_from_block_text_when_description_omits_them(): + # Description says nothing about who's speaking, but the block text + # contains a vocative — the classic "the summary is abstract but the + # dialogue names names" case. 
+ ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Dave", "Dave", "male"), + ], + scenes=[SceneHint(start=1, end=3, description="A tense conversation")], + ) + blocks = [ + _block(1, "Alice, I need to talk to you."), + _block(2, "About what?"), + _block(3, "Dave said he's leaving."), + ] + enriched = enrich_scenes_with_block_text(ctx, blocks) + assert enriched.scenes[0].participants == ["Alice", "Dave"] + + +def test_enrich_scenes_preserves_description_order_and_dedups(): + ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Dave", "Dave", "male"), + ], + scenes=[SceneHint( + start=1, end=2, + description="Dave talks to someone", + participants=["Dave"], + )], + ) + blocks = [ + _block(1, "Alice, look at this."), + _block(2, "Dave, calm down."), + ] + enriched = enrich_scenes_with_block_text(ctx, blocks) + # "Dave" kept (grounded in block 2), "Alice" appended (found in block 1). + assert enriched.scenes[0].participants == ["Dave", "Alice"] + + +def test_enrich_drops_description_name_not_in_blocks(): + # Scan hallucinated "Alice" into the description but she never actually + # speaks in these blocks — drop her, keep only Dave who's really there. + ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Dave", "Dave", "male"), + ], + scenes=[SceneHint( + start=1, end=2, + description="Alice and Dave talk", + participants=["Alice", "Dave"], + )], + ) + blocks = [ + _block(1, "Dave, are you okay?"), + _block(2, "I'm fine."), + ] + enriched = enrich_scenes_with_block_text(ctx, blocks) + assert enriched.scenes[0].participants == ["Dave"] + + +def test_parse_scenes_rejects_substring_match_inside_other_words(): + # A 2-char Arabic transliteration like "لو" would substring-match inside + # many Arabic words (e.g. "الوقوف" contains "لو"). We require whole-word + # matching AND a minimum alias length of 3 to avoid these collisions. 
+ raw = """ + +Lou => لو | male +Alice => آليس | female + + +10-20 => نصائح حول الوقوف وتأثيره على الصحة +21-25 => آليس تطمئن + +""" + ctx = parse_context_response(raw) + # "Lou" (2-char target "لو") must NOT match inside "الوقوف" (standing). + assert ctx.scenes[0].participants == [] + assert ctx.scenes[1].participants == ["Alice"] + + +def test_parse_scenes_skips_malformed_lines(): + raw = """ + +- 10-20 => Two characters (M, F) +- no-range => missing range +- 30 40 => bad separator +- 50-60 => +- 70-80 => good one + +""" + ctx = parse_context_response(raw) + assert [(s.start, s.end) for s in ctx.scenes] == [(10, 20), (70, 80)] + + +def test_parse_scenes_swaps_reversed_range(): + raw = """ + +200-100 => Accidentally reversed + +""" + ctx = parse_context_response(raw) + assert ctx.scenes[0].start == 100 + assert ctx.scenes[0].end == 200 + + +def test_render_includes_overlapping_scenes_only(): + ctx = FileContext( + scenes=[ + SceneHint(start=1, end=5, description="Scene A"), + SceneHint(start=10, end=20, description="Scene B"), + SceneHint(start=50, end=60, description="Scene C"), + ], + ) + # Batch covers blocks 15-25 — touches scene B only. + batch = [_block(15, "line"), _block(25, "line")] + rendered = ctx.render_for_batch(batch) + assert "Scene B" in rendered + assert "Scene A" not in rendered + assert "Scene C" not in rendered + assert "Blocks 10-20" in rendered + + +def test_render_scene_boundary_touch_is_match(): + # Batch first-block equals scene end — still overlaps. + ctx = FileContext(scenes=[SceneHint(start=5, end=10, description="Boundary scene")]) + batch = [_block(10, "line"), _block(15, "line")] + assert "Boundary scene" in ctx.render_for_batch(batch) + + +def test_render_includes_scene_participants_even_if_unnamed_in_batch_text(): + # Carol's name isn't vocatively spoken in the batch blocks, but she IS a + # scene participant — the translator still needs to know her gender. 
+ ctx = FileContext( + characters=[ + CharacterHint("Carol", "Carol", "female"), + CharacterHint("Dave", "Dave", "male"), + ], + scenes=[SceneHint( + start=1, end=2, description="A conversation", + participants=["Carol"], + )], + ) + batch = [_block(1, "Drink water."), _block(2, "Oh, right.")] + out = ctx.render_for_batch(batch) + assert "Carol => Carol (female)" in out + # Dave isn't a participant and isn't in the text — must NOT be listed. + assert "Dave" not in out + + +def test_render_scene_tags_participants_with_gender(): + ctx = FileContext( + characters=[ + CharacterHint("Alice", "Alice", "female"), + CharacterHint("Bob", "Bob", "male"), + ], + scenes=[ + SceneHint( + start=10, end=20, + description="Alice gives Bob an update", + participants=["Alice", "Bob"], + ), + ], + ) + batch = [_block(10, "x"), _block(20, "y")] + rendered = ctx.render_for_batch(batch) + assert "Alice (F)" in rendered + assert "Bob (M)" in rendered + assert "Alice gives Bob an update" in rendered + + +def test_render_scene_without_participants_falls_back_to_description(): + ctx = FileContext( + scenes=[SceneHint(start=1, end=5, description="Crowd murmurs")], + ) + batch = [_block(1, "x")] + rendered = ctx.render_for_batch(batch) + assert "Crowd murmurs" in rendered + # No square-bracket prefix when no participants were detected. + assert "[" not in rendered.split("Crowd murmurs")[0].split("Blocks 1-5:")[-1] + + +def test_is_empty_considers_scenes(): + ctx = FileContext(scenes=[SceneHint(start=1, end=2, description="x")]) + assert not ctx.is_empty() + + def test_serialize_for_scan_returns_all_text_when_under_budget(): blocks = [_block(i, f"Line {i}.") for i in range(1, 6)] - out = serialize_for_scan(blocks) + out = serialize_for_scan(blocks, _TEST_BUDGET) for i in range(1, 6): assert f"Line {i}." in out + assert f"[{i}]" in out def test_serialize_for_scan_samples_large_files_under_budget(): # Build a file that clearly exceeds the scan budget. 
long_line = "x" * 500 blocks = [_block(i, f"{long_line}-{i}") for i in range(1, 500)] - out = serialize_for_scan(blocks) - assert len(out) <= _SCAN_CHAR_BUDGET * 1.1 # small slack for newlines + out = serialize_for_scan(blocks, _TEST_BUDGET) + assert len(out) <= _TEST_BUDGET * 1.1 # small slack for newlines # Sampled output must include blocks from across the whole file, # not just the first N. assert any(f"-{i}" in out for i in range(1, 20)) assert any(f"-{i}" in out for i in range(450, 500)) + + +def test_serialize_for_scan_joins_multiline_block_text(): + # Multi-line text must be joined onto the [N] line so the prefix stays + # usable for scene-range references. + blocks = [_block(1, "First line\nSecond line")] + out = serialize_for_scan(blocks, _TEST_BUDGET) + assert out.splitlines()[0].startswith("[1] ") + assert "First line" in out + assert "Second line" in out diff --git a/cli/tests/test_review_pass.py b/cli/tests/test_review_pass.py new file mode 100644 index 0000000..6132e37 --- /dev/null +++ b/cli/tests/test_review_pass.py @@ -0,0 +1,91 @@ +"""Tests for the post-edit review pass in batch_runner.""" +from __future__ import annotations + +import asyncio +from dataclasses import dataclass + +import core.batch_runner as br +from core.context_pass import CharacterHint, FileContext +from core.srt_parser import SubtitleBlock + + +def _block(n: int, text: str) -> SubtitleBlock: + return SubtitleBlock(number=n, timestamp="00:00:00,000 --> 00:00:01,000", text=text) + + +@dataclass +class _StubCfg: + source_lang: str = "English" + target_lang: str = "French" + model: str | None = None + api_url: str = "" + api_key: str = "" + review: bool = True + + def warn(self, msg: str) -> None: + pass + + +def _run(coro): + return asyncio.new_event_loop().run_until_complete(coro) + + +def _ctx_with_char() -> FileContext: + return FileContext( + characters=[CharacterHint("Alice", "Alice", "female")], + ) + + +def 
test_review_keeps_first_pass_when_block_count_mismatches(monkeypatch): + batch = [_block(1, "Hello Alice."), _block(2, "World.")] + first_pass = [_block(1, "Bonjour."), _block(2, "Monde.")] + + async def fake_call(*a, **k): + return "1\nsingle" # only 1 block; mismatched count + + monkeypatch.setattr(br, "call_chat_api", fake_call) + out = _run(br._review_pass(None, batch, first_pass, _StubCfg(), _ctx_with_char())) + assert out is first_pass + + +def test_review_accepts_valid_revision(monkeypatch): + batch = [_block(1, "Hello Alice."), _block(2, "World.")] + first_pass = [_block(1, "Bonjour."), _block(2, "Monde.")] + + async def fake_call(*a, **k): + return "1\nSalut.\n\n2\nMonde." + + monkeypatch.setattr(br, "call_chat_api", fake_call) + out = _run(br._review_pass(None, batch, first_pass, _StubCfg(), _ctx_with_char())) + assert [b.text for b in out] == ["Salut.", "Monde."] + assert [b.number for b in out] == [1, 2] + assert [b.timestamp for b in out] == [batch[0].timestamp, batch[1].timestamp] + + +def test_review_skips_when_no_glossary_and_never_calls_api(monkeypatch): + # Without scene/character guidance, there's no principled reason to touch + # the first-pass — the review must not fire at all. + batch = [_block(1, "Hi.")] + first_pass = [_block(1, "Salut.")] + calls = {"n": 0} + + async def fake_call(*a, **k): + calls["n"] += 1 + return "1\nX." 
+ + monkeypatch.setattr(br, "call_chat_api", fake_call) + out = _run(br._review_pass(None, batch, first_pass, _StubCfg(), None)) + assert out is first_pass + assert calls["n"] == 0 + + +def test_review_keeps_first_pass_on_api_error(monkeypatch): + batch = [_block(1, "Hi Alice.")] + first_pass = [_block(1, "Salut.")] + + async def boom(*a, **k): + raise RuntimeError("network down") + + monkeypatch.setattr(br, "call_chat_api", boom) + out = _run(br._review_pass(None, batch, first_pass, _StubCfg(), _ctx_with_char())) + assert out is first_pass diff --git a/cli/translora.py b/cli/translora.py index 96a30ec..2fb387a 100644 --- a/cli/translora.py +++ b/cli/translora.py @@ -76,6 +76,23 @@ def _build_parser() -> argparse.ArgumentParser: help="Show retry/validation warnings (hidden by default)") p.add_argument("--output", "-o", type=Path, default=None, help="Output file path (single file only)") + p.add_argument("--scan-budget", type=int, default=24_000, metavar="CHARS", + help="Character budget for the prepass scan (default: 24000). " + "Tuned for best-quality scans on typical TV episodes; " + "lower on tight-context local models (~8k window), " + "raise on large-context cloud models for full-file scans.") + p.add_argument("--context-overlap", type=int, default=2, metavar="N", + help="Source blocks from the previous batch shown as read-only " + "context (default: 2). Helps maintain speaker continuity " + "across batch boundaries. 
Set to 0 to disable.") + p.add_argument("--no-refine-attribution", dest="refine_attribution", + action="store_false", default=True, + help="Disable per-block speaker attribution for mixed-gender " + "scenes (saves one small LLM call per ambiguous scene).") + p.add_argument("--no-review", dest="review", + action="store_false", default=True, + help="Disable the post-edit review pass (one extra call per " + "batch that fixes gender/number/consistency slips).") return p @@ -148,6 +165,10 @@ async def _translate_all(args, jobs: list[Job]) -> tuple[int, list[tuple[Path, s batch_size=args.batch_size, concurrency=args.concurrency, max_retries=args.max_retries, + scan_char_budget=args.scan_budget, + context_overlap=args.context_overlap, + refine_attribution=args.refine_attribution, + review=args.review, quiet=multi_file, verbose=args.verbose, ) From a59318bf48a128d340bf4161dc4d5192994fdeb0 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Sat, 25 Apr 2026 23:16:48 +0300 Subject: [PATCH 3/6] Doc: Add options for optimized translation - Introduces new flags for scan budget and context overlap. - Allows disabling review and speaker attribution for performance. - Enhances translation quality with default settings. - Provides flexibility for metered cloud and tight-context local models. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index a417254..1605784 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,12 @@ Frequently used flags: | `--force` | Re-translate even if the output exists | | `-v, --verbose` | Show retry/validation warnings (hidden by default) | | `-o, --output` | Output path (single file only) | +| `--scan-budget` | Chars sent to the prepass scan (default **24000**). Lower on tight-context local models (~8k window); raise on large-context cloud models for full-file scans. | +| `--context-overlap` | Previous-batch source blocks shown as read-only context (default **2**, `0` to disable). 
Helps speaker continuity across batch boundaries. | +| `--no-review` | Disable the post-edit review pass. Saves one extra LLM call per batch — useful on metered providers. | +| `--no-refine-attribution` | Disable per-block speaker attribution for mixed-gender scenes (saves one small call per ambiguous scene). | + +The defaults are tuned for best translation quality. On metered cloud providers you can pass `--no-review` and/or `--no-refine-attribution` to cut LLM calls. On tight-context local models, lower `--scan-budget` (e.g. `8000`) so the scan prompt fits. Set `NO_COLOR=1` to disable ANSI colors; output auto-falls back to plain lines when piped. From d3c9a83f77c5c43efdea8ed167afa8567b2e41ef Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Sat, 25 Apr 2026 23:32:22 +0300 Subject: [PATCH 4/6] Web: Refactor constants - Extracted translation defaults and tuning constants to a separate file for better organization. - Enhanced attribution user message construction for improved scene handling. - Enabled a flexible translation prompt system, facilitating iterative improvements. - Optimized the translation process by restructuring imports and refining functions. 
--- web/src/app/app.component.ts | 12 ++-- web/src/app/core/constants.ts | 26 +++++++ web/src/app/core/context-pass.ts | 59 +--------------- web/src/app/core/translation-prompt.ts | 49 ++++++++++++++ web/src/app/core/translation.service.ts | 90 +++++++++++++++---------- 5 files changed, 138 insertions(+), 98 deletions(-) create mode 100644 web/src/app/core/constants.ts diff --git a/web/src/app/app.component.ts b/web/src/app/app.component.ts index e0e890b..16eb52a 100644 --- a/web/src/app/app.component.ts +++ b/web/src/app/app.component.ts @@ -4,16 +4,18 @@ import JSZip from 'jszip'; import { TranslationService, ProviderConfig, - DEFAULT_MAX_RETRIES, + TranslationCancelledError, +} from './core/translation.service'; +import { DEFAULT_BATCH_SIZE, DEFAULT_CONCURRENCY, - DEFAULT_PARALLEL_FILES, DEFAULT_CONTEXT_OVERLAP, - DEFAULT_SCAN_BUDGET, + DEFAULT_MAX_RETRIES, + DEFAULT_PARALLEL_FILES, DEFAULT_REFINE_ATTRIBUTION, DEFAULT_REVIEW, - TranslationCancelledError, -} from './core/translation.service'; + DEFAULT_SCAN_BUDGET, +} from './core/constants'; import { parseSubtitle } from './core/subtitle-formats'; import { LANGUAGES } from './core/languages'; import { PROVIDER_PRESETS, PROVIDER_KEYS } from './core/providers'; diff --git a/web/src/app/core/constants.ts b/web/src/app/core/constants.ts new file mode 100644 index 0000000..ac851a0 --- /dev/null +++ b/web/src/app/core/constants.ts @@ -0,0 +1,26 @@ +// Public defaults and tuning constants shared across web modules. + +// === Translation defaults === +export const DEFAULT_MAX_RETRIES = 5; +export const DEFAULT_BATCH_SIZE = 10; +export const DEFAULT_CONCURRENCY = 5; +export const DEFAULT_PARALLEL_FILES = 1; +export const DEFAULT_CONTEXT_OVERLAP = 2; +export const DEFAULT_REVIEW = true; +export const DEFAULT_REFINE_ATTRIBUTION = true; +// Sized for full-quality scans on typical TV episodes; lower on tight-context +// local models (~8k window), raise on large-context cloud models. 
+export const DEFAULT_SCAN_BUDGET = 24_000; + +// === Prepass / attribution scan === +export const SCAN_MAX_TOKENS = 3000; +// 2-char names collide with common target-language words. +export const MIN_NAME_LEN = 3; +// Scenes shorter than this many blocks never need per-block speaker attribution. +export const ATTRIB_MIN_BLOCKS = 3; + +// === Batch retry/split === +export const ATTEMPTS_BEFORE_SPLIT = 2; + +// === HTTP === +export const CRED_QUERY_PARAMS = ['key', 'api_key', 'apikey', 'access_token'] as const; diff --git a/web/src/app/core/context-pass.ts b/web/src/app/core/context-pass.ts index a881012..4525947 100644 --- a/web/src/app/core/context-pass.ts +++ b/web/src/app/core/context-pass.ts @@ -1,45 +1,9 @@ // One-shot prepass: scans the file once for cast/terms/scenes/register so every // batch shares the same glossary. Fails silently to an empty FileContext. +import { ATTRIB_MIN_BLOCKS, MIN_NAME_LEN } from './constants'; import { SubtitleBlock } from './srt-parser'; -export const CONTEXT_SYSTEM_PROMPT = `You analyze a subtitle file before it is translated. Return a compact glossary for the translator to use when picking correct pronouns, consistent names, and a single consistent register. - -Input blocks are prefixed with their block number as \`[N] text\`. - -Reply with all five sections below in this exact order. No commentary, no fences — tags only. - - -ONE LINE describing the target-language variant and formality. - - -NAME => TARGET_NAME | GENDER - - -SOURCE => TARGET - - -START-END => description that NAMES the characters involved - - -- NOTE - - -Rules: -- : name the exact target variant (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Japanese, polite です/ます form"). Pick one for the whole file. -- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal. -- TARGET_NAME is how the character's name should appear in the target language. -- : every ≥3-block stretch of dialogue between named characters. 
Name the characters explicitly using the names from so the translator can apply the right gender per range. Ranges may touch but must not overlap. -- Example: \`105-119 => Maria reassures Alex about the interview\` (use the actual names from YOUR section). -- Include up to 20 characters, 10 terms, 40 scenes, 4 notes. -- Leave a section empty (tags only) if nothing qualifies. Never omit a section.`; - -export const ATTRIBUTION_SYSTEM_PROMPT = `You identify the speaker of each subtitle line in a short scene. Given a character list and a block-numbered scene excerpt (\`[N] text\`), reply with exactly one line per input block as \`N=SpeakerName\`. SpeakerName MUST be one of the listed characters or the literal "unknown". No commentary, no fences.`; - -export const SCAN_MAX_TOKENS = 3000; -const MIN_NAME_LEN = 3; -const ATTRIB_MIN_BLOCKS = 3; - export type Gender = 'male' | 'female' | 'unknown'; export interface CharacterHint { @@ -119,7 +83,7 @@ function scenesOverlapping(scenes: SceneHint[], batch: SubtitleBlock[]): SceneHi return scenes.filter((s) => s.end >= first && s.start <= last); } -function genderMark(g: Gender | undefined): string { +export function genderMark(g: Gender | undefined): string { return g === 'male' ? 'M' : g === 'female' ? 
'F' : ''; } @@ -335,25 +299,6 @@ export function needsAttribution(scene: SceneHint): boolean { return (scene.end - scene.start + 1) >= ATTRIB_MIN_BLOCKS && scene.participants.length >= 1; } -export function buildAttributionUserMessage( - scene: SceneHint, - blocks: SubtitleBlock[], - characters: CharacterHint[], -): string { - const byNum = new Map(blocks.map((b) => [b.number, b])); - const present = new Set(scene.participants); - const roster = characters - .filter((h) => present.has(h.source)) - .map((h) => `- ${h.source} (${genderMark(h.gender) || '?'})`) - .join('\n'); - const sceneLines: string[] = []; - for (let n = scene.start; n <= scene.end; n++) { - const b = byNum.get(n); - if (b) sceneLines.push(`[${n}] ${b.text.replace(/\n/g, ' ')}`); - } - return `Characters:\n${roster}\n\nScene:\n${sceneLines.join('\n')}`; -} - export function parseAttributionResponse( raw: string, scene: SceneHint, characters: CharacterHint[], ): Record { diff --git a/web/src/app/core/translation-prompt.ts b/web/src/app/core/translation-prompt.ts index b762c5f..f10fd25 100644 --- a/web/src/app/core/translation-prompt.ts +++ b/web/src/app/core/translation-prompt.ts @@ -1,3 +1,6 @@ +// All LLM-facing prompts and user-message builders, kept in one place so they +// can be iterated on alongside their counterparts. + import { SubtitleBlock, serializeLite } from './srt-parser'; export const SYSTEM_PROMPT = `You are a subtitle translator. You will receive numbered subtitle blocks (no timestamps) and translate them. @@ -39,6 +42,39 @@ If uncertain, keep the block verbatim. Do NOT rephrase, restyle, or "polish". Sa Output: same wire format, one blank line between blocks. ALL blocks. No commentary, no fences.`; +export const CONTEXT_SYSTEM_PROMPT = `You analyze a subtitle file before it is translated. Return a compact glossary for the translator to use when picking correct pronouns, consistent names, and a single consistent register. 
+ +Input blocks are prefixed with their block number as \`[N] text\`. + +Reply with all five sections below in this exact order. No commentary, no fences — tags only. + + +ONE LINE describing the target-language variant and formality. + + +NAME => TARGET_NAME | GENDER + + +SOURCE => TARGET + + +START-END => description that NAMES the characters involved + + +- NOTE + + +Rules: +- : name the exact target variant (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Japanese, polite です/ます form"). Pick one for the whole file. +- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal. +- TARGET_NAME is how the character's name should appear in the target language. +- : every ≥3-block stretch of dialogue between named characters. Name the characters explicitly using the names from so the translator can apply the right gender per range. Ranges may touch but must not overlap. +- Example: \`105-119 => Maria reassures Alex about the interview\` (use the actual names from YOUR section). +- Include up to 20 characters, 10 terms, 40 scenes, 4 notes. +- Leave a section empty (tags only) if nothing qualifies. Never omit a section.`; + +export const ATTRIBUTION_SYSTEM_PROMPT = `You identify the speaker of each subtitle line in a short scene. Given a character list and a block-numbered scene excerpt (\`[N] text\`), reply with exactly one line per input block as \`N=SpeakerName\`. SpeakerName MUST be one of the listed characters or the literal "unknown". No commentary, no fences.`; + export function buildUserMessage( sourceLang: string, targetLang: string, @@ -75,3 +111,16 @@ export function buildReviewUserMessage( 'Output the corrected translation (same wire format):' ); } + +export function buildScanUserMessage( + sourceLang: string, + targetLang: string, + scanText: string, +): string { + const sourceLine = sourceLang ? 
`Source language: ${sourceLang}\n` : ''; + return `${sourceLine}Target language: ${targetLang}\n\n${scanText}`; +} + +export function buildAttributionUserMessage(roster: string, sceneLines: string[]): string { + return `Characters:\n${roster}\n\nScene:\n${sceneLines.join('\n')}`; +} diff --git a/web/src/app/core/translation.service.ts b/web/src/app/core/translation.service.ts index 70b5d29..aee6b2e 100644 --- a/web/src/app/core/translation.service.ts +++ b/web/src/app/core/translation.service.ts @@ -2,31 +2,46 @@ import { Injectable } from '@angular/core'; import { HttpClient, HttpErrorResponse } from '@angular/common/http'; import { Subscription } from 'rxjs'; import { - SubtitleBlock, - parseLite, - serializeLite, - splitBatches, - validateBatch, -} from './srt-parser'; -import { SubtitleDocument } from './subtitle-formats/types'; -import { - REVIEW_SYSTEM_PROMPT, - SYSTEM_PROMPT, - buildReviewUserMessage, - buildUserMessage, -} from './translation-prompt'; + ATTEMPTS_BEFORE_SPLIT, + CRED_QUERY_PARAMS, + DEFAULT_BATCH_SIZE, + DEFAULT_CONCURRENCY, + DEFAULT_CONTEXT_OVERLAP, + DEFAULT_MAX_RETRIES, + DEFAULT_REFINE_ATTRIBUTION, + DEFAULT_REVIEW, + DEFAULT_SCAN_BUDGET, + SCAN_MAX_TOKENS, +} from './constants'; import { - ATTRIBUTION_SYSTEM_PROMPT, - CONTEXT_SYSTEM_PROMPT, FileContext, - SCAN_MAX_TOKENS, - buildAttributionUserMessage, enrichScenesWithBlockText, + genderMark, needsAttribution, parseAttributionResponse, parseContextResponse, serializeForScan, + type CharacterHint, + type SceneHint, } from './context-pass'; +import { + ATTRIBUTION_SYSTEM_PROMPT, + CONTEXT_SYSTEM_PROMPT, + REVIEW_SYSTEM_PROMPT, + SYSTEM_PROMPT, + buildAttributionUserMessage, + buildReviewUserMessage, + buildScanUserMessage, + buildUserMessage, +} from './translation-prompt'; +import { + SubtitleBlock, + parseLite, + serializeLite, + splitBatches, + validateBatch, +} from './srt-parser'; +import { SubtitleDocument } from './subtitle-formats/types'; export interface ProviderConfig { apiUrl: 
string; @@ -46,15 +61,6 @@ export class TranslationCancelledError extends Error { } } -export const DEFAULT_MAX_RETRIES = 5; -export const DEFAULT_BATCH_SIZE = 10; -export const DEFAULT_CONCURRENCY = 5; -export const DEFAULT_PARALLEL_FILES = 1; -export const DEFAULT_CONTEXT_OVERLAP = 2; -export const DEFAULT_REVIEW = true; -export const DEFAULT_REFINE_ATTRIBUTION = true; -export const DEFAULT_SCAN_BUDGET = 24_000; - export interface QualityOptions { contextOverlap?: number; scanBudget?: number; @@ -62,8 +68,6 @@ export interface QualityOptions { review?: boolean; } -const ATTEMPTS_BEFORE_SPLIT = 2; - type ChatResponse = { choices: { message: { content: string } }[] }; @Injectable({ providedIn: 'root' }) @@ -146,12 +150,9 @@ export class TranslationService { scanBudget: number, cancelSignal?: AbortSignal, ): Promise { - const sourceLine = sourceLang ? `Source language: ${sourceLang}\n` : ''; - const userMessage = - sourceLine + - `Target language: ${targetLang}\n\n` + - serializeForScan(blocks, scanBudget); - + const userMessage = buildScanUserMessage( + sourceLang, targetLang, serializeForScan(blocks, scanBudget), + ); try { const raw = await this.callChat( CONTEXT_SYSTEM_PROMPT, userMessage, provider, SCAN_MAX_TOKENS, cancelSignal, @@ -176,6 +177,7 @@ export class TranslationService { const targets = ctx.scenes.filter(needsAttribution); if (!targets.length) return; + const byNum = new Map(blocks.map((b) => [b.number, b])); let nextIdx = 0; const worker = async () => { while (true) { @@ -184,7 +186,7 @@ export class TranslationService { if (i >= targets.length) return; const scene = targets[i]; try { - const userMsg = buildAttributionUserMessage(scene, blocks, ctx.characters); + const userMsg = buildSceneAttributionMessage(scene, byNum, ctx.characters); const raw = await this.callChat( ATTRIBUTION_SYSTEM_PROMPT, userMsg, provider, (scene.end - scene.start + 1) * 20 + 100, cancelSignal, @@ -437,7 +439,23 @@ export class TranslationService { } -const 
CRED_QUERY_PARAMS = ['key', 'api_key', 'apikey', 'access_token']; +function buildSceneAttributionMessage( + scene: SceneHint, + byNum: Map<number, SubtitleBlock>, + characters: CharacterHint[], +): string { + const present = new Set(scene.participants); + const roster = characters + .filter((h) => present.has(h.source)) + .map((h) => `- ${h.source} (${genderMark(h.gender) || '?'})`) + .join('\n'); + const sceneLines: string[] = []; + for (let n = scene.start; n <= scene.end; n++) { + const b = byNum.get(n); + if (b) sceneLines.push(`[${n}] ${b.text.replace(/\n/g, ' ')}`); + } + return buildAttributionUserMessage(roster, sceneLines); +} // We authenticate via header, so strip credential query params before sending. function sanitizeApiUrl(url: string): string { From f36399e4091f3fde096e678195d4305bd5e4c904 Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Sat, 25 Apr 2026 23:32:45 +0300 Subject: [PATCH 5/6] CLI: Refactor constants - Centralizes LLM-facing prompts for better management and iteration. - Improves translation and review user messages with builder functions. - Consolidates constants for cleaner code and shared access. - Optimizes retry logic by aligning attempt constants. - Enhances context handling, improving translation accuracy and quality. 
--- cli/core/batch_runner.py | 66 ++++++++----------------- cli/core/config.py | 20 ++++---- cli/core/constants.py | 24 +++++++++ cli/core/context_pass.py | 73 ++++++--------------------- cli/core/prompt.py | 103 ++++++++++++++++++++++++++++++++++++++- cli/core/translator.py | 3 +- cli/translora.py | 3 +- 7 files changed, 176 insertions(+), 116 deletions(-) create mode 100644 cli/core/constants.py diff --git a/cli/core/batch_runner.py b/cli/core/batch_runner.py index b83a590..a1e56a3 100644 --- a/cli/core/batch_runner.py +++ b/cli/core/batch_runner.py @@ -8,15 +8,20 @@ import httpx +from .config import TranslationConfig +from .constants import ( + ATTEMPTS_BEFORE_SPLIT, + CRED_QUERY_PARAMS, + REQUEST_TIMEOUT_SECS, +) from .context_pass import FileContext +from .prompt import ( + REVIEW_SYSTEM_PROMPT, + SYSTEM_PROMPT, + build_review_user_message, + build_translate_user_message, +) from .srt_parser import SubtitleBlock, parse_lite, serialize_lite, validate_batch -from .config import TranslationConfig -from .prompt import SYSTEM_PROMPT, REVIEW_SYSTEM_PROMPT - - -REQUEST_TIMEOUT_SECS = 120.0 - -_CRED_QUERY_PARAMS = {"key", "api_key", "apikey", "access_token"} class FileTranslationError(Exception): @@ -31,7 +36,7 @@ def sanitize_api_url(url: str) -> str: try: parts = urlsplit(url) kept = [(k, v) for k, v in parse_qsl(parts.query, keep_blank_values=True) - if k.lower() not in _CRED_QUERY_PARAMS] + if k.lower() not in CRED_QUERY_PARAMS] return urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(kept), parts.fragment)) except Exception: @@ -90,37 +95,6 @@ async def call_chat_api( return resp.json()["choices"][0]["message"]["content"] -def _build_user_message( - cfg: TranslationConfig, - batch_wire: str, - file_context: FileContext | None, - batch: list[SubtitleBlock], - prev_tail: list[SubtitleBlock] | None = None, -) -> str: - header = ( - f"Translate from {cfg.source_lang} to {cfg.target_lang}:" - if cfg.source_lang else f"Translate to {cfg.target_lang}:" - ) - 
sections: list[str] = [] - if file_context is not None: - glossary = file_context.render_for_batch(batch) - if glossary: - sections.append(f"Glossary for this scene:\n{glossary}") - if prev_tail: - # Non-numbered so the parser can't confuse these with real input blocks. - prev_lines = "\n".join( - f" [prev #{b.number}] {b.text.replace(chr(10), ' ')}" for b in prev_tail - ) - sections.append( - "Previous context (read-only, do NOT translate or output):\n" + prev_lines - ) - sections.append(f"{header}\n\n{batch_wire}") - return "\n\n".join(sections) - - -_ATTEMPTS_BEFORE_SPLIT = 2 - - async def _review_pass( client: httpx.AsyncClient, batch: list[SubtitleBlock], @@ -133,12 +107,7 @@ async def _review_pass( glossary = file_context.render_for_batch(batch) if file_context else "" if not glossary: return first_pass - user_msg = ( - f"Glossary:\n{glossary}\n\n" - f"Source blocks:\n{serialize_lite(batch)}\n\n" - f"First-pass translation:\n{serialize_lite(first_pass)}\n\n" - "Output the corrected translation (same wire format):" - ) + user_msg = build_review_user_message(batch, first_pass, glossary) try: raw = await call_chat_api( client, REVIEW_SYSTEM_PROMPT, user_msg, cfg, max(len(batch), 1) * 120) @@ -173,12 +142,15 @@ async def translate_batch_with_retry( because at N=1 a count mismatch is impossible. 
""" batch_wire = serialize_lite(batch) - user_msg = _build_user_message(cfg, batch_wire, file_context, batch, prev_tail) + glossary = file_context.render_for_batch(batch) if file_context else "" + user_msg = build_translate_user_message( + cfg.source_lang, cfg.target_lang, batch_wire, glossary, prev_tail or [], + ) label = f"Batch {batch_idx + 1}" + (f".{_split_path}" if _split_path else "") first_block = batch[0].number can_split = len(batch) > 1 - attempts = _ATTEMPTS_BEFORE_SPLIT if can_split else cfg.max_retries + attempts = ATTEMPTS_BEFORE_SPLIT if can_split else cfg.max_retries hit_validation_failure = False for attempt in range(1, attempts + 1): diff --git a/cli/core/config.py b/cli/core/config.py index 93f7305..64badc0 100644 --- a/cli/core/config.py +++ b/cli/core/config.py @@ -4,8 +4,13 @@ from dataclasses import dataclass, field from typing import Callable - -DEFAULT_MAX_RETRIES = 5 +from .constants import ( + DEFAULT_BATCH_SIZE, + DEFAULT_CONCURRENCY, + DEFAULT_CONTEXT_OVERLAP, + DEFAULT_MAX_RETRIES, + DEFAULT_SCAN_CHAR_BUDGET, +) def _silent_warn(msg: str) -> None: @@ -25,14 +30,11 @@ class TranslationConfig: api_url: str api_key: str model: str | None = None - batch_size: int = 10 - concurrency: int = 1 + batch_size: int = DEFAULT_BATCH_SIZE + concurrency: int = DEFAULT_CONCURRENCY max_retries: int = DEFAULT_MAX_RETRIES - # Prepass scan budget (chars). Sized for full-quality scans on typical - # TV episodes; lower on tight-context local models (~8k window). - scan_char_budget: int = 24_000 - # Previous-batch source blocks shown as read-only context; 0 disables. - context_overlap: int = 2 + scan_char_budget: int = DEFAULT_SCAN_CHAR_BUDGET + context_overlap: int = DEFAULT_CONTEXT_OVERLAP # One small LLM call per ambiguous scene; fixes cross-gender addressee slips. refine_attribution: bool = True # One extra call per batch; fixes gender/number/consistency slips. Doubles cost. 
diff --git a/cli/core/constants.py b/cli/core/constants.py new file mode 100644 index 0000000..2cffc47 --- /dev/null +++ b/cli/core/constants.py @@ -0,0 +1,24 @@ +"""Public defaults and tuning constants shared across CLI modules.""" + +# === Translation defaults (mirrored in TranslationConfig field defaults) === +DEFAULT_BATCH_SIZE = 10 +DEFAULT_CONCURRENCY = 1 +DEFAULT_MAX_RETRIES = 5 +# Sized for full-quality scans on typical TV episodes; lower on tight-context +# local models (~8k window), raise on large-context cloud models. +DEFAULT_SCAN_CHAR_BUDGET = 24_000 +DEFAULT_CONTEXT_OVERLAP = 2 + +# === Prepass / attribution scan === +SCAN_MAX_TOKENS = 3000 +# 2-char names collide with common target-language words. +MIN_NAME_LEN = 3 +# Single-block scenes never need per-block speaker attribution. +ATTRIB_MIN_BLOCKS = 3 + +# === Batch retry/split === +ATTEMPTS_BEFORE_SPLIT = 2 + +# === HTTP === +REQUEST_TIMEOUT_SECS = 120.0 +CRED_QUERY_PARAMS = frozenset({"key", "api_key", "apikey", "access_token"}) diff --git a/cli/core/context_pass.py b/cli/core/context_pass.py index 30b560d..bc093e8 100644 --- a/cli/core/context_pass.py +++ b/cli/core/context_pass.py @@ -9,56 +9,16 @@ import httpx from .config import TranslationConfig +from .constants import ATTRIB_MIN_BLOCKS, MIN_NAME_LEN, SCAN_MAX_TOKENS +from .prompt import ( + ATTRIBUTION_SYSTEM_PROMPT, + CONTEXT_SYSTEM_PROMPT, + build_attribution_user_message, + build_scan_user_message, +) from .srt_parser import SubtitleBlock -CONTEXT_SYSTEM_PROMPT = """\ -You analyze a subtitle file before it is translated. Return a compact glossary -for the translator to use when picking correct pronouns, consistent names, and -a single consistent register. - -Input blocks are prefixed with their block number as `[N] text`. - -Reply with all five sections below in this exact order. No commentary, no -fences — tags only. - - -ONE LINE describing the target-language variant and formality. 
- - -NAME => TARGET_NAME | GENDER - - -SOURCE => TARGET - - -START-END => description that NAMES the characters involved - - -- NOTE - - -Rules: -- : name the exact target variant (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Japanese, polite です/ます form"). Pick one for the whole file. -- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal. -- TARGET_NAME is how the character's name should appear in the target language. -- : every ≥3-block stretch of dialogue between named characters. Name the characters explicitly using the names from so the translator can apply the right gender per range. Ranges may touch but must not overlap. -- Example: `105-119 => Maria reassures Alex about the interview` (use the actual names from YOUR section). -- Include up to 20 characters, 10 terms, 40 scenes, 4 notes. -- Leave a section empty (tags only) if nothing qualifies. Never omit a section.\ -""" - -_ATTRIBUTION_SYSTEM_PROMPT = """\ -You identify the speaker of each subtitle line in a short scene. Given a -character list and a block-numbered scene excerpt (`[N] text`), reply with -exactly one line per input block as `N=SpeakerName`. SpeakerName MUST be one -of the listed characters or the literal "unknown". 
No commentary, no fences.\ -""" - -_SCAN_MAX_TOKENS = 3000 -_MIN_NAME_LEN = 3 -_ATTRIB_MIN_BLOCKS = 3 - _SECTION_RE = re.compile( r"<(?Pregister|characters|terms|scenes|notes)>\s*" r"(?P.*?)\s*" @@ -200,9 +160,9 @@ def _detect_participants( descriptions often slip into the target language.""" aliases: list[tuple[str, str]] = [] # (alias, source_name) for h in characters: - if len(h.source) >= _MIN_NAME_LEN: + if len(h.source) >= MIN_NAME_LEN: aliases.append((h.source, h.source)) - if h.target != h.source and len(h.target) >= _MIN_NAME_LEN: + if h.target != h.source and len(h.target) >= MIN_NAME_LEN: aliases.append((h.target, h.source)) aliases.sort(key=lambda a: len(a[0]), reverse=True) @@ -309,15 +269,14 @@ async def extract_file_context( """Run one scan call. Returns the parsed+enriched context.""" from .batch_runner import call_chat_api, strip_markdown_fences - source_line = f"Source language: {cfg.source_lang}\n" if cfg.source_lang else "" - user_msg = ( - f"{source_line}Target language: {cfg.target_lang}\n\n" - f"{serialize_for_scan(blocks, cfg.scan_char_budget)}" + user_msg = build_scan_user_message( + cfg.source_lang, cfg.target_lang, + serialize_for_scan(blocks, cfg.scan_char_budget), ) try: raw = await call_chat_api( client, CONTEXT_SYSTEM_PROMPT, user_msg, cfg, - max_tokens=_SCAN_MAX_TOKENS, + max_tokens=SCAN_MAX_TOKENS, ) except Exception as e: cfg.warn(f" Context scan failed, proceeding without: {e}") @@ -358,7 +317,7 @@ def enrich_scenes_with_block_text( def _needs_attribution(scene: SceneHint, gender_by: dict[str, str]) -> bool: - return (scene.end - scene.start + 1 >= _ATTRIB_MIN_BLOCKS + return (scene.end - scene.start + 1 >= ATTRIB_MIN_BLOCKS and len(scene.participants) >= 1) @@ -381,10 +340,10 @@ async def _attribute_scene( ] if not block_lines or not roster: return {} - user_msg = f"Characters:\n{roster}\n\nScene:\n" + "\n".join(block_lines) + user_msg = build_attribution_user_message(roster, block_lines) try: raw = await call_chat_api( - client, 
_ATTRIBUTION_SYSTEM_PROMPT, user_msg, cfg, + client, ATTRIBUTION_SYSTEM_PROMPT, user_msg, cfg, max_tokens=len(block_lines) * 20 + 100, ) except Exception as e: diff --git a/cli/core/prompt.py b/cli/core/prompt.py index 4888500..eaba1fe 100644 --- a/cli/core/prompt.py +++ b/cli/core/prompt.py @@ -1,4 +1,10 @@ -"""The translation prompts, kept in one place so they can be iterated on.""" +"""All LLM-facing prompts and user-message builders, kept in one place so they +can be iterated on and reviewed alongside their counterparts.""" + +from __future__ import annotations + +from .srt_parser import SubtitleBlock, serialize_lite + SYSTEM_PROMPT = """\ You are a subtitle translator. You will receive numbered subtitle blocks (no timestamps) and translate them. @@ -43,3 +49,98 @@ Output: same wire format, one blank line between blocks. ALL blocks. No commentary, no fences.\ """ + + +CONTEXT_SYSTEM_PROMPT = """\ +You analyze a subtitle file before it is translated. Return a compact glossary +for the translator to use when picking correct pronouns, consistent names, and +a single consistent register. + +Input blocks are prefixed with their block number as `[N] text`. + +Reply with all five sections below in this exact order. No commentary, no +fences — tags only. + + +ONE LINE describing the target-language variant and formality. + + +NAME => TARGET_NAME | GENDER + + +SOURCE => TARGET + + +START-END => description that NAMES the characters involved + + +- NOTE + + +Rules: +- : name the exact target variant (e.g. "Modern Standard Arabic, neutral", "Brazilian Portuguese, casual", "Japanese, polite です/ます form"). Pick one for the whole file. +- GENDER is "male", "female", or "unknown". Use "unknown" only when the text gives no signal. +- TARGET_NAME is how the character's name should appear in the target language. +- : every ≥3-block stretch of dialogue between named characters. 
Name the characters explicitly using the names from so the translator can apply the right gender per range. Ranges may touch but must not overlap. +- Example: `105-119 => Maria reassures Alex about the interview` (use the actual names from YOUR section). +- Include up to 20 characters, 10 terms, 40 scenes, 4 notes. +- Leave a section empty (tags only) if nothing qualifies. Never omit a section.\ +""" + + +ATTRIBUTION_SYSTEM_PROMPT = """\ +You identify the speaker of each subtitle line in a short scene. Given a +character list and a block-numbered scene excerpt (`[N] text`), reply with +exactly one line per input block as `N=SpeakerName`. SpeakerName MUST be one +of the listed characters or the literal "unknown". No commentary, no fences.\ +""" + + +def build_translate_user_message( + source_lang: str, + target_lang: str, + batch_wire: str, + glossary: str, + prev_tail: list[SubtitleBlock], +) -> str: + header = ( + f"Translate from {source_lang} to {target_lang}:" + if source_lang else f"Translate to {target_lang}:" + ) + sections: list[str] = [] + if glossary: + sections.append(f"Glossary for this scene:\n{glossary}") + if prev_tail: + # Non-numbered so the parser can't confuse these with real input blocks. 
+ prev_lines = "\n".join( + f" [prev #{b.number}] {b.text.replace(chr(10), ' ')}" for b in prev_tail + ) + sections.append( + "Previous context (read-only, do NOT translate or output):\n" + prev_lines + ) + sections.append(f"{header}\n\n{batch_wire}") + return "\n\n".join(sections) + + +def build_review_user_message( + batch: list[SubtitleBlock], + first_pass: list[SubtitleBlock], + glossary: str, +) -> str: + return ( + f"Glossary:\n{glossary}\n\n" + f"Source blocks:\n{serialize_lite(batch)}\n\n" + f"First-pass translation:\n{serialize_lite(first_pass)}\n\n" + "Output the corrected translation (same wire format):" + ) + + +def build_scan_user_message( + source_lang: str, target_lang: str, scan_text: str, +) -> str: + source_line = f"Source language: {source_lang}\n" if source_lang else "" + return f"{source_line}Target language: {target_lang}\n\n{scan_text}" + + +def build_attribution_user_message(roster: str, block_lines: list[str]) -> str: + return f"Characters:\n{roster}\n\nScene:\n" + "\n".join(block_lines) diff --git a/cli/core/translator.py b/cli/core/translator.py index 039c5c4..5225e25 100644 --- a/cli/core/translator.py +++ b/cli/core/translator.py @@ -11,7 +11,8 @@ from .srt_parser import SubtitleBlock, split_batches from .formats import parse_subtitle -from .config import DEFAULT_MAX_RETRIES, TranslationConfig +from .config import TranslationConfig +from .constants import DEFAULT_MAX_RETRIES from .batch_runner import FileTranslationError, translate_batch_with_retry from .context_pass import FileContext, extract_file_context, refine_scene_attribution from .time_tracker import EtaEstimator, format_duration diff --git a/cli/translora.py b/cli/translora.py index 2fb387a..df801c9 100644 --- a/cli/translora.py +++ b/cli/translora.py @@ -8,7 +8,8 @@ from dataclasses import dataclass from pathlib import Path -from core.config import DEFAULT_MAX_RETRIES, TranslationConfig, _stderr_warn +from core.config import TranslationConfig, _stderr_warn +from core.constants 
import DEFAULT_MAX_RETRIES from core.batch_runner import FileTranslationError from core.time_tracker import format_duration from core.lang_codes import lang_code From 0f4bcffc20f3aa52b68cce4f7ccdf26578520d2c Mon Sep 17 00:00:00 2001 From: Sulaiman AlRomaih Date: Sat, 25 Apr 2026 23:42:10 +0300 Subject: [PATCH 6/6] Web: Remove sticky positioning from nav - Simplifies navbar by removing sticky positioning. --- web/src/app/app.component.scss | 3 --- 1 file changed, 3 deletions(-) diff --git a/web/src/app/app.component.scss b/web/src/app/app.component.scss index 2b92182..94f2f6f 100644 --- a/web/src/app/app.component.scss +++ b/web/src/app/app.component.scss @@ -3,9 +3,6 @@ } .nav { - position: sticky; - top: 0; - z-index: 20; padding: 0.8rem 1rem 0; }