From 03cf6d83112e6a9370a15eb44f7845c8d9bc8715 Mon Sep 17 00:00:00 2001 From: Ole Magnus Fon Johnsen Date: Mon, 27 Apr 2026 23:42:57 +0200 Subject: [PATCH 1/4] feat: video output --- .gitignore | 4 + README.md | 11 ++- main.py | 205 +++++++++++++++++++++++++++++++++++++++++++++-- requirements.txt | 2 + 4 files changed, 215 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 1f8663e..9953d5f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,9 @@ manuscript.txt episode.mp3 +episode.mp4 .env __pycache__/ *.pyc + +characters/ +background-videos/ diff --git a/README.md b/README.md index 7e86410..bbd88ba 100644 --- a/README.md +++ b/README.md @@ -35,4 +35,13 @@ python main.py --since "3 days ago" Du kan bruke flagget `--no-tts` for å bare generere manuscriptet. -Output lagres i `manuscript.txt`. +For å generere video, pass en bakgrunnsvideofil. Et tilfeldig segment av videoen brukes: + +```bash +python main.py --background background-videos/minecraft-parkour.mp4 +``` + +Output: +- `manuscript.txt` — dialogscriptet +- `episode.mp3` — lydsporet +- `episode.mp4` — videoen (kun om `--background` er gitt) diff --git a/main.py b/main.py index cc44523..016f024 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,9 @@ import re import sys from datetime import datetime, timedelta, timezone +from io import BytesIO +from pathlib import Path +from typing import Any import anthropic import requests @@ -22,6 +25,7 @@ OUTPUT_FILE = "manuscript.txt" AUDIO_OUTPUT_FILE = "episode.mp3" +VIDEO_OUTPUT_FILE = "episode.mp4" MODEL = "claude-haiku-4-5-20251001" PETER_VOICE = "d75c270eaee14c8aa1e9e980cc37cf1b" BRIAN_VOICE = "df7b23b4d67c4340be1170ae6cbc2913" @@ -32,6 +36,12 @@ "BRIAN": BRIAN_VOICE, "STEWIE": STEWIE_VOICE, } +VOICE_TO_CHAR = {v: k.lower() for k, v in VOICE_MAP.items()} + +CHARACTERS_DIR = Path(__file__).parent / "characters" +VIDEO_SIZE = (1080, 1920) +CHAR_WIDTH = 480 +TALKING_INTERVAL = 0.15 SYSTEM_PROMPT = """You are a comedy writer generating a Family Guy-style 
TV dialog script. @@ -96,7 +106,9 @@ def parse_since(since_str: str) -> datetime: ) -def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[dict]: +def fetch_commits( + owner: str, repo: str, since_iso: str, headers: dict[str, str] +) -> list[Any]: url = f"https://api.github.com/repos/{owner}/{repo}/commits" params = {"since": since_iso, "per_page": 100} resp = requests.get(url, headers=headers, params=params, timeout=20) @@ -114,7 +126,9 @@ def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[ return [c for c in commits if not c["commit"]["message"].startswith("Merge")] -def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict: +def fetch_commit_detail( + owner: str, repo: str, sha: str, headers: dict[str, str] +) -> Any: url = f"https://api.github.com/repos/{owner}/{repo}/commits/{sha}" resp = requests.get(url, headers=headers, timeout=20) if resp.status_code != 200: @@ -123,7 +137,7 @@ def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict: def build_repo_summary( - owner: str, repo: str, commits: list[dict], headers: dict + owner: str, repo: str, commits: list[Any], headers: dict[str, str] ) -> str: lines = [f"## {repo} ({len(commits)} commits)"] for commit in commits[:10]: @@ -185,20 +199,187 @@ def parse_manuscript(text: str) -> list[tuple[str, str]]: return result -def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> None: +def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> list[float]: + from mutagen.mp3 import MP3 + client = FishAudio(api_key=fish_key) configs = { voice_id: TTSConfig(reference_id=voice_id, format="mp3", latency="balanced") for voice_id in {v for v, _ in segments} } - audio_chunks = [] + audio_chunks: list[bytes] = [] + durations: list[float] = [] for i, (voice_id, dialog) in enumerate(segments, 1): print(f" TTS line {i}/{len(segments)}...") audio = client.tts.convert(text=dialog, config=configs[voice_id]) 
audio_chunks.append(audio) + durations.append(MP3(BytesIO(audio)).info.length) with open(AUDIO_OUTPUT_FILE, "wb") as f: for chunk in audio_chunks: f.write(chunk) + return durations + + +def generate_video( + segments: list[tuple[str, str]], + durations: list[float], + background_path: str, +) -> None: + import json + import random + import shutil + import subprocess + import tempfile + from collections import Counter + + from PIL import Image + + W, H = VIDEO_SIZE + total_duration = sum(durations) + n_segments = len(segments) + print(f" Duration: {total_duration:.1f}s across {n_segments} dialog lines") + + # Pick a random segment of the background video + probe = subprocess.run( + [ + "ffprobe", + "-v", + "quiet", + "-print_format", + "json", + "-show_format", + background_path, + ], + capture_output=True, + text=True, + check=True, + ) + bg_duration = float(json.loads(probe.stdout)["format"]["duration"]) + bg_start = random.uniform(0, max(0.0, bg_duration - total_duration)) + print(f" Using {Path(background_path).name} from {bg_start:.1f}s") + + tmp_dir = Path(tempfile.mkdtemp()) + try: + # Pre-render each character as an 8-frame looping MOV with alpha + # (4 frames talking + 4 frames still at 8 fps = 1s loop) + unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments}) + char_movs: dict[str, Path] = {} + print(" Rendering character animation loops...") + for char in unique_chars: + print(f" {char}") + still = Image.open(CHARACTERS_DIR / f"{char}_still.png").convert("RGBA") + talking = Image.open(CHARACTERS_DIR / f"{char}_talking.png").convert("RGBA") + char_h = int(CHAR_WIDTH * still.height / still.width) + still = still.resize((CHAR_WIDTH, char_h), Image.Resampling.LANCZOS) + talking = talking.resize((CHAR_WIDTH, char_h), Image.Resampling.LANCZOS) + for i, frame in enumerate([talking] * 4 + [still] * 4): + frame.save(tmp_dir / f"{char}_{i:04d}.png") + mov = tmp_dir / f"{char}.mov" + subprocess.run( + [ + "ffmpeg", + "-y", + "-framerate", + "8", + "-i", + 
str(tmp_dir / f"{char}_%04d.png"), + "-c:v", + "png", + "-pix_fmt", + "rgba", + str(mov), + ], + check=True, + capture_output=True, + ) + char_movs[char] = mov + + # Build ffmpeg filter complex + # Inputs: [0]=background, [1..N]=character MOVs, [N+1]=episode.mp3 + char_list = unique_chars + char_to_inp = {char: i + 1 for i, char in enumerate(char_list)} + audio_inp = len(char_list) + 1 + char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments) + + filter_parts: list[str] = [] + + # Scale background to output size + filter_parts.append(f"[0:v]scale={W}:{H}[bg]") + + # Loop each character MOV infinitely, scale to CHAR_WIDTH, then split into + # as many copies as that character has speaking segments + char_labels: dict[str, list[str]] = {} + for char in char_list: + inp = char_to_inp[char] + n = char_counts[char] + # loop=-1 = infinite, size=8 = 8-frame loop, start=0 + filter_parts.append( + f"[{inp}:v]loop=-1:size=8:start=0,scale={CHAR_WIDTH}:-2[{char}_s]" + ) + if n == 1: + char_labels[char] = [f"{char}_s"] + else: + labels = [f"{char}_{j}" for j in range(n)] + filter_parts.append( + f"[{char}_s]split={n}" + "".join(f"[{l}]" for l in labels) + ) + char_labels[char] = labels + + # Chain one overlay per segment with time-gated enable and wobble + char_usage: dict[str, int] = {char: 0 for char in char_list} + current = "bg" + t = 0.0 + for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)): + char = VOICE_TO_CHAR[voice_id] + j = char_usage[char] + char_usage[char] += 1 + label = char_labels[char][j] + t_end = t + duration + x = f"({W}-w)/2+8*sin(4*t)" + y = f"{H}-h-80+4*sin(5*t)" + enable = f"between(t,{t:.3f},{t_end:.3f})" + out = f"v{seg_idx}" + filter_parts.append( + f"[{current}][{label}]overlay=x='{x}':y='{y}':enable='{enable}'[{out}]" + ) + current = out + t = t_end + + filter_complex = ";".join(filter_parts) + + # Assemble and run ffmpeg with VideoToolbox hardware encoding + print(" Encoding (hardware accelerated)...") + 
cmd: list[str] = ["ffmpeg", "-y"] + cmd += [ + "-ss", + f"{bg_start:.3f}", + "-t", + f"{total_duration:.3f}", + "-i", + background_path, + ] + for char in char_list: + cmd += ["-i", str(char_movs[char])] + cmd += ["-i", AUDIO_OUTPUT_FILE] + cmd += [ + "-filter_complex", + filter_complex, + "-map", + f"[{current}]", + "-map", + f"{audio_inp}:a", + "-c:v", + "h264_videotoolbox", + "-c:a", + "aac", + "-t", + f"{total_duration:.3f}", + VIDEO_OUTPUT_FILE, + ] + subprocess.run(cmd, check=True) + + finally: + shutil.rmtree(tmp_dir, ignore_errors=True) def save_output(dialog: str) -> None: @@ -214,6 +395,11 @@ def main() -> None: "--since", default="1 week ago", help="How far back to look (e.g. '1 week ago')" ) parser.add_argument("--no-tts", action="store_true", help="Skip audio generation") + parser.add_argument( + "--background", + default=None, + help="Path to background video/image for the episode (skips video if omitted)", + ) args = parser.parse_args() github_token = os.environ.get("GITHUB_TOKEN") @@ -269,9 +455,16 @@ def main() -> None: if not args.no_tts: print("\nGenerating audio...") segments = parse_manuscript(dialog) - generate_audio(segments, fish_key) # pyright: ignore[reportArgumentType] + durations = generate_audio(segments, fish_key) # pyright: ignore[reportArgumentType] print(f"Audio saved to {AUDIO_OUTPUT_FILE}") + if args.background: + print("\nGenerating video...") + generate_video(segments, durations, args.background) + print(f"Video saved to {VIDEO_OUTPUT_FILE}") + else: + print("Tip: pass --background to also generate a video.") + if __name__ == "__main__": main() diff --git a/requirements.txt b/requirements.txt index ba1f998..c40eac0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,5 @@ anthropic>=0.50.0 requests>=2.31.0 python-dotenv>=1.0.0 fish-audio-sdk>=1.0.0 +mutagen>=1.47 +Pillow>=10.0 From 37233fdee8ad840adf7367785ac10d8918103618 Mon Sep 17 00:00:00 2001 From: Ole Magnus Fon Johnsen Date: Tue, 28 Apr 2026 12:53:01 +0200 
Subject: [PATCH 2/4] refactor: simplify video generation by using ffmpeg looping instead of pre-rendering --- main.py | 204 ++++++++++++++++++++++---------------------------------- 1 file changed, 80 insertions(+), 124 deletions(-) diff --git a/main.py b/main.py index 016f024..5e724fc 100644 --- a/main.py +++ b/main.py @@ -227,13 +227,9 @@ def generate_video( ) -> None: import json import random - import shutil import subprocess - import tempfile from collections import Counter - from PIL import Image - W, H = VIDEO_SIZE total_duration = sum(durations) n_segments = len(segments) @@ -258,128 +254,88 @@ def generate_video( bg_start = random.uniform(0, max(0.0, bg_duration - total_duration)) print(f" Using {Path(background_path).name} from {bg_start:.1f}s") - tmp_dir = Path(tempfile.mkdtemp()) - try: - # Pre-render each character as an 8-frame looping MOV with alpha - # (4 frames talking + 4 frames still at 8 fps = 1s loop) - unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments}) - char_movs: dict[str, Path] = {} - print(" Rendering character animation loops...") - for char in unique_chars: - print(f" {char}") - still = Image.open(CHARACTERS_DIR / f"{char}_still.png").convert("RGBA") - talking = Image.open(CHARACTERS_DIR / f"{char}_talking.png").convert("RGBA") - char_h = int(CHAR_WIDTH * still.height / still.width) - still = still.resize((CHAR_WIDTH, char_h), Image.Resampling.LANCZOS) - talking = talking.resize((CHAR_WIDTH, char_h), Image.Resampling.LANCZOS) - for i, frame in enumerate([talking] * 4 + [still] * 4): - frame.save(tmp_dir / f"{char}_{i:04d}.png") - mov = tmp_dir / f"{char}.mov" - subprocess.run( - [ - "ffmpeg", - "-y", - "-framerate", - "8", - "-i", - str(tmp_dir / f"{char}_%04d.png"), - "-c:v", - "png", - "-pix_fmt", - "rgba", - str(mov), - ], - check=True, - capture_output=True, - ) - char_movs[char] = mov - - # Build ffmpeg filter complex - # Inputs: [0]=background, [1..N]=character MOVs, [N+1]=episode.mp3 - char_list = unique_chars - 
char_to_inp = {char: i + 1 for i, char in enumerate(char_list)} - audio_inp = len(char_list) + 1 - char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments) - - filter_parts: list[str] = [] - - # Scale background to output size - filter_parts.append(f"[0:v]scale={W}:{H}[bg]") - - # Loop each character MOV infinitely, scale to CHAR_WIDTH, then split into - # as many copies as that character has speaking segments - char_labels: dict[str, list[str]] = {} - for char in char_list: - inp = char_to_inp[char] - n = char_counts[char] - # loop=-1 = infinite, size=8 = 8-frame loop, start=0 - filter_parts.append( - f"[{inp}:v]loop=-1:size=8:start=0,scale={CHAR_WIDTH}:-2[{char}_s]" - ) - if n == 1: - char_labels[char] = [f"{char}_s"] - else: - labels = [f"{char}_{j}" for j in range(n)] - filter_parts.append( - f"[{char}_s]split={n}" + "".join(f"[{l}]" for l in labels) - ) - char_labels[char] = labels - - # Chain one overlay per segment with time-gated enable and wobble - char_usage: dict[str, int] = {char: 0 for char in char_list} - current = "bg" - t = 0.0 - for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)): - char = VOICE_TO_CHAR[voice_id] - j = char_usage[char] - char_usage[char] += 1 - label = char_labels[char][j] - t_end = t + duration - x = f"({W}-w)/2+8*sin(4*t)" - y = f"{H}-h-80+4*sin(5*t)" - enable = f"between(t,{t:.3f},{t_end:.3f})" - out = f"v{seg_idx}" + unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments}) + + # Build ffmpeg filter complex + # Inputs: [0]=background, [1..N]=character still PNGs, [N+1]=episode.mp3 + char_list = unique_chars + char_to_inp = {char: i + 1 for i, char in enumerate(char_list)} + audio_inp = len(char_list) + 1 + char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments) + + filter_parts: list[str] = [] + + # Scale background to output size + filter_parts.append(f"[0:v]scale={W}:{H}[bg]") + + # Scale each still image to CHAR_WIDTH, then split into as many copies as + # 
that character has speaking segments + char_labels: dict[str, list[str]] = {} + for char in char_list: + inp = char_to_inp[char] + n = char_counts[char] + filter_parts.append(f"[{inp}:v]scale={CHAR_WIDTH}:-2[{char}_s]") + if n == 1: + char_labels[char] = [f"{char}_s"] + else: + labels = [f"{char}_{j}" for j in range(n)] filter_parts.append( - f"[{current}][{label}]overlay=x='{x}':y='{y}':enable='{enable}'[{out}]" + f"[{char}_s]split={n}" + "".join(f"[{l}]" for l in labels) ) - current = out - t = t_end - - filter_complex = ";".join(filter_parts) - - # Assemble and run ffmpeg with VideoToolbox hardware encoding - print(" Encoding (hardware accelerated)...") - cmd: list[str] = ["ffmpeg", "-y"] - cmd += [ - "-ss", - f"{bg_start:.3f}", - "-t", - f"{total_duration:.3f}", - "-i", - background_path, - ] - for char in char_list: - cmd += ["-i", str(char_movs[char])] - cmd += ["-i", AUDIO_OUTPUT_FILE] - cmd += [ - "-filter_complex", - filter_complex, - "-map", - f"[{current}]", - "-map", - f"{audio_inp}:a", - "-c:v", - "h264_videotoolbox", - "-c:a", - "aac", - "-t", - f"{total_duration:.3f}", - VIDEO_OUTPUT_FILE, - ] - subprocess.run(cmd, check=True) - - finally: - shutil.rmtree(tmp_dir, ignore_errors=True) + char_labels[char] = labels + + # Chain one overlay per segment with time-gated enable and wobble + char_usage: dict[str, int] = {char: 0 for char in char_list} + current = "bg" + t = 0.0 + for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)): + char = VOICE_TO_CHAR[voice_id] + j = char_usage[char] + char_usage[char] += 1 + label = char_labels[char][j] + t_end = t + duration + x = f"({W}-w)/2+8*sin(4*t)" + y = f"{H}-h-80+4*sin(5*t)" + enable = f"between(t,{t:.3f},{t_end:.3f})" + out = f"v{seg_idx}" + filter_parts.append( + f"[{current}][{label}]overlay=x='{x}':y='{y}':enable='{enable}'[{out}]" + ) + current = out + t = t_end + + filter_complex = ";".join(filter_parts) + + # Assemble and run ffmpeg with VideoToolbox hardware encoding + print(" 
Encoding (hardware accelerated)...") + cmd: list[str] = ["ffmpeg", "-y"] + cmd += [ + "-ss", + f"{bg_start:.3f}", + "-t", + f"{total_duration:.3f}", + "-i", + background_path, + ] + for char in char_list: + cmd += ["-loop", "1", "-i", str(CHARACTERS_DIR / f"{char}_still.png")] + cmd += ["-i", AUDIO_OUTPUT_FILE] + cmd += [ + "-filter_complex", + filter_complex, + "-map", + f"[{current}]", + "-map", + f"{audio_inp}:a", + "-c:v", + "h264_videotoolbox", + "-c:a", + "aac", + "-t", + f"{total_duration:.3f}", + VIDEO_OUTPUT_FILE, + ] + subprocess.run(cmd, check=True) def save_output(dialog: str) -> None: From 4eebec2febfbc2c1597db69ee0560c712cad9c66 Mon Sep 17 00:00:00 2001 From: Ole Magnus Fon Johnsen Date: Tue, 28 Apr 2026 13:11:07 +0200 Subject: [PATCH 3/4] fix: dont stretch screen --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 5e724fc..4ffc2b2 100644 --- a/main.py +++ b/main.py @@ -266,7 +266,7 @@ def generate_video( filter_parts: list[str] = [] # Scale background to output size - filter_parts.append(f"[0:v]scale={W}:{H}[bg]") + filter_parts.append(f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]") # Scale each still image to CHAR_WIDTH, then split into as many copies as # that character has speaking segments From 3f3dfe05acbba0ac9f8e6ff5a120e30e6e0f8852 Mon Sep 17 00:00:00 2001 From: Ole Magnus Fon Johnsen Date: Tue, 28 Apr 2026 13:51:00 +0200 Subject: [PATCH 4/4] chore: add type ignore comments and format code --- .gitignore | 1 + main.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 9953d5f..b8ed9aa 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__/ characters/ background-videos/ +typings/ diff --git a/main.py b/main.py index 4ffc2b2..b26b960 100644 --- a/main.py +++ b/main.py @@ -43,6 +43,7 @@ CHAR_WIDTH = 480 TALKING_INTERVAL = 0.15 + SYSTEM_PROMPT = """You 
are a comedy writer generating a Family Guy-style TV dialog script. ## Characters @@ -108,7 +109,7 @@ def parse_since(since_str: str) -> datetime: def fetch_commits( owner: str, repo: str, since_iso: str, headers: dict[str, str] -) -> list[Any]: +) -> list[Any]: # pyright: ignore[reportExplicitAny] url = f"https://api.github.com/repos/{owner}/{repo}/commits" params = {"since": since_iso, "per_page": 100} resp = requests.get(url, headers=headers, params=params, timeout=20) @@ -123,12 +124,14 @@ def fetch_commits( return [] commits = resp.json() # filter out merge commits - return [c for c in commits if not c["commit"]["message"].startswith("Merge")] + return [ + c for c in reversed(commits) if not c["commit"]["message"].startswith("Merge") + ] # reversed(): list.reverse() works in place and returns None def fetch_commit_detail( owner: str, repo: str, sha: str, headers: dict[str, str] -) -> Any: +) -> Any: # pyright: ignore[reportExplicitAny] url = f"https://api.github.com/repos/{owner}/{repo}/commits/{sha}" resp = requests.get(url, headers=headers, timeout=20) if resp.status_code != 200: @@ -137,7 +140,10 @@ def fetch_commit_detail( def build_repo_summary( - owner: str, repo: str, commits: list[Any], headers: dict[str, str] + owner: str, + repo: str, + commits: list[Any], # pyright: ignore[reportExplicitAny] + headers: dict[str, str], ) -> str: lines = [f"## {repo} ({len(commits)} commits)"] for commit in commits[:10]: @@ -220,6 +226,14 @@ def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> list[float return durations +def _srt_time(t: float) -> str: + ms = int(t * 1000) + h, ms = divmod(ms, 3_600_000) + m, ms = divmod(ms, 60_000) + s, ms = divmod(ms, 1_000) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + def generate_video( segments: list[tuple[str, str]], durations: list[float], @@ -266,7 +280,9 @@ def generate_video( filter_parts: list[str] = [] # Scale background to output size - filter_parts.append(f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]") + filter_parts.append( + 
f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]" + ) # Scale each still image to CHAR_WIDTH, then split into as many copies as # that character has speaking segments @@ -280,7 +296,7 @@ def generate_video( else: labels = [f"{char}_{j}" for j in range(n)] filter_parts.append( - f"[{char}_s]split={n}" + "".join(f"[{l}]" for l in labels) + f"[{char}_s]split={n}" + "".join(f"[{label}]" for label in labels) ) char_labels[char] = labels @@ -296,14 +312,41 @@ def generate_video( t_end = t + duration x = f"({W}-w)/2+8*sin(4*t)" y = f"{H}-h-80+4*sin(5*t)" - enable = f"between(t,{t:.3f},{t_end:.3f})" + enable = f"gte(t\\,{t:.3f})*lte(t\\,{t_end:.3f})" out = f"v{seg_idx}" filter_parts.append( - f"[{current}][{label}]overlay=x='{x}':y='{y}':enable='{enable}'[{out}]" + f"[{current}][{label}]overlay=x='{x}':y='{y}':enable={enable}[{out}]" ) current = out t = t_end + # Write SRT and burn in via subtitles filter (avoids drawtext escaping issues) + srt_path = Path("subtitles.srt").resolve() + t = 0.0 + entry = 1 + with open(srt_path, "w", encoding="utf-8") as f: + for (_, text), duration in zip(segments, durations): + clean = re.sub(r"^\(\w+\)\s*", "", text) + words = clean.split() + if not words: + t += duration + continue + word_dur = duration / len(words) + for word in words: + t_end = t + word_dur + f.write(f"{entry}\n{_srt_time(t)} --> {_srt_time(t_end)}\n{word}\n\n") + entry += 1 + t = t_end + style = ( + "FontName=Impact,FontSize=72,PrimaryColour=&H00FFFFFF," + "OutlineColour=&H00000000,BorderStyle=1,Outline=6," + "Alignment=2,MarginV=768" + ) + filter_parts.append( + f"[{current}]subtitles='{srt_path}':force_style='{style}'[subs]" + ) + current = "subs" + filter_complex = ";".join(filter_parts) # Assemble and run ffmpeg with VideoToolbox hardware encoding @@ -327,6 +370,7 @@ def generate_video( f"[{current}]", "-map", f"{audio_inp}:a", + "-r", "30", "-c:v", "h264_videotoolbox", "-c:a",