From 03cf6d83112e6a9370a15eb44f7845c8d9bc8715 Mon Sep 17 00:00:00 2001
From: Ole Magnus Fon Johnsen <me@omfj.no>
Date: Mon, 27 Apr 2026 23:42:57 +0200
Subject: [PATCH 1/4] feat: video output

---
 .gitignore       |   4 +
 README.md        |  11 ++-
 main.py          | 205 +++++++++++++++++++++++++++++++++++++++++++++--
 requirements.txt |   2 +
 4 files changed, 215 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1f8663e..9953d5f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,9 @@
 manuscript.txt
 episode.mp3
+episode.mp4
 .env
 __pycache__/
 *.pyc
+
+characters/
+background-videos/
diff --git a/README.md b/README.md
index 7e86410..bbd88ba 100644
--- a/README.md
+++ b/README.md
@@ -35,4 +35,13 @@ python main.py --since "3 days ago"
 
 Du kan bruke flagget `--no-tts` for å bare generere manuscriptet.
 
-Output lagres i `manuscript.txt`.
+For å generere video, pass en bakgrunnsvideofil. Et tilfeldig segment av videoen brukes:
+
+```bash
+python main.py --background background-videos/minecraft-parkour.mp4
+```
+
+Output:
+- `manuscript.txt` — dialogscriptet
+- `episode.mp3` — lydsporet
+- `episode.mp4` — videoen (kun om `--background` er gitt)
diff --git a/main.py b/main.py
index cc44523..016f024 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,9 @@
 import re
 import sys
 from datetime import datetime, timedelta, timezone
+from io import BytesIO
+from pathlib import Path
+from typing import Any
 
 import anthropic
 import requests
@@ -22,6 +25,7 @@
 
 OUTPUT_FILE = "manuscript.txt"
 AUDIO_OUTPUT_FILE = "episode.mp3"
+VIDEO_OUTPUT_FILE = "episode.mp4"
 MODEL = "claude-haiku-4-5-20251001"
 PETER_VOICE = "d75c270eaee14c8aa1e9e980cc37cf1b"
 BRIAN_VOICE = "df7b23b4d67c4340be1170ae6cbc2913"
@@ -32,6 +36,12 @@
     "BRIAN": BRIAN_VOICE,
     "STEWIE": STEWIE_VOICE,
 }
+VOICE_TO_CHAR = {v: k.lower() for k, v in VOICE_MAP.items()}
+
+CHARACTERS_DIR = Path(__file__).parent / "characters"
+VIDEO_SIZE = (1080, 1920)
+CHAR_WIDTH = 480
+TALKING_INTERVAL = 0.15
 
 SYSTEM_PROMPT = """You are a comedy writer generating a Family Guy-style TV dialog script.
 
@@ -96,7 +106,9 @@ def parse_since(since_str: str) -> datetime:
     )
 
 
-def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[dict]:
+def fetch_commits(
+    owner: str, repo: str, since_iso: str, headers: dict[str, str]
+) -> list[Any]:
     url = f"https://api.github.com/repos/{owner}/{repo}/commits"
     params = {"since": since_iso, "per_page": 100}
     resp = requests.get(url, headers=headers, params=params, timeout=20)
@@ -114,7 +126,9 @@ def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[
     return [c for c in commits if not c["commit"]["message"].startswith("Merge")]
 
 
-def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict:
+def fetch_commit_detail(
+    owner: str, repo: str, sha: str, headers: dict[str, str]
+) -> Any:
     url = f"https://api.github.com/repos/{owner}/{repo}/commits/{sha}"
     resp = requests.get(url, headers=headers, timeout=20)
     if resp.status_code != 200:
@@ -123,7 +137,7 @@ def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict:
 
 
 def build_repo_summary(
-    owner: str, repo: str, commits: list[dict], headers: dict
+    owner: str, repo: str, commits: list[Any], headers: dict[str, str]
 ) -> str:
     lines = [f"## {repo} ({len(commits)} commits)"]
     for commit in commits[:10]:
@@ -185,20 +199,187 @@ def parse_manuscript(text: str) -> list[tuple[str, str]]:
     return result
 
 
-def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> None:
+def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> list[float]:
+    from mutagen.mp3 import MP3
+
     client = FishAudio(api_key=fish_key)
     configs = {
         voice_id: TTSConfig(reference_id=voice_id, format="mp3", latency="balanced")
         for voice_id in {v for v, _ in segments}
     }
-    audio_chunks = []
+    audio_chunks: list[bytes] = []
+    durations: list[float] = []
     for i, (voice_id, dialog) in enumerate(segments, 1):
         print(f"  TTS line {i}/{len(segments)}...")
         audio = client.tts.convert(text=dialog, config=configs[voice_id])
         audio_chunks.append(audio)
+        durations.append(MP3(BytesIO(audio)).info.length)
     with open(AUDIO_OUTPUT_FILE, "wb") as f:
         for chunk in audio_chunks:
             f.write(chunk)
+    return durations
+
+
+def generate_video(
+    segments: list[tuple[str, str]],
+    durations: list[float],
+    background_path: str,
+) -> None:
+    import json
+    import random
+    import shutil
+    import subprocess
+    import tempfile
+    from collections import Counter
+
+    from PIL import Image
+
+    W, H = VIDEO_SIZE
+    total_duration = sum(durations)
+    n_segments = len(segments)
+    print(f"  Duration: {total_duration:.1f}s across {n_segments} dialog lines")
+
+    # Pick a random segment of the background video
+    probe = subprocess.run(
+        [
+            "ffprobe",
+            "-v",
+            "quiet",
+            "-print_format",
+            "json",
+            "-show_format",
+            background_path,
+        ],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    bg_duration = float(json.loads(probe.stdout)["format"]["duration"])
+    bg_start = random.uniform(0, max(0.0, bg_duration - total_duration))
+    print(f"  Using {Path(background_path).name} from {bg_start:.1f}s")
+
+    tmp_dir = Path(tempfile.mkdtemp())
+    try:
+        # Pre-render each character as an 8-frame looping MOV with alpha
+        # (4 frames talking + 4 frames still at 8 fps = 1s loop)
+        unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments})
+        char_movs: dict[str, Path] = {}
+        print("  Rendering character animation loops...")
+        for char in unique_chars:
+            print(f"    {char}")
+            still = Image.open(CHARACTERS_DIR / f"{char}_still.png").convert("RGBA")
+            talking = Image.open(CHARACTERS_DIR / f"{char}_talking.png").convert("RGBA")
+            char_h = int(CHAR_WIDTH * still.height / still.width)
+            still = still.resize((CHAR_WIDTH, char_h), Image.Resampling.LANCZOS)
+            talking = talking.resize((CHAR_WIDTH, char_h), Image.Resampling.LANCZOS)
+            for i, frame in enumerate([talking] * 4 + [still] * 4):
+                frame.save(tmp_dir / f"{char}_{i:04d}.png")
+            mov = tmp_dir / f"{char}.mov"
+            subprocess.run(
+                [
+                    "ffmpeg",
+                    "-y",
+                    "-framerate",
+                    "8",
+                    "-i",
+                    str(tmp_dir / f"{char}_%04d.png"),
+                    "-c:v",
+                    "png",
+                    "-pix_fmt",
+                    "rgba",
+                    str(mov),
+                ],
+                check=True,
+                capture_output=True,
+            )
+            char_movs[char] = mov
+
+        # Build ffmpeg filter complex
+        # Inputs: [0]=background, [1..N]=character MOVs, [N+1]=episode.mp3
+        char_list = unique_chars
+        char_to_inp = {char: i + 1 for i, char in enumerate(char_list)}
+        audio_inp = len(char_list) + 1
+        char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments)
+
+        filter_parts: list[str] = []
+
+        # Scale background to output size
+        filter_parts.append(f"[0:v]scale={W}:{H}[bg]")
+
+        # Loop each character MOV infinitely, scale to CHAR_WIDTH, then split into
+        # as many copies as that character has speaking segments
+        char_labels: dict[str, list[str]] = {}
+        for char in char_list:
+            inp = char_to_inp[char]
+            n = char_counts[char]
+            # loop=-1 = infinite, size=8 = 8-frame loop, start=0
+            filter_parts.append(
+                f"[{inp}:v]loop=-1:size=8:start=0,scale={CHAR_WIDTH}:-2[{char}_s]"
+            )
+            if n == 1:
+                char_labels[char] = [f"{char}_s"]
+            else:
+                labels = [f"{char}_{j}" for j in range(n)]
+                filter_parts.append(
+                    f"[{char}_s]split={n}" + "".join(f"[{l}]" for l in labels)
+                )
+                char_labels[char] = labels
+
+        # Chain one overlay per segment with time-gated enable and wobble
+        char_usage: dict[str, int] = {char: 0 for char in char_list}
+        current = "bg"
+        t = 0.0
+        for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)):
+            char = VOICE_TO_CHAR[voice_id]
+            j = char_usage[char]
+            char_usage[char] += 1
+            label = char_labels[char][j]
+            t_end = t + duration
+            x = f"({W}-w)/2+8*sin(4*t)"
+            y = f"{H}-h-80+4*sin(5*t)"
+            enable = f"between(t,{t:.3f},{t_end:.3f})"
+            out = f"v{seg_idx}"
+            filter_parts.append(
+                f"[{current}][{label}]overlay=x='{x}':y='{y}':enable='{enable}'[{out}]"
+            )
+            current = out
+            t = t_end
+
+        filter_complex = ";".join(filter_parts)
+
+        # Assemble and run ffmpeg with VideoToolbox hardware encoding
+        print("  Encoding (hardware accelerated)...")
+        cmd: list[str] = ["ffmpeg", "-y"]
+        cmd += [
+            "-ss",
+            f"{bg_start:.3f}",
+            "-t",
+            f"{total_duration:.3f}",
+            "-i",
+            background_path,
+        ]
+        for char in char_list:
+            cmd += ["-i", str(char_movs[char])]
+        cmd += ["-i", AUDIO_OUTPUT_FILE]
+        cmd += [
+            "-filter_complex",
+            filter_complex,
+            "-map",
+            f"[{current}]",
+            "-map",
+            f"{audio_inp}:a",
+            "-c:v",
+            "h264_videotoolbox",
+            "-c:a",
+            "aac",
+            "-t",
+            f"{total_duration:.3f}",
+            VIDEO_OUTPUT_FILE,
+        ]
+        subprocess.run(cmd, check=True)
+
+    finally:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
 
 
 def save_output(dialog: str) -> None:
@@ -214,6 +395,11 @@ def main() -> None:
         "--since", default="1 week ago", help="How far back to look (e.g. '1 week ago')"
     )
     parser.add_argument("--no-tts", action="store_true", help="Skip audio generation")
+    parser.add_argument(
+        "--background",
+        default=None,
+        help="Path to background video/image for the episode (skips video if omitted)",
+    )
     args = parser.parse_args()
 
     github_token = os.environ.get("GITHUB_TOKEN")
@@ -269,9 +455,16 @@ def main() -> None:
     if not args.no_tts:
         print("\nGenerating audio...")
         segments = parse_manuscript(dialog)
-        generate_audio(segments, fish_key)  # pyright: ignore[reportArgumentType]
+        durations = generate_audio(segments, fish_key)  # pyright: ignore[reportArgumentType]
         print(f"Audio saved to {AUDIO_OUTPUT_FILE}")
 
+        if args.background:
+            print("\nGenerating video...")
+            generate_video(segments, durations, args.background)
+            print(f"Video saved to {VIDEO_OUTPUT_FILE}")
+        else:
+            print("Tip: pass --background <file> to also generate a video.")
+
 
 if __name__ == "__main__":
     main()
diff --git a/requirements.txt b/requirements.txt
index ba1f998..c40eac0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,5 @@ anthropic>=0.50.0
 requests>=2.31.0
 python-dotenv>=1.0.0
 fish-audio-sdk>=1.0.0
+mutagen>=1.47
+Pillow>=10.0

From 37233fdee8ad840adf7367785ac10d8918103618 Mon Sep 17 00:00:00 2001
From: Ole Magnus Fon Johnsen <me@omfj.no>
Date: Tue, 28 Apr 2026 12:53:01 +0200
Subject: [PATCH 2/4] refactor: simplify video generation by using ffmpeg
 looping instead of pre-rendering

---
 main.py | 204 ++++++++++++++++++++++----------------------------------
 1 file changed, 80 insertions(+), 124 deletions(-)

diff --git a/main.py b/main.py
index 016f024..5e724fc 100644
--- a/main.py
+++ b/main.py
@@ -227,13 +227,9 @@ def generate_video(
 ) -> None:
     import json
     import random
-    import shutil
     import subprocess
-    import tempfile
     from collections import Counter
 
-    from PIL import Image
-
     W, H = VIDEO_SIZE
     total_duration = sum(durations)
     n_segments = len(segments)
@@ -258,128 +254,88 @@ def generate_video(
     bg_start = random.uniform(0, max(0.0, bg_duration - total_duration))
     print(f"  Using {Path(background_path).name} from {bg_start:.1f}s")
 
-    tmp_dir = Path(tempfile.mkdtemp())
-    try:
-        # Pre-render each character as an 8-frame looping MOV with alpha
-        # (4 frames talking + 4 frames still at 8 fps = 1s loop)
-        unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments})
-        char_movs: dict[str, Path] = {}
-        print("  Rendering character animation loops...")
-        for char in unique_chars:
-            print(f"    {char}")
-            still = Image.open(CHARACTERS_DIR / f"{char}_still.png").convert("RGBA")
-            talking = Image.open(CHARACTERS_DIR / f"{char}_talking.png").convert("RGBA")
-            char_h = int(CHAR_WIDTH * still.height / still.width)
-            still = still.resize((CHAR_WIDTH, char_h), Image.Resampling.LANCZOS)
-            talking = talking.resize((CHAR_WIDTH, char_h), Image.Resampling.LANCZOS)
-            for i, frame in enumerate([talking] * 4 + [still] * 4):
-                frame.save(tmp_dir / f"{char}_{i:04d}.png")
-            mov = tmp_dir / f"{char}.mov"
-            subprocess.run(
-                [
-                    "ffmpeg",
-                    "-y",
-                    "-framerate",
-                    "8",
-                    "-i",
-                    str(tmp_dir / f"{char}_%04d.png"),
-                    "-c:v",
-                    "png",
-                    "-pix_fmt",
-                    "rgba",
-                    str(mov),
-                ],
-                check=True,
-                capture_output=True,
-            )
-            char_movs[char] = mov
-
-        # Build ffmpeg filter complex
-        # Inputs: [0]=background, [1..N]=character MOVs, [N+1]=episode.mp3
-        char_list = unique_chars
-        char_to_inp = {char: i + 1 for i, char in enumerate(char_list)}
-        audio_inp = len(char_list) + 1
-        char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments)
-
-        filter_parts: list[str] = []
-
-        # Scale background to output size
-        filter_parts.append(f"[0:v]scale={W}:{H}[bg]")
-
-        # Loop each character MOV infinitely, scale to CHAR_WIDTH, then split into
-        # as many copies as that character has speaking segments
-        char_labels: dict[str, list[str]] = {}
-        for char in char_list:
-            inp = char_to_inp[char]
-            n = char_counts[char]
-            # loop=-1 = infinite, size=8 = 8-frame loop, start=0
-            filter_parts.append(
-                f"[{inp}:v]loop=-1:size=8:start=0,scale={CHAR_WIDTH}:-2[{char}_s]"
-            )
-            if n == 1:
-                char_labels[char] = [f"{char}_s"]
-            else:
-                labels = [f"{char}_{j}" for j in range(n)]
-                filter_parts.append(
-                    f"[{char}_s]split={n}" + "".join(f"[{l}]" for l in labels)
-                )
-                char_labels[char] = labels
-
-        # Chain one overlay per segment with time-gated enable and wobble
-        char_usage: dict[str, int] = {char: 0 for char in char_list}
-        current = "bg"
-        t = 0.0
-        for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)):
-            char = VOICE_TO_CHAR[voice_id]
-            j = char_usage[char]
-            char_usage[char] += 1
-            label = char_labels[char][j]
-            t_end = t + duration
-            x = f"({W}-w)/2+8*sin(4*t)"
-            y = f"{H}-h-80+4*sin(5*t)"
-            enable = f"between(t,{t:.3f},{t_end:.3f})"
-            out = f"v{seg_idx}"
+    unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments})
+
+    # Build ffmpeg filter complex
+    # Inputs: [0]=background, [1..N]=character still PNGs, [N+1]=episode.mp3
+    char_list = unique_chars
+    char_to_inp = {char: i + 1 for i, char in enumerate(char_list)}
+    audio_inp = len(char_list) + 1
+    char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments)
+
+    filter_parts: list[str] = []
+
+    # Scale background to output size
+    filter_parts.append(f"[0:v]scale={W}:{H}[bg]")
+
+    # Scale each still image to CHAR_WIDTH, then split into as many copies as
+    # that character has speaking segments
+    char_labels: dict[str, list[str]] = {}
+    for char in char_list:
+        inp = char_to_inp[char]
+        n = char_counts[char]
+        filter_parts.append(f"[{inp}:v]scale={CHAR_WIDTH}:-2[{char}_s]")
+        if n == 1:
+            char_labels[char] = [f"{char}_s"]
+        else:
+            labels = [f"{char}_{j}" for j in range(n)]
             filter_parts.append(
-                f"[{current}][{label}]overlay=x='{x}':y='{y}':enable='{enable}'[{out}]"
+                f"[{char}_s]split={n}" + "".join(f"[{l}]" for l in labels)
             )
-            current = out
-            t = t_end
-
-        filter_complex = ";".join(filter_parts)
-
-        # Assemble and run ffmpeg with VideoToolbox hardware encoding
-        print("  Encoding (hardware accelerated)...")
-        cmd: list[str] = ["ffmpeg", "-y"]
-        cmd += [
-            "-ss",
-            f"{bg_start:.3f}",
-            "-t",
-            f"{total_duration:.3f}",
-            "-i",
-            background_path,
-        ]
-        for char in char_list:
-            cmd += ["-i", str(char_movs[char])]
-        cmd += ["-i", AUDIO_OUTPUT_FILE]
-        cmd += [
-            "-filter_complex",
-            filter_complex,
-            "-map",
-            f"[{current}]",
-            "-map",
-            f"{audio_inp}:a",
-            "-c:v",
-            "h264_videotoolbox",
-            "-c:a",
-            "aac",
-            "-t",
-            f"{total_duration:.3f}",
-            VIDEO_OUTPUT_FILE,
-        ]
-        subprocess.run(cmd, check=True)
-
-    finally:
-        shutil.rmtree(tmp_dir, ignore_errors=True)
+            char_labels[char] = labels
+
+    # Chain one overlay per segment with time-gated enable and wobble
+    char_usage: dict[str, int] = {char: 0 for char in char_list}
+    current = "bg"
+    t = 0.0
+    for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)):
+        char = VOICE_TO_CHAR[voice_id]
+        j = char_usage[char]
+        char_usage[char] += 1
+        label = char_labels[char][j]
+        t_end = t + duration
+        x = f"({W}-w)/2+8*sin(4*t)"
+        y = f"{H}-h-80+4*sin(5*t)"
+        enable = f"between(t,{t:.3f},{t_end:.3f})"
+        out = f"v{seg_idx}"
+        filter_parts.append(
+            f"[{current}][{label}]overlay=x='{x}':y='{y}':enable='{enable}'[{out}]"
+        )
+        current = out
+        t = t_end
+
+    filter_complex = ";".join(filter_parts)
+
+    # Assemble and run ffmpeg with VideoToolbox hardware encoding
+    print("  Encoding (hardware accelerated)...")
+    cmd: list[str] = ["ffmpeg", "-y"]
+    cmd += [
+        "-ss",
+        f"{bg_start:.3f}",
+        "-t",
+        f"{total_duration:.3f}",
+        "-i",
+        background_path,
+    ]
+    for char in char_list:
+        cmd += ["-loop", "1", "-i", str(CHARACTERS_DIR / f"{char}_still.png")]
+    cmd += ["-i", AUDIO_OUTPUT_FILE]
+    cmd += [
+        "-filter_complex",
+        filter_complex,
+        "-map",
+        f"[{current}]",
+        "-map",
+        f"{audio_inp}:a",
+        "-c:v",
+        "h264_videotoolbox",
+        "-c:a",
+        "aac",
+        "-t",
+        f"{total_duration:.3f}",
+        VIDEO_OUTPUT_FILE,
+    ]
+    subprocess.run(cmd, check=True)
 
 
 def save_output(dialog: str) -> None:

From 4eebec2febfbc2c1597db69ee0560c712cad9c66 Mon Sep 17 00:00:00 2001
From: Ole Magnus Fon Johnsen <me@omfj.no>
Date: Tue, 28 Apr 2026 13:11:07 +0200
Subject: [PATCH 3/4] fix: don't stretch the background video

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 5e724fc..4ffc2b2 100644
--- a/main.py
+++ b/main.py
@@ -266,7 +266,7 @@ def generate_video(
     filter_parts: list[str] = []
 
     # Scale background to output size
-    filter_parts.append(f"[0:v]scale={W}:{H}[bg]")
+    filter_parts.append(f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]")
 
     # Scale each still image to CHAR_WIDTH, then split into as many copies as
     # that character has speaking segments

From 3f3dfe05acbba0ac9f8e6ff5a120e30e6e0f8852 Mon Sep 17 00:00:00 2001
From: Ole Magnus Fon Johnsen <me@omfj.no>
Date: Tue, 28 Apr 2026 13:51:00 +0200
Subject: [PATCH 4/4] feat: burn in subtitles; add type ignores and format code

---
 .gitignore |  1 +
 main.py    | 60 ++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9953d5f..b8ed9aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ __pycache__/
 
 characters/
 background-videos/
+typings/
diff --git a/main.py b/main.py
index 4ffc2b2..b26b960 100644
--- a/main.py
+++ b/main.py
@@ -43,6 +43,7 @@
 CHAR_WIDTH = 480
 TALKING_INTERVAL = 0.15
 
+
 SYSTEM_PROMPT = """You are a comedy writer generating a Family Guy-style TV dialog script.
 
 ## Characters
@@ -108,7 +109,7 @@ def parse_since(since_str: str) -> datetime:
 
 def fetch_commits(
     owner: str, repo: str, since_iso: str, headers: dict[str, str]
-) -> list[Any]:
+) -> list[Any]:  # pyright: ignore[reportExplicitAny]
     url = f"https://api.github.com/repos/{owner}/{repo}/commits"
     params = {"since": since_iso, "per_page": 100}
     resp = requests.get(url, headers=headers, params=params, timeout=20)
@@ -123,12 +124,14 @@ def fetch_commits(
         return []
     commits = resp.json()
     # filter out merge commits
-    return [c for c in commits if not c["commit"]["message"].startswith("Merge")]
+    return [
+        c for c in commits if not c["commit"]["message"].startswith("Merge")
+    ][::-1]  # oldest first; list.reverse() works in place and returns None
 
 
 def fetch_commit_detail(
     owner: str, repo: str, sha: str, headers: dict[str, str]
-) -> Any:
+) -> Any:  # pyright: ignore[reportExplicitAny]
     url = f"https://api.github.com/repos/{owner}/{repo}/commits/{sha}"
     resp = requests.get(url, headers=headers, timeout=20)
     if resp.status_code != 200:
@@ -137,7 +140,10 @@ def fetch_commit_detail(
 
 
 def build_repo_summary(
-    owner: str, repo: str, commits: list[Any], headers: dict[str, str]
+    owner: str,
+    repo: str,
+    commits: list[Any],  # pyright: ignore[reportExplicitAny]
+    headers: dict[str, str],
 ) -> str:
     lines = [f"## {repo} ({len(commits)} commits)"]
     for commit in commits[:10]:
@@ -220,6 +226,14 @@ def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> list[float
     return durations
 
 
+def _srt_time(t: float) -> str:
+    """Format *t* seconds as an SRT timestamp ``HH:MM:SS,mmm`` (ms truncated)."""
+    total_ms = int(t * 1000)
+    hours, rest = divmod(total_ms, 3_600_000)
+    minutes, rest = divmod(rest, 60_000)
+    return f"{hours:02d}:{minutes:02d}:{rest // 1_000:02d},{rest % 1_000:03d}"
+
+
 def generate_video(
     segments: list[tuple[str, str]],
     durations: list[float],
@@ -266,7 +280,9 @@ def generate_video(
     filter_parts: list[str] = []
 
     # Scale background to output size
-    filter_parts.append(f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]")
+    filter_parts.append(
+        f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]"
+    )
 
     # Scale each still image to CHAR_WIDTH, then split into as many copies as
     # that character has speaking segments
@@ -280,7 +296,7 @@ def generate_video(
         else:
             labels = [f"{char}_{j}" for j in range(n)]
             filter_parts.append(
-                f"[{char}_s]split={n}" + "".join(f"[{l}]" for l in labels)
+                f"[{char}_s]split={n}" + "".join(f"[{label}]" for label in labels)
             )
             char_labels[char] = labels
 
@@ -296,14 +312,41 @@ def generate_video(
         t_end = t + duration
         x = f"({W}-w)/2+8*sin(4*t)"
         y = f"{H}-h-80+4*sin(5*t)"
-        enable = f"between(t,{t:.3f},{t_end:.3f})"
+        enable = f"gte(t\\,{t:.3f})*lte(t\\,{t_end:.3f})"
         out = f"v{seg_idx}"
         filter_parts.append(
-            f"[{current}][{label}]overlay=x='{x}':y='{y}':enable='{enable}'[{out}]"
+            f"[{current}][{label}]overlay=x='{x}':y='{y}':enable={enable}[{out}]"
         )
         current = out
         t = t_end
 
+    # Write SRT and burn in via subtitles filter (avoids drawtext escaping issues)
+    srt_path = Path("subtitles.srt").resolve()
+    t = 0.0
+    entry = 1
+    with open(srt_path, "w", encoding="utf-8") as f:
+        for (_, text), duration in zip(segments, durations):
+            clean = re.sub(r"^\(\w+\)\s*", "", text)
+            words = clean.split()
+            if not words:
+                t += duration
+                continue
+            word_dur = duration / len(words)
+            for word in words:
+                t_end = t + word_dur
+                f.write(f"{entry}\n{_srt_time(t)} --> {_srt_time(t_end)}\n{word}\n\n")
+                entry += 1
+                t = t_end
+    style = (
+        "FontName=Impact,FontSize=72,PrimaryColour=&H00FFFFFF,"
+        "OutlineColour=&H00000000,BorderStyle=1,Outline=6,"
+        "Alignment=2,MarginV=768"
+    )
+    filter_parts.append(
+        f"[{current}]subtitles='{srt_path}':force_style='{style}'[subs]"
+    )
+    current = "subs"
+
     filter_complex = ";".join(filter_parts)
 
     # Assemble and run ffmpeg with VideoToolbox hardware encoding
@@ -327,6 +370,7 @@ def generate_video(
         f"[{current}]",
         "-map",
         f"{audio_inp}:a",
+        "-r", "30",
         "-c:v",
         "h264_videotoolbox",
         "-c:a",