Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
manuscript.txt
episode.mp3
episode.mp4
.env
__pycache__/
*.pyc

characters/
background-videos/
typings/
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,13 @@ python main.py --since "3 days ago"

Du kan bruke flagget `--no-tts` for å bare generere manuskriptet.

Output lagres i `manuscript.txt`.
For å generere video, angi en bakgrunnsvideofil. Et tilfeldig segment av videoen brukes:

```bash
python main.py --background background-videos/minecraft-parkour.mp4
```

Output:
- `manuscript.txt` — dialogscriptet
- `episode.mp3` — lydsporet
- `episode.mp4` — videoen (kun om `--background` er gitt)
207 changes: 200 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import re
import sys
from datetime import datetime, timedelta, timezone
from io import BytesIO
from pathlib import Path
from typing import Any

import anthropic
import requests
Expand All @@ -22,6 +25,7 @@

OUTPUT_FILE = "manuscript.txt"
AUDIO_OUTPUT_FILE = "episode.mp3"
VIDEO_OUTPUT_FILE = "episode.mp4"
MODEL = "claude-haiku-4-5-20251001"
PETER_VOICE = "d75c270eaee14c8aa1e9e980cc37cf1b"
BRIAN_VOICE = "df7b23b4d67c4340be1170ae6cbc2913"
Expand All @@ -32,6 +36,13 @@
"BRIAN": BRIAN_VOICE,
"STEWIE": STEWIE_VOICE,
}
VOICE_TO_CHAR = {v: k.lower() for k, v in VOICE_MAP.items()}

CHARACTERS_DIR = Path(__file__).parent / "characters"
VIDEO_SIZE = (1080, 1920)
CHAR_WIDTH = 480
TALKING_INTERVAL = 0.15


SYSTEM_PROMPT = """You are a comedy writer generating a Family Guy-style TV dialog script.

Expand Down Expand Up @@ -96,7 +107,9 @@ def parse_since(since_str: str) -> datetime:
)


def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[dict]:
def fetch_commits(
owner: str, repo: str, since_iso: str, headers: dict[str, str]
) -> list[Any]: # pyright: ignore[reportExplicitAny]
url = f"https://api.github.com/repos/{owner}/{repo}/commits"
params = {"since": since_iso, "per_page": 100}
resp = requests.get(url, headers=headers, params=params, timeout=20)
Expand All @@ -111,10 +124,14 @@ def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[
return []
commits = resp.json()
# filter out merge commits
return [c for c in commits if not c["commit"]["message"].startswith("Merge")]
return [
c for c in commits if not c["commit"]["message"].startswith("Merge")
].reverse()


def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict:
def fetch_commit_detail(
owner: str, repo: str, sha: str, headers: dict[str, str]
) -> Any: # pyright: ignore[reportExplicitAny]
url = f"https://api.github.com/repos/{owner}/{repo}/commits/{sha}"
resp = requests.get(url, headers=headers, timeout=20)
if resp.status_code != 200:
Expand All @@ -123,7 +140,10 @@ def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict:


def build_repo_summary(
owner: str, repo: str, commits: list[dict], headers: dict
owner: str,
repo: str,
commits: list[Any], # pyright: ignore[reportExplicitAny]
headers: dict[str, str],
) -> str:
lines = [f"## {repo} ({len(commits)} commits)"]
for commit in commits[:10]:
Expand Down Expand Up @@ -185,20 +205,181 @@ def parse_manuscript(text: str) -> list[tuple[str, str]]:
return result


def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> list[float]:
    """Synthesize every dialog line with Fish Audio TTS and write one MP3.

    Each segment is (voice_id, dialog text). All rendered clips are
    concatenated into AUDIO_OUTPUT_FILE in order.

    Returns the duration in seconds of each clip, read from the MP3 headers,
    so the video step can time overlays/subtitles to the audio.
    """
    # Local import: mutagen is only needed when audio generation runs.
    from mutagen.mp3 import MP3

    client = FishAudio(api_key=fish_key)
    # One TTS config per distinct voice, shared across that voice's lines.
    voice_ids = {voice for voice, _ in segments}
    configs = {
        vid: TTSConfig(reference_id=vid, format="mp3", latency="balanced")
        for vid in voice_ids
    }

    rendered: list[bytes] = []
    durations: list[float] = []
    for idx, (voice, line) in enumerate(segments, start=1):
        print(f" TTS line {idx}/{len(segments)}...")
        clip = client.tts.convert(text=line, config=configs[voice])
        rendered.append(clip)
        # Measure the clip length from its MP3 header without touching disk.
        durations.append(MP3(BytesIO(clip)).info.length)

    with open(AUDIO_OUTPUT_FILE, "wb") as out:
        out.write(b"".join(rendered))
    return durations


def _srt_time(t: float) -> str:
ms = int(t * 1000)
h, ms = divmod(ms, 3_600_000)
m, ms = divmod(ms, 60_000)
s, ms = divmod(ms, 1_000)
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def generate_video(
    segments: list[tuple[str, str]],
    durations: list[float],
    background_path: str,
) -> None:
    """Render episode.mp4: a random background-clip segment with bobbing
    character stills overlaid while each speaks, plus word-by-word burned-in
    subtitles, muxed with the already-generated episode.mp3.

    segments   — (voice_id, dialog text) per line, same order as the audio.
    durations  — per-segment audio lengths in seconds (from generate_audio).
    background_path — source video to crop a random segment from.

    Requires ffmpeg/ffprobe on PATH and a character still PNG per speaker in
    CHARACTERS_DIR. Raises subprocess.CalledProcessError if either tool fails.
    NOTE(review): uses the h264_videotoolbox encoder, so this is macOS-only
    as written — confirm target platforms.
    """
    import json
    import random
    import subprocess
    from collections import Counter

    W, H = VIDEO_SIZE
    total_duration = sum(durations)
    n_segments = len(segments)
    print(f" Duration: {total_duration:.1f}s across {n_segments} dialog lines")

    # Pick a random segment of the background video.
    # ffprobe reports container metadata as JSON; we only need the duration.
    probe = subprocess.run(
        [
            "ffprobe",
            "-v",
            "quiet",
            "-print_format",
            "json",
            "-show_format",
            background_path,
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    bg_duration = float(json.loads(probe.stdout)["format"]["duration"])
    # max(0.0, ...) guards against a background shorter than the episode.
    bg_start = random.uniform(0, max(0.0, bg_duration - total_duration))
    print(f" Using {Path(background_path).name} from {bg_start:.1f}s")

    unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments})

    # Build ffmpeg filter complex
    # Inputs: [0]=background, [1..N]=character still PNGs, [N+1]=episode.mp3
    char_list = unique_chars
    char_to_inp = {char: i + 1 for i, char in enumerate(char_list)}
    audio_inp = len(char_list) + 1
    # How many speaking segments each character has — determines how many
    # copies of its scaled still we must split off (a filter output can only
    # feed one consumer).
    char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments)

    filter_parts: list[str] = []

    # Scale background to output size (cover-crop to fill the 9:16 frame).
    filter_parts.append(
        f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]"
    )

    # Scale each still image to CHAR_WIDTH, then split into as many copies as
    # that character has speaking segments
    char_labels: dict[str, list[str]] = {}
    for char in char_list:
        inp = char_to_inp[char]
        n = char_counts[char]
        # -2 keeps the height even (required by most H.264 pixel formats).
        filter_parts.append(f"[{inp}:v]scale={CHAR_WIDTH}:-2[{char}_s]")
        if n == 1:
            char_labels[char] = [f"{char}_s"]
        else:
            labels = [f"{char}_{j}" for j in range(n)]
            filter_parts.append(
                f"[{char}_s]split={n}" + "".join(f"[{label}]" for label in labels)
            )
            char_labels[char] = labels

    # Chain one overlay per segment with time-gated enable and wobble
    char_usage: dict[str, int] = {char: 0 for char in char_list}
    current = "bg"
    t = 0.0
    for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)):
        char = VOICE_TO_CHAR[voice_id]
        j = char_usage[char]
        char_usage[char] += 1
        label = char_labels[char][j]
        t_end = t + duration
        # Sinusoidal x/y offsets give a gentle idle "bobbing" motion;
        # the character sits centered, 80px above the bottom edge.
        x = f"({W}-w)/2+8*sin(4*t)"
        y = f"{H}-h-80+4*sin(5*t)"
        # enable is a 0/1 product of two comparisons: only show this overlay
        # during its segment's time window. Commas are escaped for ffmpeg.
        enable = f"gte(t\\,{t:.3f})*lte(t\\,{t_end:.3f})"
        out = f"v{seg_idx}"
        filter_parts.append(
            f"[{current}][{label}]overlay=x='{x}':y='{y}':enable={enable}[{out}]"
        )
        current = out
        t = t_end

    # Write SRT and burn in via subtitles filter (avoids drawtext escaping issues)
    # One SRT entry per word, evenly dividing the segment's audio duration,
    # for the TikTok-style word-at-a-time caption effect.
    srt_path = Path("subtitles.srt").resolve()
    t = 0.0
    entry = 1
    with open(srt_path, "w", encoding="utf-8") as f:
        for (_, text), duration in zip(segments, durations):
            # Strip a leading stage direction like "(angrily) " from the line.
            clean = re.sub(r"^\(\w+\)\s*", "", text)
            words = clean.split()
            if not words:
                # Nothing to caption; still advance the clock.
                t += duration
                continue
            word_dur = duration / len(words)
            for word in words:
                t_end = t + word_dur
                f.write(f"{entry}\n{_srt_time(t)} --> {_srt_time(t_end)}\n{word}\n\n")
                entry += 1
                t = t_end
    # ASS style override: large white Impact with heavy black outline,
    # bottom-center, pushed up 768px so captions sit mid-frame.
    style = (
        "FontName=Impact,FontSize=72,PrimaryColour=&H00FFFFFF,"
        "OutlineColour=&H00000000,BorderStyle=1,Outline=6,"
        "Alignment=2,MarginV=768"
    )
    filter_parts.append(
        f"[{current}]subtitles='{srt_path}':force_style='{style}'[subs]"
    )
    current = "subs"

    filter_complex = ";".join(filter_parts)

    # Assemble and run ffmpeg with VideoToolbox hardware encoding
    print(" Encoding (hardware accelerated)...")
    cmd: list[str] = ["ffmpeg", "-y"]
    # -ss/-t before -i: seek into the background and take only what we need.
    cmd += [
        "-ss",
        f"{bg_start:.3f}",
        "-t",
        f"{total_duration:.3f}",
        "-i",
        background_path,
    ]
    # -loop 1 turns each still PNG into an endless video stream; the output
    # -t below bounds the run.
    for char in char_list:
        cmd += ["-loop", "1", "-i", str(CHARACTERS_DIR / f"{char}_still.png")]
    cmd += ["-i", AUDIO_OUTPUT_FILE]
    cmd += [
        "-filter_complex",
        filter_complex,
        "-map",
        f"[{current}]",
        "-map",
        f"{audio_inp}:a",
        "-r", "30",
        "-c:v",
        "h264_videotoolbox",
        "-c:a",
        "aac",
        "-t",
        f"{total_duration:.3f}",
        VIDEO_OUTPUT_FILE,
    ]
    subprocess.run(cmd, check=True)


def save_output(dialog: str) -> None:
Expand All @@ -214,6 +395,11 @@ def main() -> None:
"--since", default="1 week ago", help="How far back to look (e.g. '1 week ago')"
)
parser.add_argument("--no-tts", action="store_true", help="Skip audio generation")
parser.add_argument(
"--background",
default=None,
help="Path to background video/image for the episode (skips video if omitted)",
)
args = parser.parse_args()

github_token = os.environ.get("GITHUB_TOKEN")
Expand Down Expand Up @@ -269,9 +455,16 @@ def main() -> None:
if not args.no_tts:
print("\nGenerating audio...")
segments = parse_manuscript(dialog)
generate_audio(segments, fish_key) # pyright: ignore[reportArgumentType]
durations = generate_audio(segments, fish_key) # pyright: ignore[reportArgumentType]
print(f"Audio saved to {AUDIO_OUTPUT_FILE}")

if args.background:
print("\nGenerating video...")
generate_video(segments, durations, args.background)
print(f"Video saved to {VIDEO_OUTPUT_FILE}")
else:
print("Tip: pass --background <file> to also generate a video.")


if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ anthropic>=0.50.0
requests>=2.31.0
python-dotenv>=1.0.0
fish-audio-sdk>=1.0.0
mutagen>=1.47
Pillow>=10.0