diff --git a/.gitignore b/.gitignore index 1f8663e..b8ed9aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,10 @@ manuscript.txt episode.mp3 +episode.mp4 .env __pycache__/ *.pyc + +characters/ +background-videos/ +typings/ diff --git a/README.md b/README.md index 7e86410..bbd88ba 100644 --- a/README.md +++ b/README.md @@ -35,4 +35,13 @@ python main.py --since "3 days ago" Du kan bruke flagget `--no-tts` for å bare generere manuscriptet. -Output lagres i `manuscript.txt`. +For å generere video, pass en bakgrunnsvideofil. Et tilfeldig segment av videoen brukes: + +```bash +python main.py --background background-videos/minecraft-parkour.mp4 +``` + +Output: +- `manuscript.txt` — dialogscriptet +- `episode.mp3` — lydsporet +- `episode.mp4` — videoen (kun om `--background` er gitt) diff --git a/main.py b/main.py index cc44523..b26b960 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,9 @@ import re import sys from datetime import datetime, timedelta, timezone +from io import BytesIO +from pathlib import Path +from typing import Any import anthropic import requests @@ -22,6 +25,7 @@ OUTPUT_FILE = "manuscript.txt" AUDIO_OUTPUT_FILE = "episode.mp3" +VIDEO_OUTPUT_FILE = "episode.mp4" MODEL = "claude-haiku-4-5-20251001" PETER_VOICE = "d75c270eaee14c8aa1e9e980cc37cf1b" BRIAN_VOICE = "df7b23b4d67c4340be1170ae6cbc2913" @@ -32,6 +36,13 @@ "BRIAN": BRIAN_VOICE, "STEWIE": STEWIE_VOICE, } +VOICE_TO_CHAR = {v: k.lower() for k, v in VOICE_MAP.items()} + +CHARACTERS_DIR = Path(__file__).parent / "characters" +VIDEO_SIZE = (1080, 1920) +CHAR_WIDTH = 480 +TALKING_INTERVAL = 0.15 + SYSTEM_PROMPT = """You are a comedy writer generating a Family Guy-style TV dialog script. 
@@ -96,7 +107,9 @@ def parse_since(since_str: str) -> datetime: ) -def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[dict]: +def fetch_commits( + owner: str, repo: str, since_iso: str, headers: dict[str, str] +) -> list[Any]: # pyright: ignore[reportExplicitAny] url = f"https://api.github.com/repos/{owner}/{repo}/commits" params = {"since": since_iso, "per_page": 100} resp = requests.get(url, headers=headers, params=params, timeout=20) @@ -111,10 +124,14 @@ def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[ return [] commits = resp.json() # filter out merge commits - return [c for c in commits if not c["commit"]["message"].startswith("Merge")] + return [ + c for c in commits if not c["commit"]["message"].startswith("Merge") + ][::-1] # reversed copy; list.reverse() works in place and returns None -def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict: +def fetch_commit_detail( + owner: str, repo: str, sha: str, headers: dict[str, str] +) -> Any: # pyright: ignore[reportExplicitAny] url = f"https://api.github.com/repos/{owner}/{repo}/commits/{sha}" resp = requests.get(url, headers=headers, timeout=20) if resp.status_code != 200: @@ -123,7 +140,10 @@ def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict: def build_repo_summary( - owner: str, repo: str, commits: list[dict], headers: dict + owner: str, + repo: str, + commits: list[Any], # pyright: ignore[reportExplicitAny] + headers: dict[str, str], ) -> str: lines = [f"## {repo} ({len(commits)} commits)"] for commit in commits[:10]: @@ -185,20 +205,181 @@ def parse_manuscript(text: str) -> list[tuple[str, str]]: return result -def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> None: +def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> list[float]: + from mutagen.mp3 import MP3 + client = FishAudio(api_key=fish_key) configs = { voice_id: TTSConfig(reference_id=voice_id, format="mp3", latency="balanced") for voice_id in {v for v, _ 
in segments} } - audio_chunks = [] + audio_chunks: list[bytes] = [] + durations: list[float] = [] for i, (voice_id, dialog) in enumerate(segments, 1): print(f" TTS line {i}/{len(segments)}...") audio = client.tts.convert(text=dialog, config=configs[voice_id]) audio_chunks.append(audio) + durations.append(MP3(BytesIO(audio)).info.length) with open(AUDIO_OUTPUT_FILE, "wb") as f: for chunk in audio_chunks: f.write(chunk) + return durations + + +def _srt_time(t: float) -> str: + ms = round(t * 1000) # round, not truncate: float error must not drop a millisecond + h, ms = divmod(ms, 3_600_000) + m, ms = divmod(ms, 60_000) + s, ms = divmod(ms, 1_000) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + +def generate_video( + segments: list[tuple[str, str]], + durations: list[float], + background_path: str, +) -> None: + import json + import random + import subprocess + from collections import Counter + + W, H = VIDEO_SIZE + total_duration = sum(durations) + n_segments = len(segments) + print(f" Duration: {total_duration:.1f}s across {n_segments} dialog lines") + + # Pick a random segment of the background video + probe = subprocess.run( + [ + "ffprobe", + "-v", + "quiet", + "-print_format", + "json", + "-show_format", + background_path, + ], + capture_output=True, + text=True, + check=True, + ) + bg_duration = float(json.loads(probe.stdout)["format"]["duration"]) + bg_start = random.uniform(0, max(0.0, bg_duration - total_duration)) + print(f" Using {Path(background_path).name} from {bg_start:.1f}s") + + unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments}) + + # Build ffmpeg filter complex + # Inputs: [0]=background, [1..N]=character still PNGs, [N+1]=episode.mp3 + char_list = unique_chars + char_to_inp = {char: i + 1 for i, char in enumerate(char_list)} + audio_inp = len(char_list) + 1 + char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments) + + filter_parts: list[str] = [] + + # Scale background to output size + filter_parts.append( + 
f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]" + ) + + # Scale each still image to CHAR_WIDTH, then split into as many copies as + # that character has speaking segments + char_labels: dict[str, list[str]] = {} + for char in char_list: + inp = char_to_inp[char] + n = char_counts[char] + filter_parts.append(f"[{inp}:v]scale={CHAR_WIDTH}:-2[{char}_s]") + if n == 1: + char_labels[char] = [f"{char}_s"] + else: + labels = [f"{char}_{j}" for j in range(n)] + filter_parts.append( + f"[{char}_s]split={n}" + "".join(f"[{label}]" for label in labels) + ) + char_labels[char] = labels + + # Chain one overlay per segment with time-gated enable and wobble + char_usage: dict[str, int] = {char: 0 for char in char_list} + current = "bg" + t = 0.0 + for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)): + char = VOICE_TO_CHAR[voice_id] + j = char_usage[char] + char_usage[char] += 1 + label = char_labels[char][j] + t_end = t + duration + x = f"({W}-w)/2+8*sin(4*t)" + y = f"{H}-h-80+4*sin(5*t)" + enable = f"gte(t\\,{t:.3f})*lte(t\\,{t_end:.3f})" + out = f"v{seg_idx}" + filter_parts.append( + f"[{current}][{label}]overlay=x='{x}':y='{y}':enable={enable}[{out}]" + ) + current = out + t = t_end + + # Write SRT and burn in via subtitles filter (avoids drawtext escaping issues) + srt_path = Path("subtitles.srt").resolve() + t = 0.0 + entry = 1 + with open(srt_path, "w", encoding="utf-8") as f: + for (_, text), duration in zip(segments, durations): + clean = re.sub(r"^\(\w+\)\s*", "", text) + words = clean.split() + if not words: + t += duration + continue + word_dur = duration / len(words) + for word in words: + t_end = t + word_dur + f.write(f"{entry}\n{_srt_time(t)} --> {_srt_time(t_end)}\n{word}\n\n") + entry += 1 + t = t_end + style = ( + "FontName=Impact,FontSize=72,PrimaryColour=&H00FFFFFF," + "OutlineColour=&H00000000,BorderStyle=1,Outline=6," + "Alignment=2,MarginV=768" + ) + filter_parts.append( + 
f"[{current}]subtitles='{srt_path}':force_style='{style}'[subs]" + ) + current = "subs" + + filter_complex = ";".join(filter_parts) + + # Assemble and run ffmpeg with VideoToolbox hardware encoding + print(" Encoding (hardware accelerated)...") + cmd: list[str] = ["ffmpeg", "-y"] + cmd += [ + "-ss", + f"{bg_start:.3f}", + "-t", + f"{total_duration:.3f}", + "-i", + background_path, + ] + for char in char_list: + cmd += ["-loop", "1", "-i", str(CHARACTERS_DIR / f"{char}_still.png")] + cmd += ["-i", AUDIO_OUTPUT_FILE] + cmd += [ + "-filter_complex", + filter_complex, + "-map", + f"[{current}]", + "-map", + f"{audio_inp}:a", + "-r", "30", + "-c:v", + "h264_videotoolbox", + "-c:a", + "aac", + "-t", + f"{total_duration:.3f}", + VIDEO_OUTPUT_FILE, + ] + subprocess.run(cmd, check=True) def save_output(dialog: str) -> None: @@ -214,6 +395,11 @@ def main() -> None: "--since", default="1 week ago", help="How far back to look (e.g. '1 week ago')" ) parser.add_argument("--no-tts", action="store_true", help="Skip audio generation") + parser.add_argument( + "--background", + default=None, + help="Path to background video/image for the episode (skips video if omitted)", + ) args = parser.parse_args() github_token = os.environ.get("GITHUB_TOKEN") @@ -269,9 +455,16 @@ def main() -> None: if not args.no_tts: print("\nGenerating audio...") segments = parse_manuscript(dialog) - generate_audio(segments, fish_key) # pyright: ignore[reportArgumentType] + durations = generate_audio(segments, fish_key) # pyright: ignore[reportArgumentType] print(f"Audio saved to {AUDIO_OUTPUT_FILE}") + if args.background: + print("\nGenerating video...") + generate_video(segments, durations, args.background) + print(f"Video saved to {VIDEO_OUTPUT_FILE}") + else: + print("Tip: pass --background to also generate a video.") + if __name__ == "__main__": main() diff --git a/requirements.txt b/requirements.txt index ba1f998..c40eac0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,5 @@ 
anthropic>=0.50.0 requests>=2.31.0 python-dotenv>=1.0.0 fish-audio-sdk>=1.0.0 +mutagen>=1.47 +Pillow>=10.0