Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
manuscript.txt
episode.mp3
episode.mp4
.env
__pycache__/
*.pyc

characters/
background-videos/
typings/
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,13 @@ python main.py --since "3 days ago"

Du kan bruke flagget `--no-tts` for å bare generere manuskriptet.

Output lagres i `manuscript.txt`.
For å generere video, angi en bakgrunnsvideofil. Et tilfeldig segment av videoen brukes:

```bash
python main.py --background background-videos/minecraft-parkour.mp4
```

Output:
- `manuscript.txt` — dialogscriptet
- `episode.mp3` — lydsporet
- `episode.mp4` — videoen (kun om `--background` er gitt)
207 changes: 200 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import re
import sys
from datetime import datetime, timedelta, timezone
from io import BytesIO
from pathlib import Path
from typing import Any

import anthropic
import requests
Expand All @@ -22,6 +25,7 @@

OUTPUT_FILE = "manuscript.txt"
AUDIO_OUTPUT_FILE = "episode.mp3"
VIDEO_OUTPUT_FILE = "episode.mp4"
MODEL = "claude-haiku-4-5-20251001"
PETER_VOICE = "d75c270eaee14c8aa1e9e980cc37cf1b"
BRIAN_VOICE = "df7b23b4d67c4340be1170ae6cbc2913"
Expand All @@ -32,6 +36,13 @@
"BRIAN": BRIAN_VOICE,
"STEWIE": STEWIE_VOICE,
}
VOICE_TO_CHAR = {v: k.lower() for k, v in VOICE_MAP.items()}

CHARACTERS_DIR = Path(__file__).parent / "characters"
VIDEO_SIZE = (1080, 1920)
CHAR_WIDTH = 480
TALKING_INTERVAL = 0.15


SYSTEM_PROMPT = """You are a comedy writer generating a Family Guy-style TV dialog script.

Expand Down Expand Up @@ -96,7 +107,9 @@ def parse_since(since_str: str) -> datetime:
)


def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[dict]:
def fetch_commits(
owner: str, repo: str, since_iso: str, headers: dict[str, str]
) -> list[Any]: # pyright: ignore[reportExplicitAny]
url = f"https://api.github.com/repos/{owner}/{repo}/commits"
params = {"since": since_iso, "per_page": 100}
resp = requests.get(url, headers=headers, params=params, timeout=20)
Expand All @@ -111,10 +124,14 @@ def fetch_commits(owner: str, repo: str, since_iso: str, headers: dict) -> list[
return []
commits = resp.json()
# filter out merge commits
return [c for c in commits if not c["commit"]["message"].startswith("Merge")]
return [
c for c in commits if not c["commit"]["message"].startswith("Merge")
].reverse()


def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict:
def fetch_commit_detail(
owner: str, repo: str, sha: str, headers: dict[str, str]
) -> Any: # pyright: ignore[reportExplicitAny]
url = f"https://api.github.com/repos/{owner}/{repo}/commits/{sha}"
resp = requests.get(url, headers=headers, timeout=20)
if resp.status_code != 200:
Expand All @@ -123,7 +140,10 @@ def fetch_commit_detail(owner: str, repo: str, sha: str, headers: dict) -> dict:


def build_repo_summary(
owner: str, repo: str, commits: list[dict], headers: dict
owner: str,
repo: str,
commits: list[Any], # pyright: ignore[reportExplicitAny]
headers: dict[str, str],
) -> str:
lines = [f"## {repo} ({len(commits)} commits)"]
for commit in commits[:10]:
Expand Down Expand Up @@ -185,20 +205,181 @@ def parse_manuscript(text: str) -> list[tuple[str, str]]:
return result


def generate_audio(segments: list[tuple[str, str]], fish_key: str) -> list[float]:
    """Synthesize every dialog line with Fish Audio TTS and write one MP3.

    Each segment is (voice_id, dialog text). All rendered clips are
    concatenated into AUDIO_OUTPUT_FILE in order.

    Returns the duration in seconds of each clip, read from the MP3 headers,
    so the video step can time overlays/subtitles to the audio.
    """
    # Local import: mutagen is only needed when audio generation runs.
    from mutagen.mp3 import MP3

    client = FishAudio(api_key=fish_key)
    # One TTS config per distinct voice, shared across that voice's lines.
    voice_ids = {voice for voice, _ in segments}
    configs = {
        vid: TTSConfig(reference_id=vid, format="mp3", latency="balanced")
        for vid in voice_ids
    }

    rendered: list[bytes] = []
    durations: list[float] = []
    for idx, (voice, line) in enumerate(segments, start=1):
        print(f" TTS line {idx}/{len(segments)}...")
        clip = client.tts.convert(text=line, config=configs[voice])
        rendered.append(clip)
        # Measure the clip length from its MP3 header without touching disk.
        durations.append(MP3(BytesIO(clip)).info.length)

    with open(AUDIO_OUTPUT_FILE, "wb") as out:
        out.write(b"".join(rendered))
    return durations


def _srt_time(t: float) -> str:
ms = int(t * 1000)
h, ms = divmod(ms, 3_600_000)
m, ms = divmod(ms, 60_000)
s, ms = divmod(ms, 1_000)
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def generate_video(
    segments: list[tuple[str, str]],
    durations: list[float],
    background_path: str,
) -> None:
    """Render episode.mp4: a random background-clip segment with bobbing
    character stills overlaid while each speaks, plus word-by-word burned-in
    subtitles, muxed with the already-generated episode.mp3.

    segments   — (voice_id, dialog text) per line, same order as the audio.
    durations  — per-segment audio lengths in seconds (from generate_audio).
    background_path — source video to crop a random segment from.

    Requires ffmpeg/ffprobe on PATH and a character still PNG per speaker in
    CHARACTERS_DIR. Raises subprocess.CalledProcessError if either tool fails.
    NOTE(review): uses the h264_videotoolbox encoder, so this is macOS-only
    as written — confirm target platforms.
    """
    import json
    import random
    import subprocess
    from collections import Counter

    W, H = VIDEO_SIZE
    total_duration = sum(durations)
    n_segments = len(segments)
    print(f" Duration: {total_duration:.1f}s across {n_segments} dialog lines")

    # Pick a random segment of the background video.
    # ffprobe reports container metadata as JSON; we only need the duration.
    probe = subprocess.run(
        [
            "ffprobe",
            "-v",
            "quiet",
            "-print_format",
            "json",
            "-show_format",
            background_path,
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    bg_duration = float(json.loads(probe.stdout)["format"]["duration"])
    # max(0.0, ...) guards against a background shorter than the episode.
    bg_start = random.uniform(0, max(0.0, bg_duration - total_duration))
    print(f" Using {Path(background_path).name} from {bg_start:.1f}s")

    unique_chars = sorted({VOICE_TO_CHAR[v] for v, _ in segments})

    # Build ffmpeg filter complex
    # Inputs: [0]=background, [1..N]=character still PNGs, [N+1]=episode.mp3
    char_list = unique_chars
    char_to_inp = {char: i + 1 for i, char in enumerate(char_list)}
    audio_inp = len(char_list) + 1
    # How many speaking segments each character has — determines how many
    # copies of its scaled still we must split off (a filter output can only
    # feed one consumer).
    char_counts: Counter[str] = Counter(VOICE_TO_CHAR[v] for v, _ in segments)

    filter_parts: list[str] = []

    # Scale background to output size (cover-crop to fill the 9:16 frame).
    filter_parts.append(
        f"[0:v]scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}[bg]"
    )

    # Scale each still image to CHAR_WIDTH, then split into as many copies as
    # that character has speaking segments
    char_labels: dict[str, list[str]] = {}
    for char in char_list:
        inp = char_to_inp[char]
        n = char_counts[char]
        # -2 keeps the height even (required by most H.264 pixel formats).
        filter_parts.append(f"[{inp}:v]scale={CHAR_WIDTH}:-2[{char}_s]")
        if n == 1:
            char_labels[char] = [f"{char}_s"]
        else:
            labels = [f"{char}_{j}" for j in range(n)]
            filter_parts.append(
                f"[{char}_s]split={n}" + "".join(f"[{label}]" for label in labels)
            )
            char_labels[char] = labels

    # Chain one overlay per segment with time-gated enable and wobble
    char_usage: dict[str, int] = {char: 0 for char in char_list}
    current = "bg"
    t = 0.0
    for seg_idx, ((voice_id, _), duration) in enumerate(zip(segments, durations)):
        char = VOICE_TO_CHAR[voice_id]
        j = char_usage[char]
        char_usage[char] += 1
        label = char_labels[char][j]
        t_end = t + duration
        # Sinusoidal x/y offsets give a gentle idle "bobbing" motion;
        # the character sits centered, 80px above the bottom edge.
        x = f"({W}-w)/2+8*sin(4*t)"
        y = f"{H}-h-80+4*sin(5*t)"
        # enable is a 0/1 product of two comparisons: only show this overlay
        # during its segment's time window. Commas are escaped for ffmpeg.
        enable = f"gte(t\\,{t:.3f})*lte(t\\,{t_end:.3f})"
        out = f"v{seg_idx}"
        filter_parts.append(
            f"[{current}][{label}]overlay=x='{x}':y='{y}':enable={enable}[{out}]"
        )
        current = out
        t = t_end

    # Write SRT and burn in via subtitles filter (avoids drawtext escaping issues)
    # One SRT entry per word, evenly dividing the segment's audio duration,
    # for the TikTok-style word-at-a-time caption effect.
    srt_path = Path("subtitles.srt").resolve()
    t = 0.0
    entry = 1
    with open(srt_path, "w", encoding="utf-8") as f:
        for (_, text), duration in zip(segments, durations):
            # Strip a leading stage direction like "(angrily) " from the line.
            clean = re.sub(r"^\(\w+\)\s*", "", text)
            words = clean.split()
            if not words:
                # Nothing to caption; still advance the clock.
                t += duration
                continue
            word_dur = duration / len(words)
            for word in words:
                t_end = t + word_dur
                f.write(f"{entry}\n{_srt_time(t)} --> {_srt_time(t_end)}\n{word}\n\n")
                entry += 1
                t = t_end
    # ASS style override: large white Impact with heavy black outline,
    # bottom-center, pushed up 768px so captions sit mid-frame.
    style = (
        "FontName=Impact,FontSize=72,PrimaryColour=&H00FFFFFF,"
        "OutlineColour=&H00000000,BorderStyle=1,Outline=6,"
        "Alignment=2,MarginV=768"
    )
    filter_parts.append(
        f"[{current}]subtitles='{srt_path}':force_style='{style}'[subs]"
    )
    current = "subs"

    filter_complex = ";".join(filter_parts)

    # Assemble and run ffmpeg with VideoToolbox hardware encoding
    print(" Encoding (hardware accelerated)...")
    cmd: list[str] = ["ffmpeg", "-y"]
    # -ss/-t before -i: seek into the background and take only what we need.
    cmd += [
        "-ss",
        f"{bg_start:.3f}",
        "-t",
        f"{total_duration:.3f}",
        "-i",
        background_path,
    ]
    # -loop 1 turns each still PNG into an endless video stream; the output
    # -t below bounds the run.
    for char in char_list:
        cmd += ["-loop", "1", "-i", str(CHARACTERS_DIR / f"{char}_still.png")]
    cmd += ["-i", AUDIO_OUTPUT_FILE]
    cmd += [
        "-filter_complex",
        filter_complex,
        "-map",
        f"[{current}]",
        "-map",
        f"{audio_inp}:a",
        "-r", "30",
        "-c:v",
        "h264_videotoolbox",
        "-c:a",
        "aac",
        "-t",
        f"{total_duration:.3f}",
        VIDEO_OUTPUT_FILE,
    ]
    subprocess.run(cmd, check=True)


def save_output(dialog: str) -> None:
Expand All @@ -214,6 +395,11 @@ def main() -> None:
"--since", default="1 week ago", help="How far back to look (e.g. '1 week ago')"
)
parser.add_argument("--no-tts", action="store_true", help="Skip audio generation")
parser.add_argument(
"--background",
default=None,
help="Path to background video/image for the episode (skips video if omitted)",
)
args = parser.parse_args()

github_token = os.environ.get("GITHUB_TOKEN")
Expand Down Expand Up @@ -269,9 +455,16 @@ def main() -> None:
if not args.no_tts:
print("\nGenerating audio...")
segments = parse_manuscript(dialog)
generate_audio(segments, fish_key) # pyright: ignore[reportArgumentType]
durations = generate_audio(segments, fish_key) # pyright: ignore[reportArgumentType]
print(f"Audio saved to {AUDIO_OUTPUT_FILE}")

if args.background:
print("\nGenerating video...")
generate_video(segments, durations, args.background)
print(f"Video saved to {VIDEO_OUTPUT_FILE}")
else:
print("Tip: pass --background <file> to also generate a video.")


if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ anthropic>=0.50.0
requests>=2.31.0
python-dotenv>=1.0.0
fish-audio-sdk>=1.0.0
mutagen>=1.47
Pillow>=10.0