-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbatch_processing.py
More file actions
96 lines (76 loc) · 2.76 KB
/
batch_processing.py
File metadata and controls
96 lines (76 loc) · 2.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
Batch Transcript Extraction (V2 API)
======================================
Extract transcripts from up to 100 videos in a single request.
Get your free API key at: https://youtubetranscript.dev
Requirements:
pip install requests
"""
import json
import os
import time

import requests
# API key sent as the Bearer token on every request. It is read from the
# YOUTUBE_TRANSCRIPT_API_KEY environment variable when that is set, so a real
# key never has to be written into this file; otherwise the placeholder below
# is used and must be replaced by hand.
# Get yours at https://youtubetranscript.dev
API_KEY = os.environ.get("YOUTUBE_TRANSCRIPT_API_KEY", "your_api_key_here")
# Root URL of the V2 API; endpoint paths are appended to it.
BASE_URL = "https://youtubetranscript.dev/api/v2"
def batch_transcribe(video_ids: list, *, timeout: float = 30.0) -> dict:
    """Submit a batch of videos for transcript extraction.

    Sends a single POST request to the ``/batch`` endpoint and returns the
    decoded JSON body of the response.

    Args:
        video_ids: List of YouTube URLs or 11-character video IDs
            (at least 1, at most 100 entries).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The parsed JSON response as a dict.

    Raises:
        ValueError: If ``video_ids`` is empty or holds more than 100 entries.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    max_batch_size = 100  # documented per-request limit of the batch endpoint

    # Fail fast on requests that cannot succeed instead of spending a
    # network round trip on them.
    if not video_ids:
        raise ValueError("video_ids must contain at least one video")
    if len(video_ids) > max_batch_size:
        raise ValueError(
            f"video_ids holds {len(video_ids)} entries; "
            f"at most {max_batch_size} are allowed per batch"
        )

    response = requests.post(
        f"{BASE_URL}/batch",
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
        json={"video_ids": video_ids},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def check_batch_status(batch_id: str, *, timeout: float = 30.0) -> dict:
    """Check the status of a batch request.

    Sends a GET request to ``/batch/<batch_id>`` and returns the decoded
    JSON body of the response.

    Args:
        batch_id: Identifier of the batch to look up.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The parsed JSON response as a dict.

    Raises:
        ValueError: If ``batch_id`` is empty.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # An empty id would silently turn the URL into ".../batch/", which is a
    # different route than the one intended here.
    if not batch_id:
        raise ValueError("batch_id must be a non-empty string")

    response = requests.get(
        f"{BASE_URL}/batch/{batch_id}",
        headers={"Authorization": f"Bearer {API_KEY}"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def main():
    """Submit a small example batch and print the results.

    If the response carries a ``batch_id``, any entries under ``completed``
    are previewed right away and, while the batch status is ``processing``,
    the status endpoint is polled until it reports ``completed`` or the
    polling deadline passes. If there is no ``batch_id``, the entries under
    ``data`` are printed with their word counts.
    """
    poll_interval = 5  # seconds between two status checks
    max_wait = 600  # seconds to keep polling before giving up

    # Example: a small batch mixing bare video IDs and a full URL
    video_ids = [
        "dQw4w9WgXcQ",
        "https://www.youtube.com/watch?v=jNQXAC9IVRw",
        "9bZkp7q19f0",
    ]

    print(f"Submitting batch of {len(video_ids)} videos...\n")
    result = batch_transcribe(video_ids)

    batch_id = result.get("batch_id")
    if batch_id:
        # Some results may be cached (returned immediately),
        # others may need processing
        print(f"Batch ID: {batch_id}")

        # Check for immediately available results
        for video in result.get("completed") or []:
            text_preview = " ".join(
                seg["text"] for seg in video["transcript"][:3]
            )
            print(f" ✅ {video['video_id']}: {text_preview}...")

        # Poll for remaining results if needed
        if result.get("status") == "processing":
            print("\nWaiting for remaining videos...")
            # Only "completed" is known here as a final status, so the loop
            # is bounded by a deadline; otherwise a batch that never reaches
            # "completed" would keep this script polling forever.
            deadline = time.monotonic() + max_wait
            while True:
                time.sleep(poll_interval)
                status = check_batch_status(batch_id)
                state = status.get("status")
                print(f" Status: {state} ({status.get('completed_count', 0)}/{len(video_ids)})")
                if state == "completed":
                    print("\nAll transcripts ready!")
                    break
                if time.monotonic() >= deadline:
                    print(
                        f"\nGave up after {max_wait} seconds; "
                        f"batch {batch_id} last reported status {state!r}."
                    )
                    return
    else:
        # All results returned immediately (all cached)
        for video in result.get("data", []):
            word_count = sum(len(seg["text"].split()) for seg in video["transcript"])
            print(f" ✅ {video['video_id']}: {word_count} words")

    print("\nDone!")


# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()