ai-content-process/youtube_mp3_examples.py at main · defmethodinc/ai-content-process · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/env python3
"""
YouTube MP3 Transcription Examples

Examples of using the YouTube MP3 transcription service both directly and via API.
"""

import asyncio
import requests
import json
from src.youtube_mp3_service import YouTubeMP3TranscriptionService

def example_direct_service_usage():
    """Demonstrate calling ``YouTubeMP3TranscriptionService`` in-process.

    For each sample URL the example validates the URL, prints the video
    metadata returned by ``get_video_info``, runs
    ``transcribe_youtube_video(url, keep_mp3=False)`` and prints the result
    statistics together with a preview of at most 200 transcript characters.
    Everything is written to stdout; nothing is returned.

    Error handling:
        * If the service cannot be constructed, the error is printed and the
          example stops.
        * If processing a single URL raises, the error is printed and the
          loop continues with the next URL, matching the per-URL handling
          used by ``example_api_client_usage``.
    """
    print("🎵 YouTube MP3 Transcription Service - Direct Usage Example")
    print("=" * 60)

    # Initialize the service; without it none of the examples can run.
    try:
        service = YouTubeMP3TranscriptionService()
    except Exception as e:
        print(f"❌ Error: {e}")
        return
    print("✅ Service initialized successfully")

    # Example YouTube URLs (use your own or public domain videos)
    example_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # Rick Astley - Never Gonna Give You Up
        "https://youtu.be/9bZkp7q19f0",  # PSY - GANGNAM STYLE
    ]

    for i, url in enumerate(example_urls, 1):
        print(f"\n📹 Example {i}: Processing {url}")
        print("-" * 40)

        # Errors are handled per URL so one bad video does not prevent the
        # remaining examples from running.
        try:
            # First, check if it's a valid YouTube URL
            if not service.is_youtube_url(url):
                print(f"❌ Invalid YouTube URL: {url}")
                continue

            # Get video info first (lightweight)
            video_info = service.get_video_info(url)
            print(f"📝 Title: {video_info.get('title', 'Unknown')}")
            print(f"⏱️  Duration: {video_info.get('duration', 0)} seconds")
            print(f"👤 Uploader: {video_info.get('uploader', 'Unknown')}")

            # Transcribe the video (this will download MP3 and transcribe)
            print("🔄 Starting transcription...")
            result = service.transcribe_youtube_video(url, keep_mp3=False)

            if result.success:
                print("✅ Transcription successful!")
                print(f"📊 Processing time: {result.processing_time:.2f} seconds")
                print(f"💾 Audio file size: {result.audio_file_size_mb:.2f} MB")
                print(f"🧩 Chunks processed: {result.chunks_processed}")
                print(f"📝 Transcript length: {len(result.transcript or '')} characters")

                # Show first 200 characters of transcript
                if result.transcript:
                    if len(result.transcript) > 200:
                        preview = result.transcript[:200] + "..."
                    else:
                        preview = result.transcript
                    print(f"📖 Transcript preview: {preview}")
            else:
                print(f"❌ Transcription failed: {result.error}")
        except Exception as e:
            print(f"❌ Error: {e}")

        print()

def _api_error_detail(response, default):
    """Return the ``detail`` field of a JSON error body, or a fallback.

    Falls back to the raw response text (or *default* when that is empty)
    if the body is not a JSON object, e.g. an HTML error page from a proxy.
    """
    try:
        body = response.json()
    except ValueError:
        # requests raises a ValueError subclass for bodies that are not JSON.
        return response.text or default
    if isinstance(body, dict):
        return body.get('detail', default)
    return response.text or default


def example_api_client_usage():
    """Demonstrate calling the ``/youtube-mp3-transcribe`` HTTP endpoint.

    Each sample URL is POSTed as JSON (``url`` and ``keep_mp3``) to
    ``http://localhost:8000/youtube-mp3-transcribe`` with a 300 second
    timeout.  The response fields are printed to stdout; nothing is returned.

    Numeric and transcript fields are read with an ``or`` fallback so that a
    key that is present but ``null`` in the JSON body (for example on a
    failed transcription) does not break the ``:.2f`` formatting or ``len()``.
    Timeouts, connection errors and any other exception are reported per URL
    and the loop continues with the next one.
    """
    print("🌐 YouTube MP3 Transcription API - Client Usage Example")
    print("=" * 60)

    # API endpoint (adjust for your server)
    API_BASE_URL = "http://localhost:8000"
    TRANSCRIBE_ENDPOINT = f"{API_BASE_URL}/youtube-mp3-transcribe"

    # Example YouTube URLs
    example_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://youtu.be/9bZkp7q19f0",
    ]

    for i, url in enumerate(example_urls, 1):
        print(f"\n📹 API Example {i}: Processing {url}")
        print("-" * 40)

        # Prepare request payload
        payload = {
            "url": url,
            "keep_mp3": False  # Don't keep the MP3 file after transcription
        }

        try:
            print("🔄 Sending API request...")
            response = requests.post(
                TRANSCRIBE_ENDPOINT,
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=300  # 5 minute timeout for long videos
            )

            if response.status_code == 200:
                result = response.json()
                print("✅ API request successful!")
                print(f"🆔 Request ID: {result.get('request_id')}")
                print(f"📝 Title: {result.get('title')}")
                print(f"🎬 Video ID: {result.get('video_id')}")
                print(f"⏱️  Duration: {result.get('duration')} seconds")
                # ``or 0`` covers keys that exist with a null value, which a
                # ``.get(key, 0)`` default does not.
                print(f"📊 Processing time: {result.get('processing_time') or 0:.2f} seconds")
                print(f"💾 Audio size: {result.get('audio_file_size_mb') or 0:.2f} MB")
                print(f"🧩 Chunks: {result.get('chunks_processed') or 0}")

                if result.get('success'):
                    transcript = result.get('transcript') or ''
                    print(f"📝 Transcript length: {len(transcript)} characters")

                    # Show first 200 characters
                    if transcript:
                        if len(transcript) > 200:
                            preview = transcript[:200] + "..."
                        else:
                            preview = transcript
                        print(f"📖 Preview: {preview}")
                else:
                    print(f"❌ Transcription failed: {result.get('error')}")

            elif response.status_code == 400:
                error_detail = _api_error_detail(response, 'Bad request')
                print(f"❌ Bad request: {error_detail}")

            elif response.status_code == 503:
                error_detail = _api_error_detail(response, 'Service unavailable')
                print(f"❌ Service unavailable: {error_detail}")

            else:
                print(f"❌ API error {response.status_code}: {response.text}")

        except requests.exceptions.Timeout:
            print("⏰ Request timed out (video might be too long)")
        except requests.exceptions.ConnectionError:
            print("🔌 Connection error - is the API server running?")
        except Exception as e:
            print(f"❌ Error: {e}")

        print()

async def example_async_multiple_videos():
    """Demonstrate transcribing several videos concurrently over HTTP.

    Every URL in the sample list is POSTed to
    ``http://localhost:8000/youtube-mp3-transcribe`` through one shared
    ``aiohttp.ClientSession``; the requests are awaited together with
    ``asyncio.gather``.  Progress and a final success count are printed to
    stdout; nothing is returned.

    ``aiohttp`` is an optional dependency: when it cannot be imported a hint
    is printed and the coroutine returns without doing any work.
    """
    print("⚡ YouTube MP3 Transcription - Async Multiple Videos Example")
    print("=" * 65)

    # Import once, up front, so that only a genuinely missing aiohttp is
    # reported as "not installed" and the per-request coroutine needs no
    # import of its own.
    try:
        import aiohttp
    except ImportError:
        print("❌ aiohttp not installed. Install with: pip install aiohttp")
        return

    # Example URLs (shorter videos for demo)
    video_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://youtu.be/9bZkp7q19f0",
        # Add more URLs as needed
    ]

    API_BASE_URL = "http://localhost:8000"
    TRANSCRIBE_ENDPOINT = f"{API_BASE_URL}/youtube-mp3-transcribe"

    # 5 minute budget per request, built once and shared by all requests.
    request_timeout = aiohttp.ClientTimeout(total=300)

    async def transcribe_video(session, url):
        """POST one URL; return the JSON result dict, or None on any failure."""
        print(f"🔄 Starting: {url}")
        payload = {"url": url, "keep_mp3": False}

        try:
            async with session.post(
                TRANSCRIBE_ENDPOINT,
                json=payload,
                timeout=request_timeout
            ) as response:
                if response.status != 200:
                    print(f"❌ HTTP {response.status}: {url}")
                    return None

                result = await response.json()
                if not result.get('success'):
                    print(f"❌ Failed: {url} - {result.get('error')}")
                    return None

                # ``or ''`` also covers a transcript key that is null.
                transcript_len = len(result.get('transcript') or '')
                print(f"✅ Completed: {result.get('title')} ({transcript_len} chars)")
                return result

        except Exception as e:
            print(f"❌ Error processing {url}: {e}")
            return None

    try:
        async with aiohttp.ClientSession() as session:
            print(f"🚀 Processing {len(video_urls)} videos concurrently...")

            # Process videos concurrently
            tasks = [transcribe_video(session, url) for url in video_urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Summary: a truthy, non-exception entry is a successful result.
            successful = sum(1 for r in results if r and not isinstance(r, Exception))
            print(f"\n📊 Summary: {successful}/{len(video_urls)} videos processed successfully")

    except Exception as e:
        print(f"❌ Error: {e}")

def example_with_custom_settings():
    """Demonstrate tuning the service before transcribing a video.

    Builds a ``YouTubeMP3TranscriptionService``, sets its
    ``max_chunk_duration`` attribute to 180 seconds, then transcribes one
    video with ``keep_mp3=True`` and prints the outcome to stdout.  Any
    exception raised along the way is printed rather than propagated.
    Nothing is returned.
    """
    print("⚙️  YouTube MP3 Transcription - Custom Settings Example")
    print("=" * 60)

    try:
        transcriber = YouTubeMP3TranscriptionService()

        # Use 3-minute chunks instead of the default
        # (noted in the original example as 300 seconds = 5 minutes).
        transcriber.max_chunk_duration = 180
        print(f"📝 Custom chunk duration: {transcriber.max_chunk_duration} seconds")

        # Video used to show the chunked processing path.
        target_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
        print(f"🔄 Processing with custom settings: {target_url}")

        outcome = transcriber.transcribe_youtube_video(target_url, keep_mp3=True)

        if not outcome.success:
            print(f"❌ Failed: {outcome.error}")
        else:
            print(f"✅ Success! Processed {outcome.chunks_processed} chunks")
            print(f"📊 Total processing time: {outcome.processing_time:.2f} seconds")
            print("💾 MP3 file was kept (keep_mp3=True)")

    except Exception as exc:
        print(f"❌ Error: {exc}")

if __name__ == "__main__":
    # Script entry point: run the synchronous examples in order, printing a
    # separator between them, then finish with a short list of usage tips.
    section_break = "\n" + "=" * 60 + "\n"

    print("🎵 YouTube MP3 Transcription Service Examples")
    print("=" * 50)
    print()

    example_direct_service_usage()
    print(section_break)

    example_api_client_usage()
    print(section_break)

    # The async example needs aiohttp, so it is opt-in. To try it, enable:
    # asyncio.run(example_async_multiple_videos())
    # print(section_break)

    example_with_custom_settings()

    print("\n🎉 Examples completed!")
    print("\n💡 Tips:")
    usage_tips = (
        "Make sure the API server is running for API examples",
        "Set GOOGLE_API_KEY in your .env file",
        "Consider YouTube cookie authentication for age-restricted videos",
        "Longer videos will be automatically chunked for processing",
    )
    for tip in usage_tips:
        print(f"   - {tip}")