-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapi_server.py
More file actions
2005 lines (1735 loc) · 81.8 KB
/
api_server.py
File metadata and controls
2005 lines (1735 loc) · 81.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Text Extraction API Server
A FastAPI-based REST API for extracting text from various file types.
Supports file uploads, URL processing, YouTube content, and batch operations.
"""
import os
import tempfile
import time
import uuid
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
import asyncio
import aiohttp
import traceback
import aiofiles
from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks, Depends, Query
from fastapi.responses import JSONResponse, PlainTextResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.security import HTTPBearer
from pydantic import BaseModel, HttpUrl, Field
import uvicorn
from src.text_extractor import TextExtractor
from src.config import Config
from src.youtube_mp3_service import YouTubeMP3TranscriptionService, TranscriptionResult
from src.file_processors.tts_processor import TTSProcessor
from src.file_processors.translation_processor import TranslationProcessor
# Configure logging: timestamped records tagged with logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize FastAPI app with OpenAPI metadata (interactive docs at /docs, /redoc).
app = FastAPI(
    title="AI Content Processing API",
    description="Extract text from various file types using OpenAI GPT and Google Gemini APIs",
    version="2.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    contact={
        "name": "AI Content Processing Team",
        "email": "support@ai-content-process.com",
    },
    license_info={
        "name": "MIT",
        "url": "https://opensource.org/licenses/MIT",
    },
)

# Security scheme (HTTP bearer tokens) — declared for OpenAPI; enforcement is per-endpoint.
security = HTTPBearer()

# Add security middleware.
# NOTE(review): allowed_hosts=["*"] disables Host-header checking — restrict in production.
app.add_middleware(TrustedHostMiddleware, allowed_hosts=["*"])

# Add CORS middleware.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is very
# permissive — configure explicit origins for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure for production: ["https://yourdomain.com"]
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)

# Global service singletons, populated by the startup event.
# None means that service failed to initialize and its endpoints return 503.
extractor: Optional[TextExtractor] = None
youtube_mp3_service: Optional[YouTubeMP3TranscriptionService] = None
tts_processor: Optional[TTSProcessor] = None
translation_processor: Optional[TranslationProcessor] = None
# Pydantic Models
# Request/response schemas for the extraction endpoints. Field descriptions
# feed the generated OpenAPI documentation.
class FileInfo(BaseModel):
    """File information model."""
    name: str = Field(..., description="Original filename")
    extension: str = Field(..., description="File extension")
    size_mb: float = Field(..., description="File size in megabytes")
    mime_type: str = Field(..., description="MIME type of the file")
    duration: Optional[int] = Field(None, description="Duration in seconds for video/audio files")
    chunks_processed: Optional[int] = Field(None, description="Number of chunks processed")
    url: Optional[str] = Field(None, description="Original URL for web-based content")


class ExtractionResult(BaseModel):
    """Single file extraction result."""
    file_id: str = Field(..., description="Unique identifier for this extraction")
    file_info: FileInfo = Field(..., description="File metadata")
    success: bool = Field(..., description="Whether extraction was successful")
    error: Optional[str] = Field(None, description="Error message if failed")
    extracted_text: Optional[str] = Field(None, description="Extracted text content")
    processor_used: Optional[str] = Field(None, description="Processor that handled the file")
    processing_time: float = Field(..., description="Processing time in seconds")
    text_length: int = Field(..., description="Length of extracted text")
    timestamp: str = Field(..., description="Extraction timestamp")


class BatchExtractionResult(BaseModel):
    """Batch extraction result."""
    batch_id: str = Field(..., description="Unique batch identifier")
    total_files: int = Field(..., description="Total number of files processed")
    successful: int = Field(..., description="Number of successful extractions")
    failed: int = Field(..., description="Number of failed extractions")
    total_processing_time: float = Field(..., description="Total processing time")
    total_characters: int = Field(..., description="Total characters extracted")
    results: List[ExtractionResult] = Field(..., description="Individual extraction results")
    timestamp: str = Field(..., description="Batch processing timestamp")


class HealthResponse(BaseModel):
    """Health check response."""
    status: str = Field(..., description="Service status")
    message: str = Field(..., description="Status message")
    supported_extensions: Dict[str, List[str]] = Field(..., description="Supported file extensions by processor")
    api_keys_configured: Dict[str, bool] = Field(..., description="API key configuration status")
    version: str = Field(..., description="API version")


class ErrorResponse(BaseModel):
    """Error response model."""
    error: str = Field(..., description="Error type")
    detail: str = Field(..., description="Error details")
    timestamp: str = Field(..., description="Error timestamp")
    request_id: Optional[str] = Field(None, description="Request identifier")


class ImageTranscriptionResult(BaseModel):
    """Image transcription result with structured data."""
    file_id: str = Field(..., description="Unique identifier for this transcription")
    file_info: FileInfo = Field(..., description="File metadata")
    success: bool = Field(..., description="Whether transcription was successful")
    error: Optional[str] = Field(None, description="Error message if failed")
    title: Optional[str] = Field(None, description="Generated title for the image")
    description: Optional[str] = Field(None, description="Detailed description of the image")
    extracted_text: Optional[str] = Field(None, description="Text extracted from the image (OCR)")
    processor_used: Optional[str] = Field(None, description="Vision processor that handled the image")
    processing_time: float = Field(..., description="Processing time in seconds")
    timestamp: str = Field(..., description="Transcription timestamp")


class ImageURLRequest(BaseModel):
    """Image URL transcription request."""
    url: HttpUrl = Field(..., description="URL of the image to transcribe")


class URLRequest(BaseModel):
    """Single URL processing request."""
    url: HttpUrl = Field(..., description="URL to process")
    filename: Optional[str] = Field(None, description="Custom filename")


class URLBatchRequest(BaseModel):
    """Batch URL processing request."""
    urls: List[HttpUrl] = Field(..., description="List of URLs to process")
    filenames: Optional[List[str]] = Field(None, description="Custom filenames")


class YouTubeRequest(BaseModel):
    """YouTube content processing request."""
    url: HttpUrl = Field(..., description="YouTube URL")
    title: Optional[str] = Field(None, description="Custom title")
    extract_audio_only: Optional[bool] = Field(False, description="Extract audio only")


class YouTubeBatchRequest(BaseModel):
    """Batch YouTube processing request."""
    urls: List[HttpUrl] = Field(..., description="List of YouTube URLs")
    titles: Optional[List[str]] = Field(None, description="Custom titles")
    extract_audio_only: Optional[bool] = Field(False, description="Extract audio only")
# Schemas for YouTube MP3 transcription and text-to-speech endpoints.
class YouTubeMP3Request(BaseModel):
    """YouTube MP3 transcription request."""
    url: HttpUrl = Field(..., description="YouTube URL to transcribe", example="https://www.youtube.com/watch?v=dQw4w9WgXcQ")
    keep_mp3: Optional[bool] = Field(False, description="Keep downloaded MP3 file after transcription")


class YouTubeMP3Result(BaseModel):
    """YouTube MP3 transcription result."""
    request_id: str = Field(..., description="Unique request identifier")
    success: bool = Field(..., description="Whether transcription was successful")
    video_id: str = Field(..., description="YouTube video ID")
    title: str = Field(..., description="Video title")
    duration: Optional[float] = Field(None, description="Video duration in seconds")
    transcript: Optional[str] = Field(None, description="Full transcript")
    error: Optional[str] = Field(None, description="Error message if failed")
    processing_time: float = Field(..., description="Processing time in seconds")
    audio_file_size_mb: Optional[float] = Field(None, description="Size of downloaded MP3 in MB")
    chunks_processed: int = Field(..., description="Number of audio chunks processed")
    timestamp: str = Field(..., description="Request timestamp")


class TTSRequest(BaseModel):
    """Text-to-speech request for single voice."""
    # max_length=4096 mirrors the OpenAI TTS input limit.
    text: str = Field(..., description="Text to convert to speech", max_length=4096)
    voice: str = Field("alloy", description="Voice to use: alloy, echo, fable, onyx, nova, shimmer")
    model: str = Field("tts-1", description="TTS model: tts-1 or tts-1-hd")
    format: str = Field("mp3", description="Output format: mp3, opus, aac, flac")
    speed: float = Field(1.0, description="Speech speed (0.25 to 4.0)", ge=0.25, le=4.0)


class TTSSegment(BaseModel):
    """Single segment for multi-voice TTS."""
    text: str = Field(..., description="Text for this segment", max_length=4096)
    voice: str = Field(..., description="Voice to use: alloy, echo, fable, onyx, nova, shimmer")
    speaker_name: Optional[str] = Field(None, description="Name of the speaker for this segment")


class TTSMultiVoiceRequest(BaseModel):
    """Text-to-speech request for multiple voices (podcast mode)."""
    segments: List[TTSSegment] = Field(..., description="List of text segments with different voices")
    model: str = Field("tts-1", description="TTS model: tts-1 or tts-1-hd")
    format: str = Field("mp3", description="Output format: mp3, opus, aac, flac")
    speed: float = Field(1.0, description="Speech speed (0.25 to 4.0)", ge=0.25, le=4.0)


class TTSResult(BaseModel):
    """Text-to-speech conversion result."""
    request_id: str = Field(..., description="Unique request identifier")
    success: bool = Field(..., description="Whether conversion was successful")
    error: Optional[str] = Field(None, description="Error message if failed")
    audio_file: Optional[str] = Field(None, description="Path to generated audio file")
    file_size_mb: Optional[float] = Field(None, description="Audio file size in MB")
    voice: Optional[str] = Field(None, description="Voice used")
    model: Optional[str] = Field(None, description="Model used")
    format: Optional[str] = Field(None, description="Audio format")
    speed: Optional[float] = Field(None, description="Speech speed")
    text_length: Optional[int] = Field(None, description="Length of input text")
    processing_time: float = Field(..., description="Processing time in seconds")
    timestamp: str = Field(..., description="Request timestamp")


class TTSMultiVoiceResult(BaseModel):
    """Multi-voice text-to-speech conversion result."""
    request_id: str = Field(..., description="Unique request identifier")
    success: bool = Field(..., description="Whether all conversions were successful")
    segments: List[Dict[str, Any]] = Field(..., description="Results for each segment")
    total_segments: int = Field(..., description="Total number of segments")
    successful_segments: int = Field(..., description="Number of successful conversions")
    failed_segments: int = Field(..., description="Number of failed conversions")
    total_characters: int = Field(..., description="Total characters processed")
    output_directory: Optional[str] = Field(None, description="Directory containing audio files")
    processing_time: float = Field(..., description="Total processing time in seconds")
    timestamp: str = Field(..., description="Request timestamp")


class ProcessingOptions(BaseModel):
    """Processing options for batch operations."""
    parallel: bool = Field(True, description="Use parallel processing")
    max_workers: int = Field(4, description="Maximum number of workers", ge=1, le=16)
    chunk_size: Optional[int] = Field(None, description="Chunk size for large files")
# Schemas for the translation and language-detection endpoints.
class TranslationRequest(BaseModel):
    """Single text translation request."""
    text: str = Field(..., description="Text to translate", max_length=10000)
    source_language: str = Field(..., description="Source language (e.g., 'en', 'es') or 'auto-detect'/'auto'/'detect' for automatic detection")
    target_language: str = Field(..., description="Target language (e.g., 'en', 'es', 'fr')")
    use_openai: Optional[bool] = Field(False, description="Use OpenAI API (true) or Gemini API (false)")


class TranslationBatchRequest(BaseModel):
    """Batch text translation request."""
    texts: List[str] = Field(..., description="List of texts to translate", max_items=50)
    source_language: str = Field(..., description="Source language (e.g., 'en', 'es') or 'auto-detect'/'auto'/'detect' for automatic detection")
    target_language: str = Field(..., description="Target language (e.g., 'en', 'es', 'fr')")
    use_openai: Optional[bool] = Field(False, description="Use OpenAI API (true) or Gemini API (false)")


class LanguageDetectionRequest(BaseModel):
    """Language detection request."""
    text: str = Field(..., description="Text to analyze for language detection", max_length=5000)
    use_openai: Optional[bool] = Field(False, description="Use OpenAI API (true) or Gemini API (false)")


class TranslationResult(BaseModel):
    """Single text translation result."""
    request_id: str = Field(..., description="Unique request identifier")
    success: bool = Field(..., description="Whether translation was successful")
    original_text: str = Field(..., description="Original text")
    translated_text: Optional[str] = Field(None, description="Translated text")
    source_language: str = Field(..., description="Source language")
    target_language: str = Field(..., description="Target language")
    api_used: Optional[str] = Field(None, description="API used for translation")
    model_used: Optional[str] = Field(None, description="Model used for translation")
    original_length: Optional[int] = Field(None, description="Original text length")
    translated_length: Optional[int] = Field(None, description="Translated text length")
    error: Optional[str] = Field(None, description="Error message if failed")
    processing_time: float = Field(..., description="Processing time in seconds")
    timestamp: str = Field(..., description="Request timestamp")


class TranslationBatchResult(BaseModel):
    """Batch text translation result."""
    batch_id: str = Field(..., description="Unique batch identifier")
    success: bool = Field(..., description="Whether all translations were successful")
    total_texts: int = Field(..., description="Total number of texts processed")
    successful: int = Field(..., description="Number of successful translations")
    failed: int = Field(..., description="Number of failed translations")
    source_language: str = Field(..., description="Source language")
    target_language: str = Field(..., description="Target language")
    results: List[Dict[str, Any]] = Field(..., description="Individual translation results")
    processing_time: float = Field(..., description="Total processing time in seconds")
    timestamp: str = Field(..., description="Batch processing timestamp")


class LanguageDetectionResult(BaseModel):
    """Language detection result."""
    request_id: str = Field(..., description="Unique request identifier")
    success: bool = Field(..., description="Whether detection was successful")
    original_text: str = Field(..., description="Original text analyzed")
    detected_language: Optional[str] = Field(None, description="Detected language")
    confidence: Optional[float] = Field(None, description="Detection confidence (0.0 to 1.0)")
    api_used: Optional[str] = Field(None, description="API used for detection")
    error: Optional[str] = Field(None, description="Error message if failed")
    processing_time: float = Field(..., description="Processing time in seconds")
    timestamp: str = Field(..., description="Request timestamp")
# Schemas for the Runway-based video generation endpoints.
class VideoGenerationRequest(BaseModel):
    """Video generation request model."""
    image_url: HttpUrl = Field(..., description="URL of the source image")
    prompt_text: str = Field("A cinematic video with smooth motion", description="Text prompt describing desired video content")
    ratio: str = Field("1280:720", description="Video aspect ratio (e.g., '1280:720', '1024:1024')")
    duration: int = Field(5, description="Video duration in seconds (5 or 10 for gen4_turbo)", ge=5, le=10)
    model: Optional[str] = Field(None, description="Runway model to use")
    motion_intensity: str = Field("dynamic", description="Motion intensity level: subtle, moderate, dynamic, or cinematic")
    effects: Optional[List[str]] = Field(None, description="List of visual effects to apply (e.g., ['cinematic', 'vibrant', 'glow'])")
    effects_intensity: str = Field("moderate", description="Effects intensity level: subtle, moderate, or strong")
    enhance_motion: bool = Field(True, description="Apply FFmpeg motion enhancement to static videos")


class PromptToVideoRequest(BaseModel):
    """Prompt-to-video generation request model."""
    image_prompt: str = Field(..., description="Text description for image generation", max_length=1000)
    video_prompt: Optional[str] = Field(None, description="Text prompt for video animation (optional)", max_length=1000)
    ratio: str = Field("1280:720", description="Video aspect ratio (e.g., '1280:720', '1024:1024')")
    duration: int = Field(5, description="Video duration in seconds (5 or 10 for gen4_turbo)", ge=5, le=10)
    model: Optional[str] = Field(None, description="Runway model to use")
    motion_intensity: str = Field("dynamic", description="Motion intensity level: subtle, moderate, dynamic, or cinematic")
    voice_type: Optional[str] = Field(None, description="Voice type: male, female, male_deep, male_warm, female_bright, female_soft, or specific voice (alloy, echo, fable, onyx, nova, shimmer)")
    style: Optional[str] = Field(None, description="Narration style: brain_rot, explainer, cartoon, public_text, descriptive, poetic, educational, meditative")
    voice_script_preview: Optional[str] = Field(None, description="Custom script text for narration (overrides style-based generation)", max_length=2000)
    merge_audio: bool = Field(False, description="Merge audio with video using FFmpeg (requires FFmpeg installed)")
    preview_only: bool = Field(False, description="Preview all prompts that will be sent to APIs without generated content")
    effects: Optional[List[str]] = Field(None, description="List of visual effects to apply (e.g., ['cinematic', 'vibrant', 'glow'])")
    effects_intensity: str = Field("moderate", description="Effects intensity level: subtle, moderate, or strong")
    enhance_motion: bool = Field(True, description="Apply FFmpeg motion enhancement to static videos")
    auto_select_model: bool = Field(True, description="Automatically select optimal model for maximum animation quality")
    tiktok_mode: bool = Field(False, description="Optimize video specifically for TikTok (9:16 format, mobile effects)")
    tiktok_style: str = Field("trending", description="TikTok style: trending, viral, aesthetic, engaging")


class VideoGenerationResult(BaseModel):
    """Video generation result model."""
    request_id: str = Field(..., description="Unique request identifier")
    status: str = Field(..., description="Generation status: success, failed, timeout, or error")
    task_id: Optional[str] = Field(None, description="Runway task ID")
    video_url: Optional[str] = Field(None, description="URL of the generated video")
    source_image: str = Field(..., description="Source image URL")
    prompt_text: str = Field(..., description="Text prompt used")
    ratio: str = Field(..., description="Video aspect ratio")
    duration: int = Field(..., description="Video duration in seconds")
    model: str = Field(..., description="Runway model used")
    processing_time_seconds: float = Field(..., description="Total processing time")
    error: Optional[str] = Field(None, description="Error message if failed")
    timestamp: str = Field(..., description="Request timestamp")
# Startup and shutdown events
@app.on_event("startup")
async def startup_event():
    """Initialize the text extractor, YouTube MP3 service, TTS processor, and translation processor on startup."""
    global extractor, youtube_mp3_service, tts_processor, translation_processor
    try:
        logger.info("Initializing AI Content Processing API...")
        # The core extractor is mandatory; if it fails, `extractor` stays None
        # and every dependency-guarded endpoint returns 503.
        extractor = TextExtractor()
        logger.info("✅ Text Extraction API initialized successfully")
        logger.info(f" OpenAI configured: {bool(Config.OPENAI_API_KEY)}")
        logger.info(f" Gemini configured: {bool(Config.GOOGLE_API_KEY)}")
        # Initialize YouTube MP3 service if Gemini is available
        # (optional service: failures are logged and the API keeps running).
        if Config.GOOGLE_API_KEY:
            try:
                youtube_mp3_service = YouTubeMP3TranscriptionService()
                logger.info("✅ YouTube MP3 Transcription Service initialized")
            except Exception as e:
                logger.warning(f"⚠️ YouTube MP3 service failed to initialize: {e}")
                youtube_mp3_service = None
        # Initialize TTS processor if OpenAI is available (also optional).
        if Config.OPENAI_API_KEY:
            try:
                tts_processor = TTSProcessor()
                logger.info("✅ TTS Processor initialized successfully")
            except Exception as e:
                logger.warning(f"⚠️ TTS processor failed to initialize: {e}")
                tts_processor = None
        # Initialize Translation processor if either API is available (optional).
        if Config.OPENAI_API_KEY or Config.GOOGLE_API_KEY:
            try:
                translation_processor = TranslationProcessor()
                logger.info("✅ Translation Processor initialized successfully")
            except Exception as e:
                logger.warning(f"⚠️ Translation processor failed to initialize: {e}")
                translation_processor = None
        # Show supported extensions
        extensions = extractor.get_supported_extensions()
        for processor, exts in extensions.items():
            logger.info(f" {processor}: {len(exts)} extensions")
    except Exception as e:
        # Degraded mode: log the full traceback and leave the extractor unset.
        logger.error(f"❌ Failed to initialize text extractor: {e}")
        logger.error(traceback.format_exc())
        extractor = None
@app.on_event("shutdown")
async def shutdown_event():
    """Cleanup on shutdown."""
    # Currently only logs; no resources need explicit teardown here.
    logger.info("Shutting down AI Content Processing API...")
# Dependencies for ensuring services are initialized
async def get_extractor() -> TextExtractor:
    """FastAPI dependency: return the shared TextExtractor, or fail with 503."""
    # Guard-clause form: hand back the singleton when startup succeeded.
    if extractor is not None:
        return extractor
    raise HTTPException(
        status_code=503,
        detail="Text extractor not initialized. Please check API configuration."
    )
async def get_youtube_mp3_service() -> YouTubeMP3TranscriptionService:
    """FastAPI dependency: return the YouTube MP3 service, or fail with 503."""
    # The service is only created at startup when a Gemini key is configured.
    if youtube_mp3_service is not None:
        return youtube_mp3_service
    raise HTTPException(
        status_code=503,
        detail="YouTube MP3 service not available. Please check Gemini API configuration."
    )
async def get_tts_processor() -> TTSProcessor:
    """FastAPI dependency: return the TTS processor, or fail with 503."""
    # The processor is only created at startup when an OpenAI key is configured.
    if tts_processor is not None:
        return tts_processor
    raise HTTPException(
        status_code=503,
        detail="TTS processor not available. Please check OpenAI API configuration."
    )
async def get_translation_processor() -> TranslationProcessor:
    """FastAPI dependency: return the translation processor, or fail with 503."""
    # Created at startup when either an OpenAI or a Gemini key is configured.
    if translation_processor is not None:
        return translation_processor
    raise HTTPException(
        status_code=503,
        detail="Translation processor not available. Please check OpenAI or Gemini API configuration."
    )
# Root endpoint
@app.get("/", response_class=PlainTextResponse)
async def root():
    """Root endpoint with service information.

    Returns:
        A short plain-text banner pointing at the interactive docs and the
        health endpoint. (Previously returned an empty string, contradicting
        the docstring's promise of service information.)
    """
    return (
        "AI Content Processing API v2.0.0\n"
        "Interactive docs: /docs | Health: /health"
    )
# Health check endpoint
@app.get("/health", response_model=HealthResponse)
async def health_check(extractor: TextExtractor = Depends(get_extractor)):
    """Check API health and configuration status."""
    # Report which backends have keys configured and what the extractor supports.
    key_status = {
        "openai": bool(Config.OPENAI_API_KEY),
        "gemini": bool(Config.GOOGLE_API_KEY),
    }
    return HealthResponse(
        status="healthy",
        message="AI Content Processing API is running",
        supported_extensions=extractor.get_supported_extensions(),
        api_keys_configured=key_status,
        version="2.0.0",
    )
# Supported file types endpoint
@app.get("/supported-types")
async def get_supported_types(extractor: TextExtractor = Depends(get_extractor)):
    """Get detailed information about supported file types.

    Returns:
        A dict with the total extension count, per-processor details
        (extensions, count, description), and the deduplicated sorted list
        of all supported extensions.
    """
    extensions = extractor.get_supported_extensions()
    # Idiom cleanup: sorted() accepts any iterable, so the intermediate
    # list() copies in the original were redundant (flake8-comprehensions C414).
    detailed_info = {
        processor_name: {
            "extensions": sorted(exts),
            "count": len(exts),
            "description": _get_processor_description(processor_name),
        }
        for processor_name, exts in extensions.items()
    }
    return {
        "total_extensions": sum(len(exts) for exts in extensions.values()),
        "processors": detailed_info,
        "all_extensions": sorted(set().union(*extensions.values())),
    }
def _get_processor_description(processor_name: str) -> str:
"""Get description for processor."""
descriptions = {
"OpenAIProcessor": "Handles documents and text files using OpenAI GPT models",
"GeminiProcessor": "Processes video and audio files using Google Gemini",
"YouTubeProcessor": "Extracts content from YouTube videos and audio"
}
return descriptions.get(processor_name, "File processor")
# Single file upload endpoint
@app.post("/extract", response_model=ExtractionResult)
async def extract_text_from_file(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(..., description="File to extract text from"),
    extractor: TextExtractor = Depends(get_extractor)
):
    """Extract text from a single uploaded file.

    Validates the filename and extension, saves the upload to a temporary
    file, runs the extractor, and schedules temp-file cleanup to run after
    the response is sent. Raises 400 for missing filenames or unsupported
    extensions, 500 on processing failures.
    """
    file_id = str(uuid.uuid4())
    timestamp = datetime.now().isoformat()
    # Validate file
    if not file.filename:
        raise HTTPException(status_code=400, detail="Filename is required")
    # Check file extension against the union of all processors' supported types.
    file_path = Path(file.filename)
    all_extensions = set()
    for exts in extractor.get_supported_extensions().values():
        all_extensions.update(exts)
    if file_path.suffix.lower() not in all_extensions:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file_path.suffix}. Supported: {', '.join(sorted(all_extensions))}"
        )
    # Create temporary file
    temp_file = None
    try:
        # Save uploaded file
        temp_file = await _save_uploaded_file(file, file_id)
        # Extract text
        result = extractor.extract_from_file(temp_file)
        # Schedule cleanup (runs after the response is delivered).
        background_tasks.add_task(cleanup_temp_file, temp_file)
        # Convert to API response format
        return _convert_to_api_result(result, file_id, timestamp)
    except Exception as e:
        # Cleanup on error — the temp file may exist even if extraction failed.
        if temp_file and temp_file.exists():
            background_tasks.add_task(cleanup_temp_file, temp_file)
        logger.error(f"Error processing file {file.filename}: {e}")
        raise HTTPException(status_code=500, detail=f"File processing failed: {str(e)}")
# URL-based single file extraction endpoint
@app.post("/extract-url", response_model=ExtractionResult)
async def extract_text_from_url(
    background_tasks: BackgroundTasks,
    request: URLRequest,
    extractor: TextExtractor = Depends(get_extractor)
):
    """Extract text from a file at the given URL.

    YouTube and web-page URLs are handed to the extractor directly; any
    other URL is downloaded to a temporary file, validated against the
    supported extensions, processed, and cleaned up in the background.

    Raises:
        HTTPException: 400 for unsupported file types, 500 on failures.
    """
    file_id = str(uuid.uuid4())
    timestamp = datetime.now().isoformat()
    url_str = str(request.url)
    try:
        # Check if it's a YouTube URL first
        from src.file_processors.youtube_processor import YouTubeProcessor
        from src.file_processors.web_processor import WebProcessor
        if YouTubeProcessor.is_youtube_url(url_str):
            # Process YouTube URL directly without downloading
            result = extractor.extract_from_file(url_str)
            # Add URL (and optional custom name) to file info
            if result.get('file_info'):
                result['file_info']['url'] = url_str
                if request.filename:
                    result['file_info']['name'] = request.filename
            return _convert_to_api_result(result, file_id, timestamp)
        # Check if it's a web page URL
        if WebProcessor.is_web_page_url(url_str):
            # Process web page directly without downloading
            result = extractor.extract_from_file(url_str)
            if result.get('file_info'):
                result['file_info']['url'] = url_str
                if request.filename:
                    result['file_info']['name'] = request.filename
            return _convert_to_api_result(result, file_id, timestamp)
        # For other URLs, download file first
        temp_file_path, original_filename, file_size = await download_file_from_url(
            url_str,
            file_id,
            request.filename
        )
        # Validate file type against the union of all processors' supported types.
        file_path = Path(original_filename)
        all_extensions = set()
        for exts in extractor.get_supported_extensions().values():
            all_extensions.update(exts)
        if file_path.suffix.lower() not in all_extensions:
            background_tasks.add_task(cleanup_temp_file, temp_file_path)
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {file_path.suffix}. Supported: {', '.join(sorted(all_extensions))}"
            )
        # Extract text
        result = extractor.extract_from_file(temp_file_path)
        # Add URL to file info
        if result.get('file_info'):
            result['file_info']['url'] = url_str
        # Schedule cleanup
        background_tasks.add_task(cleanup_temp_file, temp_file_path)
        return _convert_to_api_result(result, file_id, timestamp)
    except HTTPException:
        # Bug fix: the deliberate 400 raised above was previously caught by the
        # generic handler below and re-raised as a 500. Let HTTP errors through.
        raise
    except Exception as e:
        logger.error(f"Error processing URL {request.url}: {e}")
        raise HTTPException(status_code=500, detail=f"URL processing failed: {str(e)}")
# YouTube extraction endpoint
@app.post("/extract-youtube", response_model=ExtractionResult)
async def extract_text_from_youtube(
    request: YouTubeRequest,
    extractor: TextExtractor = Depends(get_extractor)
):
    """Extract text/audio content from YouTube videos."""
    extraction_id = str(uuid.uuid4())
    started_at = datetime.now().isoformat()
    video_url = str(request.url)
    try:
        # The extractor handles YouTube URLs directly; no download step here.
        extraction = extractor.extract_from_file(video_url)
        # Attach YouTube-specific metadata to the file info when present.
        info = extraction.get('file_info')
        if info:
            info['url'] = video_url
            if request.title:
                info['name'] = request.title
        return _convert_to_api_result(extraction, extraction_id, started_at)
    except Exception as e:
        logger.error(f"Error processing YouTube URL {request.url}: {e}")
        raise HTTPException(status_code=500, detail=f"YouTube processing failed: {str(e)}")
# Image transcription endpoint
@app.post("/transcribe-image", response_model=ImageTranscriptionResult)
async def transcribe_image(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(..., description="Image file to transcribe"),
    extractor: TextExtractor = Depends(get_extractor)
):
    """Transcribe an image and return structured JSON with title, description, and extracted text."""
    file_id = str(uuid.uuid4())
    timestamp = datetime.now().isoformat()
    start_time = datetime.now()

    # Validate file
    if not file.filename:
        raise HTTPException(status_code=400, detail="Filename is required")

    # Check if it's an image file
    file_path = Path(file.filename)
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif', '.webp'}
    if file_path.suffix.lower() not in image_extensions:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported image format: {file_path.suffix}. Supported: {', '.join(sorted(image_extensions))}"
        )

    # Check if image processor is available
    if not extractor.image_processor:
        raise HTTPException(
            status_code=503,
            detail="Image processor not available. Please check API key configuration."
        )

    temp_file = None
    try:
        # Save uploaded file to a temporary location
        temp_file = await _save_uploaded_file(file, file_id)

        # Get image transcription data and file metadata
        image_result = extractor.image_processor.extract_image_data(temp_file)
        file_info_dict = extractor.image_processor.get_file_info(temp_file)

        # Calculate processing time
        processing_time = (datetime.now() - start_time).total_seconds()

        # Schedule cleanup after the response has been sent
        background_tasks.add_task(cleanup_temp_file, temp_file)

        file_info = FileInfo(
            name=file.filename,
            extension=file_path.suffix.lower(),
            size_mb=file_info_dict['size_mb'],
            mime_type=file_info_dict['mime_type']
        )

        # Single construction site for both the success and failure responses.
        succeeded = image_result['success']
        return ImageTranscriptionResult(
            file_id=file_id,
            file_info=file_info,
            success=succeeded,
            error=None if succeeded else image_result['error'],
            title=image_result['title'] if succeeded else None,
            description=image_result['description'] if succeeded else None,
            extracted_text=image_result['extracted_text'] if succeeded else None,
            processor_used=image_result.get('processor_used') if succeeded else None,
            processing_time=processing_time,
            timestamp=timestamp
        )
    except HTTPException:
        # Re-raise HTTPExceptions unchanged (consistent with the YouTube MP3
        # endpoint) so helpers' intended status codes are not collapsed into
        # the generic 500 below.
        if temp_file and temp_file.exists():
            background_tasks.add_task(cleanup_temp_file, temp_file)
        raise
    except Exception as e:
        # Cleanup on error
        if temp_file and temp_file.exists():
            background_tasks.add_task(cleanup_temp_file, temp_file)
        logger.error(f"Error processing image {file.filename}: {e}")
        raise HTTPException(status_code=500, detail=f"Image transcription failed: {str(e)}")
# Image URL transcription endpoint
@app.post("/transcribe-image-url", response_model=ImageTranscriptionResult)
async def transcribe_image_url(
    background_tasks: BackgroundTasks,
    request: ImageURLRequest,
    extractor: TextExtractor = Depends(get_extractor)
):
    """Transcribe an image from URL and return structured JSON with title, description, and extracted text."""
    file_id = str(uuid.uuid4())
    timestamp = datetime.now().isoformat()
    start_time = datetime.now()

    # Check if image processor is available
    if not extractor.image_processor:
        raise HTTPException(
            status_code=503,
            detail="Image processor not available. Please check API key configuration."
        )

    # Extract filename from URL
    url_path = Path(str(request.url))
    filename = url_path.name or f"image_{file_id}"
    file_extension = url_path.suffix.lower()

    # Check if it's an image file based on extension
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif', '.webp'}
    if file_extension not in image_extensions:
        # If no (or unrecognized) extension in URL, fall back to .jpg; the
        # content-type check below may still reveal the true type.
        file_extension = '.jpg'
        filename = f"image_{file_id}.jpg"

    temp_file = None
    try:
        # Download image from URL
        async with aiohttp.ClientSession() as session:
            async with session.get(str(request.url)) as response:
                if response.status != 200:
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to download image from URL. HTTP {response.status}: {response.reason}"
                    )
                # Check content type
                content_type = response.headers.get('content-type', '')
                if not content_type.startswith('image/'):
                    # Try to proceed anyway, but warn
                    logger.warning(f"URL content-type is not image/*: {content_type}")
                # Read image data
                image_data = await response.read()

        if len(image_data) == 0:
            raise HTTPException(status_code=400, detail="Downloaded image is empty")

        # Create temporary file; keep the image extension so downstream
        # processors can detect the format from the suffix.
        temp_file = Path(tempfile.gettempdir()) / f"image_{file_id}{file_extension}"

        # Write image data to temporary file
        async with aiofiles.open(temp_file, 'wb') as f:
            await f.write(image_data)

        # Get image transcription data and file metadata
        image_result = extractor.image_processor.extract_image_data(temp_file)
        file_info_dict = extractor.image_processor.get_file_info(temp_file)

        # Calculate processing time
        processing_time = (datetime.now() - start_time).total_seconds()

        # Schedule cleanup after the response has been sent
        background_tasks.add_task(cleanup_temp_file, temp_file)

        file_info = FileInfo(
            name=filename,
            extension=file_extension,
            size_mb=file_info_dict['size_mb'],
            mime_type=file_info_dict['mime_type'],
            url=str(request.url)
        )

        # Single construction site for both the success and failure responses.
        succeeded = image_result['success']
        return ImageTranscriptionResult(
            file_id=file_id,
            file_info=file_info,
            success=succeeded,
            error=None if succeeded else image_result['error'],
            title=image_result['title'] if succeeded else None,
            description=image_result['description'] if succeeded else None,
            extracted_text=image_result['extracted_text'] if succeeded else None,
            processor_used=image_result.get('processor_used') if succeeded else None,
            processing_time=processing_time,
            timestamp=timestamp
        )
    except HTTPException:
        # Re-raise as-is so the deliberate 400s above (bad status, empty body)
        # keep their status code instead of becoming a generic 500.
        if temp_file and temp_file.exists():
            background_tasks.add_task(cleanup_temp_file, temp_file)
        raise
    except aiohttp.ClientError as e:
        raise HTTPException(status_code=400, detail=f"Failed to download image: {str(e)}")
    except Exception as e:
        # Cleanup on error
        if temp_file and temp_file.exists():
            background_tasks.add_task(cleanup_temp_file, temp_file)
        logger.error(f"Error processing image URL {request.url}: {e}")
        raise HTTPException(status_code=500, detail=f"Image transcription failed: {str(e)}")
# YouTube MP3 transcription endpoint
@app.post("/youtube-mp3-transcribe", response_model=YouTubeMP3Result)
async def transcribe_youtube_mp3(
    request: YouTubeMP3Request,
    service: YouTubeMP3TranscriptionService = Depends(get_youtube_mp3_service)
):
    """Download MP3 from YouTube video and transcribe it using Gemini."""
    request_id = str(uuid.uuid4())
    timestamp = datetime.now().isoformat()
    video_url = str(request.url)
    try:
        # Reject anything the service does not recognize as a YouTube link.
        if not service.is_youtube_url(video_url):
            raise HTTPException(
                status_code=400,
                detail="Invalid YouTube URL provided"
            )
        logger.info(f"Starting YouTube MP3 transcription for: {request.url}")
        # Delegate the download + transcription pipeline to the service.
        outcome = service.transcribe_youtube_video(video_url, keep_mp3=request.keep_mp3)
        # Mirror the service result onto the API response model.
        return YouTubeMP3Result(
            request_id=request_id,
            success=outcome.success,
            video_id=outcome.video_id,
            title=outcome.title,
            duration=outcome.duration,
            transcript=outcome.transcript,
            error=outcome.error,
            processing_time=outcome.processing_time,
            audio_file_size_mb=outcome.audio_file_size_mb,
            chunks_processed=outcome.chunks_processed,
            timestamp=timestamp
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in YouTube MP3 transcription {request.url}: {e}")
        # Unexpected failures are reported in-band as an unsuccessful result
        # rather than surfacing as a 500.
        return YouTubeMP3Result(
            request_id=request_id,
            success=False,
            video_id="",
            title="",
            duration=None,
            transcript=None,
            error=f"Transcription failed: {str(e)}",
            processing_time=0.0,
            audio_file_size_mb=None,
            chunks_processed=0,
            timestamp=timestamp
        )
# Text-to-Speech endpoint for single voice
@app.post("/text-to-speech", response_model=TTSResult)
async def convert_text_to_speech(
    background_tasks: BackgroundTasks,
    request: TTSRequest,
    processor: TTSProcessor = Depends(get_tts_processor)
):
    """Convert text to speech using a single voice."""
    request_id = str(uuid.uuid4())
    timestamp = datetime.now().isoformat()
    try:
        logger.info(f"🎤 TTS request: {len(request.text)} characters, voice: {request.voice}")
        # Run the actual synthesis through the TTS processor.
        conversion = processor.text_to_speech(
            text=request.text,
            voice=request.voice,
            model=request.model,
            output_format=request.format,
            speed=request.speed
        )
        # Clean up audio file after response (optional - comment out to keep files)
        audio_path = conversion.get("audio_file")
        if conversion.get("success") and audio_path:
            background_tasks.add_task(cleanup_temp_file, Path(audio_path))
        # Map the processor's result dict onto the API response model.
        return TTSResult(
            request_id=request_id,
            success=conversion["success"],
            error=conversion.get("error"),
            audio_file=conversion.get("audio_file"),
            file_size_mb=conversion.get("file_size_mb"),
            voice=conversion.get("voice"),
            model=conversion.get("model"),
            format=conversion.get("format"),
            speed=conversion.get("speed"),
            text_length=conversion.get("text_length"),
            processing_time=conversion["processing_time"],
            timestamp=timestamp
        )
    except Exception as e:
        logger.error(f"Error in TTS conversion: {e}")
        # Failures are reported in-band with an empty result payload.
        return TTSResult(
            request_id=request_id,
            success=False,
            error=f"TTS conversion failed: {str(e)}",
            audio_file=None,
            file_size_mb=None,
            voice=None,
            model=None,
            format=None,
            speed=None,
            text_length=None,
            processing_time=0.0,
            timestamp=timestamp
        )
# Text-to-Speech endpoint for multiple voices (podcast mode)
@app.post("/text-to-speech-podcast", response_model=TTSMultiVoiceResult)
async def convert_text_to_speech_podcast(
background_tasks: BackgroundTasks,
request: TTSMultiVoiceRequest,
processor: TTSProcessor = Depends(get_tts_processor)
):
"""Convert multiple text segments to speech with different voices for podcast creation."""
request_id = str(uuid.uuid4())
timestamp = datetime.now().isoformat()