diff --git a/02_ml_inference/02_speech_to_text/.env.example b/02_ml_inference/02_speech_to_text/.env.example
new file mode 100644
index 0000000..8360712
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/.env.example
@@ -0,0 +1,4 @@
+# FLASH_HOST=localhost
+# FLASH_PORT=8888
+# LOG_LEVEL=INFO
+# RUNPOD_API_KEY=your_api_key_here
diff --git a/02_ml_inference/02_speech_to_text/.flashignore b/02_ml_inference/02_speech_to_text/.flashignore
new file mode 100644
index 0000000..4ce0adc
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/.flashignore
@@ -0,0 +1,40 @@
+# Flash Build Ignore Patterns
+
+# Python cache
+__pycache__/
+*.pyc
+
+# Virtual environments
+venv/
+.venv/
+env/
+
+# IDE
+.vscode/
+.idea/
+
+# Environment files
+.env
+.env.local
+
+# Git
+.git/
+.gitignore
+
+# Build artifacts
+dist/
+build/
+*.egg-info/
+
+# Flash resources
+.runpod/
+
+# Tests
+tests/
+test_*.py
+*_test.py
+
+# Documentation
+docs/
+*.md
+!README.md
diff --git a/02_ml_inference/02_speech_to_text/.gitignore b/02_ml_inference/02_speech_to_text/.gitignore
new file mode 100644
index 0000000..f067358
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/.gitignore
@@ -0,0 +1,44 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+.venv/
+ENV/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Environment
+.env
+.env.local
+
+# Flash
+.runpod/
+dist/
+
+# OS
+.DS_Store
+Thumbs.db
diff --git a/02_ml_inference/02_speech_to_text/README.md b/02_ml_inference/02_speech_to_text/README.md
new file mode 100644
index 0000000..7dde6d7
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/README.md
@@ -0,0 +1,289 @@
+# Speech-to-Text with Parakeet-TDT
+
+Automatic Speech Recognition (ASR) API using [NVIDIA Parakeet-TDT-0.6B-v2](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2) running on RunPod serverless GPUs.
+
+## Overview
+
+This example demonstrates running a 600M parameter automatic speech recognition model on serverless GPU infrastructure. Parakeet-TDT is a high-performance English transcription model that provides accurate text output with automatic punctuation, capitalization, and detailed timestamp predictions.
+
+## What You'll Learn
+
+- Running a NeMo ASR model with `@remote` on GPU workers
+- Processing audio files from URLs with automatic format handling
+- Returning transcription results with word-level timestamps
+- Using efficient FastConformer-TDT architecture for real-time inference
+- Input validation for audio processing endpoints
+
+## Architecture
+
+Parakeet-TDT-0.6B-v2 is based on the FastConformer-TDT architecture and achieves:
+- **Average WER**: 6.05% across major benchmarks
+- **RTFx Performance**: 3380 at batch size 128
+- **Max Duration**: Up to 24 minutes of audio per pass
+- **Parameters**: 600 million
+
+## Quick Start
+
+### Prerequisites
+
+- Python 3.10+
+- RunPod API key ([get one here](https://docs.runpod.io/get-started/api-keys))
+
+### Setup
+
+```bash
+cd 02_ml_inference/02_speech_to_text
+pip install -r requirements.txt
+cp .env.example .env
+# Add your RUNPOD_API_KEY to .env
+```
+
+### Run
+
+```bash
+flash run
+```
+
+First run provisions the endpoint (~1-2 min). Server starts at http://localhost:8888.
+
+### Test the Endpoint
+
+**Basic transcription:**
+```bash
+curl -X POST http://localhost:8888/gpu/transcribe \
+  -H "Content-Type: application/json" \
+  -d '{
+    "audio_url": "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
+  }'
+```
+
+**With timestamps:**
+```bash
+curl -X POST http://localhost:8888/gpu/transcribe \
+  -H "Content-Type: application/json" \
+  -d '{
+    "audio_url": "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav",
+    "timestamps": true
+  }'
+```
+
+**Get model information:**
+```bash
+curl http://localhost:8888/gpu/model-info
+```
+
+Visit http://localhost:8888/docs for interactive API documentation.
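+
+The same requests can be sent from Python. Below is a minimal client sketch using `requests` (not one of this example's dependencies); it checks the `status` field that every response carries:
+
+```python
+import requests
+
+# Equivalent to the curl examples above; assumes the local Flash server is running.
+response = requests.post(
+    "http://localhost:8888/gpu/transcribe",
+    json={
+        "audio_url": "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav",
+        "timestamps": False,
+    },
+    timeout=300,  # the first request may include a cold start
+)
+response.raise_for_status()
+result = response.json()
+
+if result["status"] == "success":
+    print(f"Transcribed {result['duration']:.1f}s of audio: {result['text']}")
+else:
+    print(f"Transcription failed: {result['error']}")
+```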
+
+## API Endpoints
+
+### POST /gpu/transcribe
+
+Transcribe audio from a URL and return text with optional timestamps.
+
+**Request:**
+```json
+{
+  "audio_url": "https://example.com/audio.wav",
+  "timestamps": false
+}
+```
+
+**Response (without timestamps):**
+```json
+{
+  "status": "success",
+  "text": "This is the transcribed text with proper punctuation and capitalization.",
+  "duration": 5.2,
+  "processing_time": 0.8,
+  "sample_rate": 16000,
+  "timestamp": "2026-02-13T10:30:00.000000"
+}
+```
+
+**Response (with timestamps):**
+```json
+{
+  "status": "success",
+  "text": "This is the transcribed text.",
+  "duration": 5.2,
+  "processing_time": 0.9,
+  "sample_rate": 16000,
+  "timestamps": {
+    "word": [
+      {"start": 0.0, "end": 0.2, "word": "This"},
+      {"start": 0.2, "end": 0.4, "word": "is"},
+      {"start": 0.4, "end": 0.6, "word": "the"}
+    ],
+    "segment": [
+      {"start": 0.0, "end": 5.2, "segment": "This is the transcribed text."}
+    ],
+    "char": []
+  },
+  "timestamp": "2026-02-13T10:30:00.000000"
+}
+```
+
+### GET /gpu/model-info
+
+Get information about the Parakeet-TDT model and its capabilities.
+
+**Response:**
+```json
+{
+  "status": "success",
+  "model": "nvidia/parakeet-tdt-0.6b-v2",
+  "parameters": "600M",
+  "architecture": "FastConformer-TDT",
+  "supported_formats": ["WAV", "FLAC"],
+  "sample_rate": "16kHz",
+  "max_duration": "24 minutes per pass",
+  "features": [
+    "Automatic punctuation",
+    "Automatic capitalization",
+    "Word-level timestamps",
+    "Segment-level timestamps",
+    "Character-level timestamps"
+  ],
+  "license": "CC-BY-4.0"
+}
+```
+
+## Parameters
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| audio_url | string (URL) | Yes | - | URL of the audio file to transcribe (WAV or FLAC) |
+| timestamps | boolean | No | false | Include word, segment, and character-level timestamps |
+
+## Supported Audio Formats
+
+- **Formats**: WAV, FLAC
+- **Sample Rate**: 16kHz (recommended)
+- **Channels**: Mono (single channel recommended)
+- **Duration**: Up to 24 minutes per request
+
+## Performance Metrics
+
+**Word Error Rate (WER) on Benchmarks:**
+
+| Dataset | WER |
+|---------|-----|
+| LibriSpeech test-clean | 1.69% |
+| LibriSpeech test-other | 3.19% |
+| GigaSpeech | 9.74% |
+| Earnings-22 | 11.15% |
+| AMI | 11.16% |
+
+**Average WER**: 6.05%
+
+## Deployment
+
+```bash
+flash build
+flash deploy new production
+flash deploy send production
+```
+
+## Cost Estimates
+
+- Workers scale to 0 when idle (no charges)
+- Pay only for GPU time during transcription
+- First request after idle: ~30-60s (cold start for model loading)
+- Subsequent requests: ~1-3s for short audio clips
+- GPU: RTX 4090 (24GB VRAM) or similar
+
+## Use Cases
+
+- **Podcast Transcription**: Convert podcast episodes to searchable text
+- **Meeting Notes**: Transcribe recorded meetings with timestamps
+- **Caption Generation**: Create accurate captions for video content
+- **Voice Commands**: Process voice commands with low latency
+- **Call Analytics**: Transcribe customer support calls for analysis
+
+## Common Issues
+
+- **Cold start delay**: First request after idle takes 30-60s to load the NeMo model. Use `flash run --auto-provision` during development.
+- **Out of memory**: The model requires 8GB+ VRAM. Ensure `GpuGroup.ADA_24` or similar is configured.
+- **Invalid audio format**: Only WAV and FLAC formats are supported. Convert other formats (MP3, M4A) to WAV before uploading.
+- **Audio too long**: Maximum 24 minutes per request. Split longer audio files into chunks.
+- **Low quality transcription**: Ensure audio is 16kHz mono for best results. Background noise and multiple speakers may reduce accuracy.
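+
+The format and duration issues above can be handled client-side. Below is a minimal conversion sketch, assuming `pydub` and an `ffmpeg` install are available locally (neither ships with this example); it produces 16kHz mono WAV and can also split long recordings:
+
+```python
+from pydub import AudioSegment
+
+# Convert MP3/M4A (or any format ffmpeg understands) to 16kHz mono WAV.
+audio = AudioSegment.from_file("input.mp3")
+audio = audio.set_frame_rate(16000).set_channels(1)
+audio.export("output.wav", format="wav")
+
+# pydub slices by milliseconds, so the same object can be split into
+# chunks that stay under the 24-minute per-request limit.
+first_chunk = audio[: 24 * 60 * 1000]
+first_chunk.export("output_part1.wav", format="wav")
+```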
+
+## Advanced Features
+
+### Timestamp Analysis
+
+The model provides three levels of timestamps:
+
+1. **Word-level**: Start/end times for each word
+2. **Segment-level**: Start/end times for sentence segments
+3. **Character-level**: Start/end times for individual characters
+
+Use these for:
+- Creating precise subtitles
+- Analyzing speech patterns
+- Synchronizing transcripts with video
+- Building interactive transcripts
+
+### Example: Processing Timestamps
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:8888/gpu/transcribe",
+    json={
+        "audio_url": "https://example.com/audio.wav",
+        "timestamps": True
+    }
+)
+
+result = response.json()
+
+# Print word-level timestamps
+for word_info in result["timestamps"]["word"]:
+    print(f"{word_info['start']:.2f}s - {word_info['end']:.2f}s: {word_info['word']}")
+
+# Print segment-level timestamps
+for segment_info in result["timestamps"]["segment"]:
+    print(f"{segment_info['start']:.2f}s - {segment_info['end']:.2f}s: {segment_info['segment']}")
+```
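+
+### Example: Generating SRT Subtitles
+
+A sketch that turns the segment-level timestamps into an SRT caption file. It assumes `result` is the parsed response from the previous example (requested with `"timestamps": True`):
+
+```python
+def to_srt_time(seconds: float) -> str:
+    """Format seconds as an SRT timestamp (HH:MM:SS,mmm)."""
+    ms = int(round(seconds * 1000))
+    hours, ms = divmod(ms, 3_600_000)
+    minutes, ms = divmod(ms, 60_000)
+    secs, ms = divmod(ms, 1_000)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
+
+
+# Write one SRT cue per segment returned by the API.
+with open("captions.srt", "w", encoding="utf-8") as srt:
+    for index, seg in enumerate(result["timestamps"]["segment"], start=1):
+        srt.write(f"{index}\n")
+        srt.write(f"{to_srt_time(seg['start'])} --> {to_srt_time(seg['end'])}\n")
+        srt.write(f"{seg['segment']}\n\n")
+```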
+
+## Model Details
+
+**Architecture**: FastConformer with Token-and-Duration Transducer (TDT) decoder
+
+**Key Features**:
+- Trained with full attention for long-form audio
+- Efficient inference with RTFx of 3380
+- Supports commercial and non-commercial use (CC-BY-4.0 license)
+- Optimized for English transcription
+- Accurate on various domains (audiobooks, podcasts, meetings, earnings calls)
+
+## Error Handling
+
+The API returns detailed error messages:
+
+**Invalid audio URL:**
+```json
+{
+  "status": "error",
+  "error": "audio_url is required",
+  "timestamp": "2026-02-13T10:30:00.000000"
+}
+```
+
+**Processing failure:**
+```json
+{
+  "status": "error",
+  "error": "Failed to download audio: 404 Not Found",
+  "timestamp": "2026-02-13T10:30:00.000000"
+}
+```
+
+## References
+
+- [Parakeet-TDT-0.6B-v2 Model Card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
+- [NVIDIA NeMo Toolkit](https://github.com/NVIDIA/NeMo)
+- [Flash Documentation](https://docs.runpod.io)
+- [Model Demo Space](https://huggingface.co/spaces/nvidia/parakeet-tdt-0.6b-v2)
diff --git a/02_ml_inference/02_speech_to_text/__init__.py b/02_ml_inference/02_speech_to_text/__init__.py
new file mode 100644
index 0000000..27a4247
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/__init__.py
@@ -0,0 +1 @@
+"""Speech-to-text example using NVIDIA Parakeet-TDT-0.6B-v2."""
diff --git a/02_ml_inference/02_speech_to_text/main.py b/02_ml_inference/02_speech_to_text/main.py
new file mode 100644
index 0000000..6e606dd
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/main.py
@@ -0,0 +1,44 @@
+import logging
+import os
+
+from fastapi import FastAPI
+
+from workers.gpu import gpu_router
+
+logger = logging.getLogger(__name__)
+
+
+app = FastAPI(
+    title="Parakeet-TDT Speech-to-Text API",
+    description="Speech-to-Text API using NVIDIA Parakeet-TDT-0.6B-v2 on RunPod serverless GPUs",
+    version="1.0.0",
+)
+
+app.include_router(gpu_router, prefix="/gpu", tags=["Speech-to-Text"])
+
+
+@app.get("/")
+def home():
+    return {
+        "message": "Parakeet-TDT Speech-to-Text API",
+        "docs": "/docs",
+        "endpoints": {
+            "transcribe": "/gpu/transcribe",
+            "model_info": "/gpu/model-info",
+        },
+    }
+
+
+@app.get("/ping")
+def ping():
+    return {"status": "healthy"}
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    host = os.getenv("FLASH_HOST", "localhost")
+    port = int(os.getenv("FLASH_PORT", 8888))
+    logger.info(f"Starting Flash server on {host}:{port}")
+
+    uvicorn.run(app, host=host, port=port)
diff --git a/02_ml_inference/02_speech_to_text/mothership.py b/02_ml_inference/02_speech_to_text/mothership.py
new file mode 100644
index 0000000..571a54a
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/mothership.py
@@ -0,0 +1,55 @@
+"""
+Mothership Endpoint Configuration
+
+The mothership endpoint serves your FastAPI application routes.
+It is automatically deployed as a CPU-optimized load-balanced endpoint.
+
+To customize this configuration:
+- Modify worker scaling: change workersMin and workersMax values
+- Use GPU load balancer: import LiveLoadBalancer instead of CpuLiveLoadBalancer
+- Change endpoint name: update the 'name' parameter
+
+To disable mothership deployment:
+- Delete this file, or
+- Comment out the 'mothership' variable below
+
+Documentation: https://docs.runpod.io/flash/mothership
+"""
+
+from runpod_flash import CpuLiveLoadBalancer
+
+# Mothership endpoint configuration
+# This serves your FastAPI app routes from main.py
+mothership = CpuLiveLoadBalancer(
+    name="02_02_speech_to_text-mothership",
+    workersMin=1,
+    workersMax=3,
+)
+
+# Examples of customization:
+
+# Increase scaling for high traffic
+# mothership = CpuLiveLoadBalancer(
+#     name="mothership",
+#     workersMin=2,
+#     workersMax=10,
+# )
+
+# Use GPU-based load balancer instead of CPU
+# (requires importing LiveLoadBalancer and GpuGroup)
+# from runpod_flash import GpuGroup, LiveLoadBalancer
+# mothership = LiveLoadBalancer(
+#     name="mothership",
+#     gpus=[GpuGroup.ANY],
+# )
+
+# Custom endpoint name
+# mothership = CpuLiveLoadBalancer(
+#     name="my-api-gateway",
+#     workersMin=1,
+#     workersMax=3,
+# )
+
+# To disable mothership:
+# - Delete this entire file, or
+# - Comment out the 'mothership' variable above
diff --git a/02_ml_inference/02_speech_to_text/pyproject.toml b/02_ml_inference/02_speech_to_text/pyproject.toml
new file mode 100644
index 0000000..bd15b36
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/pyproject.toml
@@ -0,0 +1,10 @@
+[project]
+name = "02_speech_to_text"
+version = "0.1.0"
+description = "Speech-to-Text using NVIDIA Parakeet-TDT-0.6B-v2 on RunPod serverless GPUs"
+requires-python = ">=3.10"
+dependencies = [
+    "runpod-flash",
+    "fastapi>=0.104.0",
+    "uvicorn>=0.24.0",
+]
diff --git a/02_ml_inference/02_speech_to_text/requirements.txt b/02_ml_inference/02_speech_to_text/requirements.txt
new file mode 100644
index 0000000..a73ed1a
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/requirements.txt
@@ -0,0 +1 @@
+runpod-flash
diff --git a/02_ml_inference/02_speech_to_text/workers/__init__.py b/02_ml_inference/02_speech_to_text/workers/__init__.py
new file mode 100644
index 0000000..2bfb3fa
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/workers/__init__.py
@@ -0,0 +1 @@
+"""Workers package for speech-to-text."""
diff --git a/02_ml_inference/02_speech_to_text/workers/gpu/__init__.py b/02_ml_inference/02_speech_to_text/workers/gpu/__init__.py
new file mode 100644
index 0000000..58c82bd
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/workers/gpu/__init__.py
@@ -0,0 +1,63 @@
+from fastapi import APIRouter
+from pydantic import BaseModel, Field, HttpUrl
+
+from .endpoint import get_model_info, transcribe_audio
+
+gpu_router = APIRouter()
+
+
+class TranscribeRequest(BaseModel):
+    """Request model for audio transcription."""
+
+    audio_url: HttpUrl = Field(
+        ..., description="URL of the audio file to transcribe (WAV or FLAC format)"
+    )
+    timestamps: bool = Field(
+        default=False,
+        description="Include word, segment, and character-level timestamps in the output",
+    )
+
+
+class TimestampInfo(BaseModel):
+    """Timestamp information for words, segments, or characters."""
+
+    start: float = Field(..., description="Start time in seconds")
+    end: float = Field(..., description="End time in seconds")
+    word: str | None = Field(None, description="Word text (for word timestamps)")
+    segment: str | None = Field(None, description="Segment text (for segment timestamps)")
+    char: str | None = Field(None, description="Character (for char timestamps)")
+
+
+class TranscribeResponse(BaseModel):
+    """Response model for audio transcription."""
+
+    status: str = Field(..., description="Status of the request")
+    text: str | None = Field(None, description="Transcribed text with punctuation and capitalization")
+    duration: float | None = Field(None, description="Audio duration in seconds")
+    processing_time: float | None = Field(None, description="Processing time in seconds")
+    sample_rate: int | None = Field(None, description="Audio sample rate")
+    timestamps: dict | None = Field(None, description="Timestamp information (if requested)")
+    error: str | None = Field(None, description="Error message if status is error")
+
+
+@gpu_router.post("/transcribe", response_model=TranscribeResponse)
+async def transcribe_audio_endpoint(request: TranscribeRequest) -> dict:
+    """
+    Transcribe audio from a URL using NVIDIA Parakeet-TDT-0.6B-v2.
+
+    Returns JSON with transcribed text and optional timestamps.
+    """
+    payload = {
+        "audio_url": str(request.audio_url),
+        "timestamps": request.timestamps,
+    }
+
+    result = await transcribe_audio(payload)
+    return result
+
+
+@gpu_router.get("/model-info")
+async def model_info() -> dict:
+    """Get information about the Parakeet-TDT model and its capabilities."""
+    result = await get_model_info({})
+    return result
diff --git a/02_ml_inference/02_speech_to_text/workers/gpu/endpoint.py b/02_ml_inference/02_speech_to_text/workers/gpu/endpoint.py
new file mode 100644
index 0000000..d43205e
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/workers/gpu/endpoint.py
@@ -0,0 +1,154 @@
+from runpod_flash import GpuGroup, LiveServerless, remote
+
+# GPU config for Parakeet-TDT - needs 8GB+ VRAM for 0.6B model
+# Naming convention: {category}_{example}_{worker_type}
+gpu_config = LiveServerless(
+    name="02_02_speech_to_text_gpu",
+    gpus=[GpuGroup.ADA_24],  # RTX 4090 or similar with 24GB (can work with less)
+    workersMin=0,
+    workersMax=3,
+    idleTimeout=5,
+)
+
+
+@remote(
+    resource_config=gpu_config,
+    dependencies=[
+        "nemo_toolkit[asr]",
+        "torch",
+    ],
+)
+async def transcribe_audio(input_data: dict) -> dict:
+    """
+    Transcribe audio using NVIDIA Parakeet-TDT-0.6B-v2 model.
+
+    Input:
+        audio_url: str - URL of the audio file to transcribe (WAV or FLAC)
+        timestamps: bool - Whether to include timestamps in the output (default: False)
+
+    Returns:
+        text: str - Transcribed text with punctuation and capitalization
+        timestamps: dict (optional) - Word, segment, and character-level timestamps
+        duration: float - Audio duration in seconds
+    """
+    import time
+    from datetime import datetime
+    from io import BytesIO
+    from urllib.request import urlopen
+
+    import nemo.collections.asr as nemo_asr
+    import soundfile as sf
+
+    audio_url = input_data.get("audio_url")
+    include_timestamps = input_data.get("timestamps", False)
+
+    if not audio_url:
+        return {
+            "status": "error",
+            "error": "audio_url is required",
+        }
+
+    try:
+        # Load the model (cached after first load)
+        asr_model = nemo_asr.models.ASRModel.from_pretrained(
+            model_name="nvidia/parakeet-tdt-0.6b-v2"
+        )
+
+        # Download audio file
+        start_time = time.time()
+        response = urlopen(audio_url)
+        audio_bytes = response.read()
+
+        # Load audio data
+        audio_data, sample_rate = sf.read(BytesIO(audio_bytes))
+
+        # Calculate duration
+        duration = len(audio_data) / sample_rate
+
+        # Save to temporary file (NeMo expects file paths)
+        import tempfile
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            temp_path = temp_file.name
+            sf.write(temp_path, audio_data, sample_rate)
+
+        # Transcribe with or without timestamps
+        output = asr_model.transcribe([temp_path], timestamps=include_timestamps)
+
+        # Clean up temp file
+        import os
+
+        os.unlink(temp_path)
+
+        processing_time = time.time() - start_time
+
+        result = {
+            "status": "success",
+            "text": output[0].text,
+            "duration": duration,
+            "processing_time": processing_time,
+            "sample_rate": sample_rate,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+        if include_timestamps and hasattr(output[0], "timestamp"):
+            result["timestamps"] = {
+                "word": output[0].timestamp.get("word", []),
+                "segment": output[0].timestamp.get("segment", []),
+                "char": output[0].timestamp.get("char", []),
+            }
+
+        return result
+
+    except Exception as e:
+        return {
+            "status": "error",
+            "error": str(e),
+            "timestamp": datetime.now().isoformat(),
+        }
+
+
+@remote(resource_config=gpu_config, dependencies=["nemo_toolkit[asr]"])
+async def get_model_info(input_data: dict) -> dict:
+    """Get model information and capabilities."""
+    return {
+        "status": "success",
+        "model": "nvidia/parakeet-tdt-0.6b-v2",
+        "parameters": "600M",
+        "architecture": "FastConformer-TDT",
+        "supported_formats": ["WAV", "FLAC"],
+        "sample_rate": "16kHz",
+        "max_duration": "24 minutes per pass",
+        "features": [
+            "Automatic punctuation",
+            "Automatic capitalization",
+            "Word-level timestamps",
+            "Segment-level timestamps",
+            "Character-level timestamps",
+        ],
+        "license": "CC-BY-4.0",
+    }
+
+
+# Test locally with: python -m workers.gpu.endpoint
+if __name__ == "__main__":
+    import asyncio
+
+    # Test model info
+    print("Model Information:")
+    result = asyncio.run(get_model_info({}))
+    print(result)
+
+    # Test transcription (requires GPU)
+    test_payload = {
+        "audio_url": "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav",
+        "timestamps": True,
+    }
+    print(f"\nTesting transcription with payload: {test_payload}")
+    result = asyncio.run(transcribe_audio(test_payload))
+    if result["status"] == "success":
+        print(f"Success! Transcription: {result['text']}")
+        if "timestamps" in result:
+            print(f"Timestamps available: {len(result['timestamps']['word'])} words")
+    else:
+        print(f"Error: {result}")