diff --git a/02_ml_inference/02_speech_to_text/.env.example b/02_ml_inference/02_speech_to_text/.env.example
new file mode 100644
index 0000000..8360712
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/.env.example
@@ -0,0 +1,4 @@
+# FLASH_HOST=localhost
+# FLASH_PORT=8888
+# LOG_LEVEL=INFO
+# RUNPOD_API_KEY=your_api_key_here
diff --git a/02_ml_inference/02_speech_to_text/.flashignore b/02_ml_inference/02_speech_to_text/.flashignore
new file mode 100644
index 0000000..4ce0adc
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/.flashignore
@@ -0,0 +1,40 @@
+# Flash Build Ignore Patterns
+
+# Python cache
+__pycache__/
+*.pyc
+
+# Virtual environments
+venv/
+.venv/
+env/
+
+# IDE
+.vscode/
+.idea/
+
+# Environment files
+.env
+.env.local
+
+# Git
+.git/
+.gitignore
+
+# Build artifacts
+dist/
+build/
+*.egg-info/
+
+# Flash resources
+.runpod/
+
+# Tests
+tests/
+test_*.py
+*_test.py
+
+# Documentation
+docs/
+*.md
+!README.md
diff --git a/02_ml_inference/02_speech_to_text/.gitignore b/02_ml_inference/02_speech_to_text/.gitignore
new file mode 100644
index 0000000..f067358
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/.gitignore
@@ -0,0 +1,44 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+.venv/
+ENV/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Environment
+.env
+.env.local
+
+# Flash
+.runpod/
+dist/
+
+# OS
+.DS_Store
+Thumbs.db
diff --git a/02_ml_inference/02_speech_to_text/README.md b/02_ml_inference/02_speech_to_text/README.md
new file mode 100644
index 0000000..7dde6d7
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/README.md
@@ -0,0 +1,289 @@
+# Speech-to-Text with Parakeet-TDT
+
+Automatic Speech Recognition (ASR) API using [NVIDIA Parakeet-TDT-0.6B-v2](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2) running on RunPod serverless GPUs.
+
+## Overview
+
+This example demonstrates running a 600M parameter automatic speech recognition model on serverless GPU infrastructure. Parakeet-TDT is a high-performance English transcription model that provides accurate text output with automatic punctuation, capitalization, and detailed timestamp predictions.
+
+## What You'll Learn
+
+- Running a NeMo ASR model with `@remote` on GPU workers
+- Processing audio files from URLs with automatic format handling
+- Returning transcription results with word-level timestamps
+- Using efficient FastConformer-TDT architecture for real-time inference
+- Input validation for audio processing endpoints
+
+## Architecture
+
+Parakeet-TDT-0.6B-v2 is based on the FastConformer-TDT architecture and achieves:
+- **Average WER**: 6.05% across major benchmarks
+- **RTFx Performance**: 3380 at batch size 128
+- **Max Duration**: Up to 24 minutes of audio per pass
+- **Parameters**: 600 million
+
+## Quick Start
+
+### Prerequisites
+
+- Python 3.10+
+- RunPod API key ([get one here](https://docs.runpod.io/get-started/api-keys))
+
+### Setup
+
+```bash
+cd 02_ml_inference/02_speech_to_text
+pip install -r requirements.txt
+cp .env.example .env
+# Add your RUNPOD_API_KEY to .env
+```
+
+### Run
+
+```bash
+flash run
+```
+
+First run provisions the endpoint (~1-2 min). Server starts at http://localhost:8888.
+
+### Test the Endpoint
+
+**Basic transcription:**
+```bash
+curl -X POST http://localhost:8888/gpu/transcribe \
+  -H "Content-Type: application/json" \
+  -d '{
+    "audio_url": "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
+  }'
+```
+
+**With timestamps:**
+```bash
+curl -X POST http://localhost:8888/gpu/transcribe \
+  -H "Content-Type: application/json" \
+  -d '{
+    "audio_url": "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav",
+    "timestamps": true
+  }'
+```
+
+**Get model information:**
+```bash
+curl http://localhost:8888/gpu/model-info
+```
+
+Visit http://localhost:8888/docs for interactive API documentation.
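+
+The same requests can be sent from Python. Below is a minimal client sketch using `requests` (not one of this example's dependencies); it checks the `status` field that every response carries:
+
+```python
+import requests
+
+# Equivalent to the curl examples above; assumes the local Flash server is running.
+response = requests.post(
+    "http://localhost:8888/gpu/transcribe",
+    json={
+        "audio_url": "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav",
+        "timestamps": False,
+    },
+    timeout=300,  # the first request may include a cold start
+)
+response.raise_for_status()
+result = response.json()
+
+if result["status"] == "success":
+    print(f"Transcribed {result['duration']:.1f}s of audio: {result['text']}")
+else:
+    print(f"Transcription failed: {result['error']}")
+```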
+
+## API Endpoints
+
+### POST /gpu/transcribe
+
+Transcribe audio from a URL and return text with optional timestamps.
+
+**Request:**
+```json
+{
+  "audio_url": "https://example.com/audio.wav",
+  "timestamps": false
+}
+```
+
+**Response (without timestamps):**
+```json
+{
+  "status": "success",
+  "text": "This is the transcribed text with proper punctuation and capitalization.",
+  "duration": 5.2,
+  "processing_time": 0.8,
+  "sample_rate": 16000,
+  "timestamp": "2026-02-13T10:30:00.000000"
+}
+```
+
+**Response (with timestamps):**
+```json
+{
+  "status": "success",
+  "text": "This is the transcribed text.",
+  "duration": 5.2,
+  "processing_time": 0.9,
+  "sample_rate": 16000,
+  "timestamps": {
+    "word": [
+      {"start": 0.0, "end": 0.2, "word": "This"},
+      {"start": 0.2, "end": 0.4, "word": "is"},
+      {"start": 0.4, "end": 0.6, "word": "the"}
+    ],
+    "segment": [
+      {"start": 0.0, "end": 5.2, "segment": "This is the transcribed text."}
+    ],
+    "char": []
+  },
+  "timestamp": "2026-02-13T10:30:00.000000"
+}
+```
+
+### GET /gpu/model-info
+
+Get information about the Parakeet-TDT model and its capabilities.
+
+**Response:**
+```json
+{
+  "status": "success",
+  "model": "nvidia/parakeet-tdt-0.6b-v2",
+  "parameters": "600M",
+  "architecture": "FastConformer-TDT",
+  "supported_formats": ["WAV", "FLAC"],
+  "sample_rate": "16kHz",
+  "max_duration": "24 minutes per pass",
+  "features": [
+    "Automatic punctuation",
+    "Automatic capitalization",
+    "Word-level timestamps",
+    "Segment-level timestamps",
+    "Character-level timestamps"
+  ],
+  "license": "CC-BY-4.0"
+}
+```
+
+## Parameters
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| audio_url | string (URL) | Yes | - | URL of the audio file to transcribe (WAV or FLAC) |
+| timestamps | boolean | No | false | Include word, segment, and character-level timestamps |
+
+## Supported Audio Formats
+
+- **Formats**: WAV, FLAC
+- **Sample Rate**: 16kHz (recommended)
+- **Channels**: Mono (single channel recommended)
+- **Duration**: Up to 24 minutes per request
+
+## Performance Metrics
+
+**Word Error Rate (WER) on Benchmarks:**
+
+| Dataset | WER |
+|---------|-----|
+| LibriSpeech test-clean | 1.69% |
+| LibriSpeech test-other | 3.19% |
+| GigaSpeech | 9.74% |
+| Earnings-22 | 11.15% |
+| AMI | 11.16% |
+
+**Average WER**: 6.05%
+
+## Deployment
+
+```bash
+flash build
+flash deploy new production
+flash deploy send production
+```
+
+## Cost Estimates
+
+- Workers scale to 0 when idle (no charges)
+- Pay only for GPU time during transcription
+- First request after idle: ~30-60s (cold start for model loading)
+- Subsequent requests: ~1-3s for short audio clips
+- GPU: RTX 4090 (24GB VRAM) or similar
+
+## Use Cases
+
+- **Podcast Transcription**: Convert podcast episodes to searchable text
+- **Meeting Notes**: Transcribe recorded meetings with timestamps
+- **Caption Generation**: Create accurate captions for video content
+- **Voice Commands**: Process voice commands with low latency
+- **Call Analytics**: Transcribe customer support calls for analysis
+
+## Common Issues
+
+- **Cold start delay**: First request after idle takes 30-60s to load the NeMo model. Use `flash run --auto-provision` during development.
+- **Out of memory**: The model requires 8GB+ VRAM. Ensure `GpuGroup.ADA_24` or similar is configured.
+- **Invalid audio format**: Only WAV and FLAC formats are supported. Convert other formats (MP3, M4A) to WAV before uploading.
+- **Audio too long**: Maximum 24 minutes per request. Split longer audio files into chunks.
+- **Low quality transcription**: Ensure audio is 16kHz mono for best results. Background noise and multiple speakers may reduce accuracy.
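+
+The format and duration issues above can be handled client-side. Below is a minimal conversion sketch, assuming `pydub` and an `ffmpeg` install are available locally (neither ships with this example); it produces 16kHz mono WAV and can also split long recordings:
+
+```python
+from pydub import AudioSegment
+
+# Convert MP3/M4A (or any format ffmpeg understands) to 16kHz mono WAV.
+audio = AudioSegment.from_file("input.mp3")
+audio = audio.set_frame_rate(16000).set_channels(1)
+audio.export("output.wav", format="wav")
+
+# pydub slices by milliseconds, so the same object can be split into
+# chunks that stay under the 24-minute per-request limit.
+first_chunk = audio[: 24 * 60 * 1000]
+first_chunk.export("output_part1.wav", format="wav")
+```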
+
+## Advanced Features
+
+### Timestamp Analysis
+
+The model provides three levels of timestamps:
+
+1. **Word-level**: Start/end times for each word
+2. **Segment-level**: Start/end times for sentence segments
+3. **Character-level**: Start/end times for individual characters
+
+Use these for:
+- Creating precise subtitles
+- Analyzing speech patterns
+- Synchronizing transcripts with video
+- Building interactive transcripts
+
+### Example: Processing Timestamps
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:8888/gpu/transcribe",
+    json={
+        "audio_url": "https://example.com/audio.wav",
+        "timestamps": True
+    }
+)
+
+result = response.json()
+
+# Print word-level timestamps
+for word_info in result["timestamps"]["word"]:
+    print(f"{word_info['start']:.2f}s - {word_info['end']:.2f}s: {word_info['word']}")
+
+# Print segment-level timestamps
+for segment_info in result["timestamps"]["segment"]:
+    print(f"{segment_info['start']:.2f}s - {segment_info['end']:.2f}s: {segment_info['segment']}")
+```
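+
+### Example: Generating SRT Subtitles
+
+A sketch that turns the segment-level timestamps into an SRT caption file. It assumes `result` is the parsed response from the previous example (requested with `"timestamps": True`):
+
+```python
+def to_srt_time(seconds: float) -> str:
+    """Format seconds as an SRT timestamp (HH:MM:SS,mmm)."""
+    ms = int(round(seconds * 1000))
+    hours, ms = divmod(ms, 3_600_000)
+    minutes, ms = divmod(ms, 60_000)
+    secs, ms = divmod(ms, 1_000)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
+
+
+# Write one SRT cue per segment returned by the API.
+with open("captions.srt", "w", encoding="utf-8") as srt:
+    for index, seg in enumerate(result["timestamps"]["segment"], start=1):
+        srt.write(f"{index}\n")
+        srt.write(f"{to_srt_time(seg['start'])} --> {to_srt_time(seg['end'])}\n")
+        srt.write(f"{seg['segment']}\n\n")
+```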
+
+## Model Details
+
+**Architecture**: FastConformer with Token-and-Duration Transducer (TDT) decoder
+
+**Key Features**:
+- Trained with full attention for long-form audio
+- Efficient inference with RTFx of 3380
+- Supports commercial and non-commercial use (CC-BY-4.0 license)
+- Optimized for English transcription
+- Accurate on various domains (audiobooks, podcasts, meetings, earnings calls)
+
+## Error Handling
+
+The API returns detailed error messages:
+
+**Invalid audio URL:**
+```json
+{
+  "status": "error",
+  "error": "audio_url is required",
+  "timestamp": "2026-02-13T10:30:00.000000"
+}
+```
+
+**Processing failure:**
+```json
+{
+  "status": "error",
+  "error": "Failed to download audio: 404 Not Found",
+  "timestamp": "2026-02-13T10:30:00.000000"
+}
+```
+
+## References
+
+- [Parakeet-TDT-0.6B-v2 Model Card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
+- [NVIDIA NeMo Toolkit](https://github.com/NVIDIA/NeMo)
+- [Flash Documentation](https://docs.runpod.io)
+- [Model Demo Space](https://huggingface.co/spaces/nvidia/parakeet-tdt-0.6b-v2)
diff --git a/02_ml_inference/02_speech_to_text/__init__.py b/02_ml_inference/02_speech_to_text/__init__.py
new file mode 100644
index 0000000..27a4247
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/__init__.py
@@ -0,0 +1 @@
+"""Speech-to-text example using NVIDIA Parakeet-TDT-0.6B-v2."""
diff --git a/02_ml_inference/02_speech_to_text/main.py b/02_ml_inference/02_speech_to_text/main.py
new file mode 100644
index 0000000..6e606dd
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/main.py
@@ -0,0 +1,44 @@
+import logging
+import os
+
+from fastapi import FastAPI
+
+from workers.gpu import gpu_router
+
+logger = logging.getLogger(__name__)
+
+
+app = FastAPI(
+    title="Parakeet-TDT Speech-to-Text API",
+    description="Speech-to-Text API using NVIDIA Parakeet-TDT-0.6B-v2 on RunPod serverless GPUs",
+    version="1.0.0",
+)
+
+app.include_router(gpu_router, prefix="/gpu", tags=["Speech-to-Text"])
+
+
+@app.get("/")
+def home():
+    return {
+        "message": "Parakeet-TDT Speech-to-Text API",
+        "docs": "/docs",
+        "endpoints": {
+            "transcribe": "/gpu/transcribe",
+            "model_info": "/gpu/model-info",
+        },
+    }
+
+
+@app.get("/ping")
+def ping():
+    return {"status": "healthy"}
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    host = os.getenv("FLASH_HOST", "localhost")
+    port = int(os.getenv("FLASH_PORT", 8888))
+    logger.info(f"Starting Flash server on {host}:{port}")
+
+    uvicorn.run(app, host=host, port=port)
diff --git a/02_ml_inference/02_speech_to_text/mothership.py b/02_ml_inference/02_speech_to_text/mothership.py
new file mode 100644
index 0000000..571a54a
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/mothership.py
@@ -0,0 +1,55 @@
+"""
+Mothership Endpoint Configuration
+
+The mothership endpoint serves your FastAPI application routes.
+It is automatically deployed as a CPU-optimized load-balanced endpoint.
+
+To customize this configuration:
+- Modify worker scaling: change workersMin and workersMax values
+- Use GPU load balancer: import LiveLoadBalancer instead of CpuLiveLoadBalancer
+- Change endpoint name: update the 'name' parameter
+
+To disable mothership deployment:
+- Delete this file, or
+- Comment out the 'mothership' variable below
+
+Documentation: https://docs.runpod.io/flash/mothership
+"""
+
+from runpod_flash import CpuLiveLoadBalancer
+
+# Mothership endpoint configuration
+# This serves your FastAPI app routes from main.py
+mothership = CpuLiveLoadBalancer(
+    name="02_02_speech_to_text-mothership",
+    workersMin=1,
+    workersMax=3,
+)
+
+# Examples of customization:
+
+# Increase scaling for high traffic
+# mothership = CpuLiveLoadBalancer(
+#     name="mothership",
+#     workersMin=2,
+#     workersMax=10,
+# )
+
+# Use GPU-based load balancer instead of CPU
+# (requires importing LiveLoadBalancer and GpuGroup)
+# from runpod_flash import GpuGroup, LiveLoadBalancer
+# mothership = LiveLoadBalancer(
+#     name="mothership",
+#     gpus=[GpuGroup.ANY],
+# )
+
+# Custom endpoint name
+# mothership = CpuLiveLoadBalancer(
+#     name="my-api-gateway",
+#     workersMin=1,
+#     workersMax=3,
+# )
+
+# To disable mothership:
+# - Delete this entire file, or
+# - Comment out the 'mothership' variable above
diff --git a/02_ml_inference/02_speech_to_text/pyproject.toml b/02_ml_inference/02_speech_to_text/pyproject.toml
new file mode 100644
index 0000000..bd15b36
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/pyproject.toml
@@ -0,0 +1,10 @@
+[project]
+name = "02_speech_to_text"
+version = "0.1.0"
+description = "Speech-to-Text using NVIDIA Parakeet-TDT-0.6B-v2 on RunPod serverless GPUs"
+requires-python = ">=3.10"
+dependencies = [
+    "runpod-flash",
+    "fastapi>=0.104.0",
+    "uvicorn>=0.24.0",
+]
diff --git a/02_ml_inference/02_speech_to_text/requirements.txt b/02_ml_inference/02_speech_to_text/requirements.txt
new file mode 100644
index 0000000..a73ed1a
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/requirements.txt
@@ -0,0 +1 @@
+runpod-flash
diff --git a/02_ml_inference/02_speech_to_text/workers/__init__.py b/02_ml_inference/02_speech_to_text/workers/__init__.py
new file mode 100644
index 0000000..2bfb3fa
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/workers/__init__.py
@@ -0,0 +1 @@
+"""Workers package for speech-to-text."""
diff --git a/02_ml_inference/02_speech_to_text/workers/gpu/__init__.py b/02_ml_inference/02_speech_to_text/workers/gpu/__init__.py
new file mode 100644
index 0000000..58c82bd
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/workers/gpu/__init__.py
@@ -0,0 +1,63 @@
+from fastapi import APIRouter
+from pydantic import BaseModel, Field, HttpUrl
+
+from .endpoint import get_model_info, transcribe_audio
+
+gpu_router = APIRouter()
+
+
+class TranscribeRequest(BaseModel):
+    """Request model for audio transcription."""
+
+    audio_url: HttpUrl = Field(
+        ..., description="URL of the audio file to transcribe (WAV or FLAC format)"
+    )
+    timestamps: bool = Field(
+        default=False,
+        description="Include word, segment, and character-level timestamps in the output",
+    )
+
+
+class TimestampInfo(BaseModel):
+    """Timestamp information for words, segments, or characters."""
+
+    start: float = Field(..., description="Start time in seconds")
+    end: float = Field(..., description="End time in seconds")
+    word: str | None = Field(None, description="Word text (for word timestamps)")
+    segment: str | None = Field(None, description="Segment text (for segment timestamps)")
+    char: str | None = Field(None, description="Character (for char timestamps)")
+
+
+class TranscribeResponse(BaseModel):
+    """Response model for audio transcription."""
+
+    status: str = Field(..., description="Status of the request")
+    text: str | None = Field(None, description="Transcribed text with punctuation and capitalization")
+    duration: float | None = Field(None, description="Audio duration in seconds")
+    processing_time: float | None = Field(None, description="Processing time in seconds")
+    sample_rate: int | None = Field(None, description="Audio sample rate")
+    timestamps: dict | None = Field(None, description="Timestamp information (if requested)")
+    error: str | None = Field(None, description="Error message if status is error")
+
+
+@gpu_router.post("/transcribe", response_model=TranscribeResponse)
+async def transcribe_audio_endpoint(request: TranscribeRequest) -> dict:
+    """
+    Transcribe audio from a URL using NVIDIA Parakeet-TDT-0.6B-v2.
+
+    Returns JSON with transcribed text and optional timestamps.
+    """
+    payload = {
+        "audio_url": str(request.audio_url),
+        "timestamps": request.timestamps,
+    }
+
+    result = await transcribe_audio(payload)
+    return result
+
+
+@gpu_router.get("/model-info")
+async def model_info() -> dict:
+    """Get information about the Parakeet-TDT model and its capabilities."""
+    result = await get_model_info({})
+    return result
diff --git a/02_ml_inference/02_speech_to_text/workers/gpu/endpoint.py b/02_ml_inference/02_speech_to_text/workers/gpu/endpoint.py
new file mode 100644
index 0000000..d43205e
--- /dev/null
+++ b/02_ml_inference/02_speech_to_text/workers/gpu/endpoint.py
@@ -0,0 +1,154 @@
+from runpod_flash import GpuGroup, LiveServerless, remote
+
+# GPU config for Parakeet-TDT - needs 8GB+ VRAM for 0.6B model
+# Naming convention: {category}_{example}_{worker_type}
+gpu_config = LiveServerless(
+    name="02_02_speech_to_text_gpu",
+    gpus=[GpuGroup.ADA_24],  # RTX 4090 or similar with 24GB (can work with less)
+    workersMin=0,
+    workersMax=3,
+    idleTimeout=5,
+)
+
+
+@remote(
+    resource_config=gpu_config,
+    dependencies=[
+        "nemo_toolkit[asr]",
+        "torch",
+    ],
+)
+async def transcribe_audio(input_data: dict) -> dict:
+    """
+    Transcribe audio using NVIDIA Parakeet-TDT-0.6B-v2 model.
+
+    Input:
+        audio_url: str - URL of the audio file to transcribe (WAV or FLAC)
+        timestamps: bool - Whether to include timestamps in the output (default: False)
+
+    Returns:
+        text: str - Transcribed text with punctuation and capitalization
+        timestamps: dict (optional) - Word, segment, and character-level timestamps
+        duration: float - Audio duration in seconds
+    """
+    import time
+    from datetime import datetime
+    from io import BytesIO
+    from urllib.request import urlopen
+
+    import nemo.collections.asr as nemo_asr
+    import soundfile as sf
+
+    audio_url = input_data.get("audio_url")
+    include_timestamps = input_data.get("timestamps", False)
+
+    if not audio_url:
+        return {
+            "status": "error",
+            "error": "audio_url is required",
+        }
+
+    try:
+        # Load the model (cached after first load)
+        asr_model = nemo_asr.models.ASRModel.from_pretrained(
+            model_name="nvidia/parakeet-tdt-0.6b-v2"
+        )
+
+        # Download audio file
+        start_time = time.time()
+        response = urlopen(audio_url)
+        audio_bytes = response.read()
+
+        # Load audio data
+        audio_data, sample_rate = sf.read(BytesIO(audio_bytes))
+
+        # Calculate duration
+        duration = len(audio_data) / sample_rate
+
+        # Save to temporary file (NeMo expects file paths)
+        import tempfile
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            temp_path = temp_file.name
+            sf.write(temp_path, audio_data, sample_rate)
+
+        # Transcribe with or without timestamps
+        output = asr_model.transcribe([temp_path], timestamps=include_timestamps)
+
+        # Clean up temp file
+        import os
+
+        os.unlink(temp_path)
+
+        processing_time = time.time() - start_time
+
+        result = {
+            "status": "success",
+            "text": output[0].text,
+            "duration": duration,
+            "processing_time": processing_time,
+            "sample_rate": sample_rate,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+        if include_timestamps and hasattr(output[0], "timestamp"):
+            result["timestamps"] = {
+                "word": output[0].timestamp.get("word", []),
+                "segment": output[0].timestamp.get("segment", []),
+                "char": output[0].timestamp.get("char", []),
+            }
+
+        return result
+
+    except Exception as e:
+        return {
+            "status": "error",
+            "error": str(e),
+            "timestamp": datetime.now().isoformat(),
+        }
+
+
+@remote(resource_config=gpu_config, dependencies=["nemo_toolkit[asr]"])
+async def get_model_info(input_data: dict) -> dict:
+    """Get model information and capabilities."""
+    return {
+        "status": "success",
+        "model": "nvidia/parakeet-tdt-0.6b-v2",
+        "parameters": "600M",
+        "architecture": "FastConformer-TDT",
+        "supported_formats": ["WAV", "FLAC"],
+        "sample_rate": "16kHz",
+        "max_duration": "24 minutes per pass",
+        "features": [
+            "Automatic punctuation",
+            "Automatic capitalization",
+            "Word-level timestamps",
+            "Segment-level timestamps",
+            "Character-level timestamps",
+        ],
+        "license": "CC-BY-4.0",
+    }
+
+
+# Test locally with: python -m workers.gpu.endpoint
+if __name__ == "__main__":
+    import asyncio
+
+    # Test model info
+    print("Model Information:")
+    result = asyncio.run(get_model_info({}))
+    print(result)
+
+    # Test transcription (requires GPU)
+    test_payload = {
+        "audio_url": "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav",
+        "timestamps": True,
+    }
+    print(f"\nTesting transcription with payload: {test_payload}")
+    result = asyncio.run(transcribe_audio(test_payload))
+    if result["status"] == "success":
+        print(f"Success! Transcription: {result['text']}")
+        if "timestamps" in result:
+            print(f"Timestamps available: {len(result['timestamps']['word'])} words")
+    else:
+        print(f"Error: {result}")