5 changes: 5 additions & 0 deletions 02_ml_inference/01_text_generation/.env.example
@@ -0,0 +1,5 @@
# RUNPOD_API_KEY=your_api_key_here
# FLASH_HOST=localhost
# FLASH_PORT=8888
# LOG_LEVEL=INFO
# HF_TOKEN=your_huggingface_token
40 changes: 40 additions & 0 deletions 02_ml_inference/01_text_generation/.flashignore
@@ -0,0 +1,40 @@
# Flash Build Ignore Patterns

# Python cache
__pycache__/
*.pyc

# Virtual environments
venv/
.venv/
env/

# IDE
.vscode/
.idea/

# Environment files
.env
.env.local

# Git
.git/
.gitignore

# Build artifacts
dist/
build/
*.egg-info/

# Flash resources
.tetra_resources.pkl

# Tests
tests/
test_*.py
*_test.py

# Documentation
docs/
*.md
!README.md
1 change: 1 addition & 0 deletions 02_ml_inference/01_text_generation/__init__.py
@@ -0,0 +1 @@
"""LLM chat inference on a serverless GPU example."""
170 changes: 170 additions & 0 deletions 02_ml_inference/01_text_generation/gpu_worker.py
@@ -0,0 +1,170 @@
## LLM chat inference on a serverless GPU
# This example runs a small chat LLM (Llama 3.2 1B Instruct) on Runpod serverless GPUs
# using `transformers.pipeline`.
#
# Call it via the FastAPI endpoint (`POST /gpu/llm`) or run this module directly for
# a quick smoke test.
#
# Scaling behavior is controlled by the `LiveServerless` config below.
import os

from fastapi import APIRouter
from pydantic import BaseModel

from tetra_rp import (
    GpuGroup,
    LiveServerless,
    remote,
)

# Here, we'll set several values that change the default behavior of our
# serverless endpoint. `workersMin=0` lets the endpoint scale down to zero
# active workers; `workersMax=3` allows the endpoint to run up to 3 workers in
# parallel as it receives more work. We also set `idleTimeout=5` so that an
# active worker stays alive for a short idle period after completing a request
# before scaling down.
#
# Hugging Face auth:
# Many `meta-llama/*` models are gated on Hugging Face. Local shell env vars are NOT
# automatically forwarded into serverless containers, so we pass `HF_TOKEN` via `env=...`
# so the remote worker can download the model.
_hf_token = os.getenv("HF_TOKEN")
_worker_env = {"HF_TOKEN": _hf_token} if _hf_token else {}
gpu_config = LiveServerless(
    name="02_01_text_generation_gpu_worker",
    gpus=[GpuGroup.ANY],  # Run on any GPU
    env=_worker_env,
    workersMin=0,
    workersMax=3,
    idleTimeout=5,
)


# Decorating our function with `remote` packages up the function code and
# deploys it on the infrastructure described by the passed resource config. The
# results from the worker are returned to your terminal. In this example the
# function generates a chat response to the incoming message(s), and information
# about the GPU the worker has access to is included in the response.
# Declare worker dependencies so they're installed in the remote execution environment.
# (Local `requirements.txt` is not automatically shipped to the worker.)
@remote(
    resource_config=gpu_config,
    dependencies=[
        "torch",
        "transformers",
        "accelerate",
    ],
)
async def gpu_hello(
    input_data: dict,
) -> dict:
"""Generate one chat response using Llama 3.2 1B Instruct on a serverless GPU."""
import os
import platform
from datetime import datetime

import torch
from transformers import pipeline

    gpu_available = torch.cuda.is_available()
    gpu_name = torch.cuda.get_device_name(0) if gpu_available else "N/A"
    gpu_count = torch.cuda.device_count()
    gpu_memory = (
        torch.cuda.get_device_properties(0).total_memory / (1024**3)
        if gpu_available
        else 0.0
    )

    # Inputs:
    # - Simple: {"message": "...", "system_prompt": "...", "max_new_tokens": 512}
    # - Full chat: {"messages": [{"role": "...", "content": "..."}, ...], "max_new_tokens": 512}
    system_prompt = input_data.get(
        "system_prompt",
        "You are a helpful assistant chatbot who always responds in a friendly and helpful manner!",
    )
    message = input_data.get("message", "What is a GPU?")
    messages = input_data.get("messages") or [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]

    model_id = "meta-llama/Llama-3.2-1B-Instruct"

    # Hugging Face auth for gated repos:
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise RuntimeError("HF_TOKEN is required to download gated models (e.g. meta-llama/*).")

    pipe = pipeline(
        "text-generation",
        model=model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=hf_token,
    )

    outputs = pipe(
        messages,
        max_new_tokens=int(input_data.get("max_new_tokens", 512)),
    )
    generated = outputs[0]["generated_text"]
    last = generated[-1] if isinstance(generated, list) and generated else generated
    assistant_message = last.get("content") if isinstance(last, dict) else str(last)
    print(assistant_message)

    return {
        "status": "success",
        "message": assistant_message,
        "worker_type": "GPU",
        "gpu_info": {
            "available": gpu_available,
            "name": gpu_name,
            "count": gpu_count,
            "memory_gb": round(
                gpu_memory,
                2,
            ),
        },
        "timestamp": datetime.now().isoformat(),
        "platform": platform.system(),
        "python_version": platform.python_version(),
    }


# We define a subrouter for our GPU worker so that the FastAPI app in `main.py`
# can attach it and route GPU-specific requests to it.
gpu_router = APIRouter()


class MessageRequest(BaseModel):
"""Request model for GPU worker."""

message: str = "What is gpu?"
system_prompt: str = (
"You are a helpful assistant chatbot who always responds in a friendly and helpful manner!"
)
max_new_tokens: int = 512


@gpu_router.post("/llm")
async def llm(
    request: MessageRequest,
):
    """Simple GPU worker endpoint."""
    result = await gpu_hello(
        {
            "message": request.message,
            "system_prompt": request.system_prompt,
            "max_new_tokens": request.max_new_tokens,
        }
    )
    return result


# This code is packaged up as a "worker" that will handle requests sent to the
# endpoint at /gpu/llm, but you can also trigger it directly for a quick smoke
# test by running `python gpu_worker.py` from this directory.
if __name__ == "__main__":
    import asyncio

    test_payload = {"message": "Testing GPU worker"}
    print(f"Testing GPU worker with payload: {test_payload}")
    result = asyncio.run(gpu_hello(test_payload))
    print(f"Result: {result}")
69 changes: 69 additions & 0 deletions 02_ml_inference/01_text_generation/main.py
@@ -0,0 +1,69 @@
## LLM demo: FastAPI router + serverless GPU worker
# This example exposes a simple local FastAPI app (this file) with a single LLM endpoint
# backed by a Runpod serverless GPU worker defined in `gpu_worker.py`.
#
# - Local API: runs on your machine via `flash run` (default: http://localhost:8888)
# - Remote compute: executed on Runpod serverless GPUs via `tetra_rp.remote`
#
# Main endpoint:
# - POST /gpu/llm -> runs Llama chat inference on the remote GPU worker
#
# Note: The Llama model used in the worker is gated on Hugging Face, so you must provide
# `HF_TOKEN` (the worker reads it from the serverless env).

import logging
import os

from fastapi import FastAPI
from gpu_worker import gpu_router

logger = logging.getLogger(__name__)

# We define a simple FastAPI app to serve requests from localhost.
app = FastAPI(
    title="Flash Application",
    description="Distributed GPU computing with Runpod Flash",
    version="0.1.0",
)

# Attach the GPU worker subrouter. This routes any request to our app with the
# prefix /gpu to the GPU subrouter. To see it in action, start the app and
# execute the following command in another terminal window:
# curl -X POST http://localhost:8888/gpu/llm -d '{"message": "hello"}' -H "Content-Type: application/json"
app.include_router(
    gpu_router,
    prefix="/gpu",
    tags=["GPU Workers"],
)


# The root endpoint returns a small JSON object listing the endpoints defined
# in this app.
@app.get("/")
def home():
    return {
        "message": "Flash Application",
        "docs": "/docs",
        "endpoints": {
            "gpu_hello": "/gpu/llm",
        },
    }


@app.get("/ping")
def ping():
return {"status": "healthy"}


if __name__ == "__main__":
    import uvicorn

    host = os.getenv("FLASH_HOST", "localhost")
    port = int(os.getenv("FLASH_PORT", "8888"))
    logger.info(f"Starting Flash server on {host}:{port}")

    uvicorn.run(
        app,
        host=host,
        port=port,
    )
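
As an alternative to the curl command above, a minimal client sketch using only the standard library might look like this. It assumes the app is running locally on the default host and port from `.env.example` (http://localhost:8888); adjust the URL if you changed `FLASH_HOST`/`FLASH_PORT`.

import json
import urllib.request

# Build a POST request matching the MessageRequest model in gpu_worker.py;
# omitted fields fall back to their defaults.
payload = {
    "message": "Explain what a serverless GPU endpoint is in one sentence.",
    "max_new_tokens": 128,
}
req = urllib.request.Request(
    "http://localhost:8888/gpu/llm",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

# The response body is the dict returned by gpu_hello; "message" holds the
# assistant's reply.
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["message"])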
4 changes: 4 additions & 0 deletions 02_ml_inference/01_text_generation/requirements.txt
@@ -0,0 +1,4 @@
tetra_rp
torch
transformers
accelerate
35 changes: 23 additions & 12 deletions 02_ml_inference/README.md
@@ -4,65 +4,74 @@ Deploy machine learning models as production-ready APIs. Learn how to serve LLMs

## Examples

### 01_text_generation _(coming soon)_
LLM inference API with streaming support.
### 01_text_generation

LLM chat inference API (serverless GPU) using Hugging Face `transformers.pipeline`.

**What you'll learn:**
- Loading and serving LLMs (Llama, Mistral, etc.)
- Streaming text generation
- Model quantization for efficiency
- Memory management for large models

- Loading a gated Llama model with `HF_TOKEN` (Hugging Face auth)
- Serving a simple chat endpoint (`POST /01_text_generation/gpu/llm`)

**Models covered:**
- Llama 3, Llama 3.1, Llama 3.2
- Mistral, Mixtral
- Qwen, Phi, Gemma

### 02_image_generation _(coming soon)_
- `meta-llama/Llama-3.2-1B-Instruct`

### 02_image_generation _(coming soon)_

Stable Diffusion image generation API.

**What you'll learn:**

- Loading Stable Diffusion models
- Optimizing inference with diffusers
- Handling image uploads and downloads
- Model caching strategies

**Models covered:**

- Stable Diffusion 1.5, 2.1, XL
- SDXL Turbo
- ControlNet integration

### 03_embeddings _(coming soon)_
### 03_embeddings _(coming soon)_

Text embedding API for semantic search and RAG.

**What you'll learn:**

- Serving embedding models
- Batch processing for efficiency
- Integrating with vector databases
- Dimensionality reduction

**Models covered:**

- sentence-transformers
- OpenAI-compatible embeddings
- Multilingual models

### 04_multimodal _(coming soon)_
### 04_multimodal _(coming soon)_

Vision-language models (CLIP, LLaVA, etc.).

**What you'll learn:**

- Serving vision-language models
- Image+text processing
- Zero-shot classification
- Visual question answering

**Models covered:**

- CLIP
- LLaVA
- BLIP-2

## Architecture Patterns

All examples demonstrate:

- Model loading and caching
- Efficient batching
- Error handling
@@ -72,13 +81,15 @@ All examples demonstrate:
## GPU Selection

Examples include GPU recommendations:

- **RTX 4090 (24GB)**: Most consumer models
- **L40/RTX 6000 Ada (48GB)**: Larger models
- **A100 (80GB)**: Largest models, multi-GPU

## Next Steps

After exploring ML inference:

- Learn [03_advanced_workers](../03_advanced_workers/) for optimization
- Study [04_scaling_performance](../04_scaling_performance/) for production
- Build complete apps in [06_real_world](../06_real_world/)