diff --git a/02_ml_inference/01_text_generation/.env.example b/02_ml_inference/01_text_generation/.env.example
new file mode 100644
index 0000000..89b08af
--- /dev/null
+++ b/02_ml_inference/01_text_generation/.env.example
@@ -0,0 +1,5 @@
+# RUNPOD_API_KEY=your_api_key_here
+# FLASH_HOST=localhost
+# FLASH_PORT=8888
+# LOG_LEVEL=INFO
+# HF_TOKEN=your_huggingface_token
diff --git a/02_ml_inference/01_text_generation/.flashignore b/02_ml_inference/01_text_generation/.flashignore
new file mode 100644
index 0000000..ea5988c
--- /dev/null
+++ b/02_ml_inference/01_text_generation/.flashignore
@@ -0,0 +1,40 @@
+# Flash Build Ignore Patterns
+
+# Python cache
+__pycache__/
+*.pyc
+
+# Virtual environments
+venv/
+.venv/
+env/
+
+# IDE
+.vscode/
+.idea/
+
+# Environment files
+.env
+.env.local
+
+# Git
+.git/
+.gitignore
+
+# Build artifacts
+dist/
+build/
+*.egg-info/
+
+# Flash resources
+.tetra_resources.pkl
+
+# Tests
+tests/
+test_*.py
+*_test.py
+
+# Documentation
+docs/
+*.md
+!README.md
diff --git a/02_ml_inference/01_text_generation/__init__.py b/02_ml_inference/01_text_generation/__init__.py
new file mode 100644
index 0000000..3773a93
--- /dev/null
+++ b/02_ml_inference/01_text_generation/__init__.py
@@ -0,0 +1 @@
+"""LLM chat inference on a serverless GPU example."""
diff --git a/02_ml_inference/01_text_generation/gpu_worker.py b/02_ml_inference/01_text_generation/gpu_worker.py
new file mode 100644
index 0000000..14f7dc3
--- /dev/null
+++ b/02_ml_inference/01_text_generation/gpu_worker.py
@@ -0,0 +1,170 @@
+## LLM chat inference on a serverless GPU
+# This example runs a small chat LLM (Llama 3.2 1B Instruct) on Runpod serverless GPUs
+# using `transformers.pipeline`.
+#
+# Call it via the FastAPI endpoint (`POST /gpu/llm`) or run this module directly for
+# a quick smoke test.
+#
+# Scaling behavior is controlled by the `LiveServerless` config below.
+import os
+
+from fastapi import APIRouter
+from pydantic import BaseModel
+
+from tetra_rp import (
+    GpuGroup,
+    LiveServerless,
+    remote,
+)
+
+# Here, we'll define several variables that change the default behavior of our
+# serverless endpoint. `workersMin` sets our endpoint to scale to 0 active
+# containers; `workersMax` will allow our endpoint to run up to 3 workers in
+# parallel as the endpoint receives more work. We also set an idle timeout of
+# 5 minutes so that any active worker stays alive for 5 minutes after
+# completing a request.
+#
+# Hugging Face auth:
+# Many `meta-llama/*` models are gated on Hugging Face. Local shell env vars are NOT
+# automatically forwarded into serverless containers, so we pass `HF_TOKEN` via `env=...`
+# so the remote worker can download the model.
+_hf_token = os.getenv("HF_TOKEN")
+_worker_env = {"HF_TOKEN": _hf_token} if _hf_token else {}
+gpu_config = LiveServerless(
+    name="02_01_text_generation_gpu_worker",
+    gpus=[GpuGroup.ANY],  # Run on any GPU
+    env=_worker_env,
+    workersMin=0,
+    workersMax=3,
+    idleTimeout=5,
+)
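+
+# In practice this means a burst of requests is served by at most 3 workers running
+# in parallel; additional requests queue until a worker frees up, and with no traffic
+# all workers eventually scale back down to 0.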
+
+
+# Decorating our function with `remote` packages up the function code and deploys it
+# on the infrastructure described by the resource config we pass in. The results from
+# the worker are returned to your terminal. In this example the function runs chat
+# inference over the incoming `message` (or full `messages` list) and returns the
+# assistant's reply. The code itself runs on a GPU worker, and information about the
+# GPU the worker has access to is included in the response.
+# Declare worker dependencies so they're installed in the remote execution environment.
+# (Local `requirements.txt` is not automatically shipped to the worker.)
+@remote(
+    resource_config=gpu_config,
+    dependencies=[
+        "torch",
+        "transformers",
+        "accelerate",
+    ],
+)
+async def gpu_hello(
+    input_data: dict,
+) -> dict:
+    """Generate one chat response using Llama 3.2 1B Instruct on a serverless GPU."""
+    import os
+    import platform
+    from datetime import datetime
+
+    import torch
+    from transformers import pipeline
+
+    # Collect GPU details for the response (guarded so a CPU-only environment doesn't
+    # crash before we even reach inference).
+    gpu_available = torch.cuda.is_available()
+    gpu_name = torch.cuda.get_device_name(0) if gpu_available else None
+    gpu_count = torch.cuda.device_count()
+    gpu_memory = (
+        torch.cuda.get_device_properties(0).total_memory / (1024**3) if gpu_available else 0.0
+    )
+
+    # Inputs:
+    # - Simple: {"message": "...", "system_prompt": "...", "max_new_tokens": 512}
+    # - Full chat: {"messages": [{"role": "...", "content": "..."}, ...], "max_new_tokens": 512}
+    system_prompt = input_data.get(
+        "system_prompt",
+        "You are a helpful assistant chatbot who always responds in a friendly and helpful manner!",
+    )
+    message = input_data.get("message", "What is a GPU?")
+    messages = input_data.get("messages") or [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": message},
+    ]
+
+    model_id = "meta-llama/Llama-3.2-1B-Instruct"
+
+    # Hugging Face auth for gated repos:
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        raise RuntimeError("HF_TOKEN is required to download gated models (e.g. meta-llama/*).")
+
+    pipe = pipeline(
+        "text-generation",
+        model=model_id,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        token=hf_token,
+    )
+
+    outputs = pipe(
+        messages,
+        max_new_tokens=int(input_data.get("max_new_tokens", 512)),
+    )
+    # The chat pipeline returns the whole conversation; the assistant's reply is the
+    # last entry in the generated list.
+    generated = outputs[0]["generated_text"]
+    last = generated[-1] if isinstance(generated, list) and generated else generated
+    assistant_message = last.get("content") if isinstance(last, dict) else str(last)
+    print(assistant_message)
+
+    return {
+        "status": "success",
+        "message": assistant_message,
+        "worker_type": "GPU",
+        "gpu_info": {
+            "available": gpu_available,
+            "name": gpu_name,
+            "count": gpu_count,
+            "memory_gb": round(
+                gpu_memory,
+                2,
+            ),
+        },
+        "timestamp": datetime.now().isoformat(),
+        "platform": platform.system(),
+        "python_version": platform.python_version(),
+    }
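+
+# Example round trip (illustrative -- the exact GPU fields depend on the worker the
+# request lands on):
+#   await gpu_hello({"message": "What is a GPU?"})
+#   -> {"status": "success", "message": "<assistant reply>", "worker_type": "GPU",
+#       "gpu_info": {"available": True, "name": "NVIDIA ...", "count": 1, "memory_gb": 24.0}, ...}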
+
+
+# We define a subrouter for our gpu worker so that our main router in `main.py`
+# can attach it for routing gpu-specific requests.
+gpu_router = APIRouter()
+
+
+class MessageRequest(BaseModel):
+    """Request model for GPU worker."""
+
+    message: str = "What is a GPU?"
+    system_prompt: str = (
+        "You are a helpful assistant chatbot who always responds in a friendly and helpful manner!"
+    )
+    max_new_tokens: int = 512
+
+
+@gpu_router.post("/llm")
+async def llm(
+    request: MessageRequest,
+):
+    """Simple GPU worker endpoint."""
+    result = await gpu_hello(
+        {
+            "message": request.message,
+            "system_prompt": request.system_prompt,
+            "max_new_tokens": request.max_new_tokens,
+        }
+    )
+    return result
+
+
+# This code is packaged up as a "worker" that handles requests sent to the endpoint
+# at /gpu/llm, but you can also trigger it directly for a quick smoke test by running
+# `python gpu_worker.py` from this directory.
+if __name__ == "__main__":
+    import asyncio
+
+    test_payload = {"message": "Testing GPU worker"}
+    print(f"Testing GPU worker with payload: {test_payload}")
+    result = asyncio.run(gpu_hello(test_payload))
+    print(f"Result: {result}")
diff --git a/02_ml_inference/01_text_generation/main.py b/02_ml_inference/01_text_generation/main.py
new file mode 100644
index 0000000..b8652f9
--- /dev/null
+++ b/02_ml_inference/01_text_generation/main.py
@@ -0,0 +1,69 @@
+## LLM demo: FastAPI router + serverless GPU worker
+# This example exposes a simple local FastAPI app (this file) with a single LLM endpoint
+# backed by a Runpod serverless GPU worker defined in `gpu_worker.py`.
+#
+# - Local API: runs on your machine via `flash run` (default: http://localhost:8888)
+# - Remote compute: executed on Runpod serverless GPUs via `tetra_rp.remote`
+#
+# Main endpoint:
+# - POST /gpu/llm -> runs Llama chat inference on the remote GPU worker
+#
+# Note: The Llama model used in the worker is gated on Hugging Face, so you must provide
+# `HF_TOKEN` (the worker reads it from the serverless env).
+
+import logging
+import os
+
+from fastapi import FastAPI
+from gpu_worker import gpu_router
+
+logger = logging.getLogger(__name__)
+
+# We define a simple FastAPI app to serve requests from localhost.
+app = FastAPI(
+    title="Flash Application",
+    description="Distributed GPU computing with Runpod Flash",
+    version="0.1.0",
+)
+
+# Attach the gpu worker subrouter - this routes any request to our app with the
+# prefix /gpu to the gpu subrouter. To see the subrouter in action, start the app
+# and execute the following command in another terminal window:
+# curl -X POST http://localhost:8888/gpu/llm -d '{"message": "hello"}' -H "Content-Type: application/json"
+app.include_router(
+    gpu_router,
+    prefix="/gpu",
+    tags=["GPU Workers"],
+)
+
+
+# The homepage for our main endpoint returns a JSON object listing the endpoints
+# defined in this app.
+@app.get("/")
+def home():
+    return {
+        "message": "Flash Application",
+        "docs": "/docs",
+        "endpoints": {
+            "gpu_hello": "/gpu/llm",
+        },
+    }
+
+
+@app.get("/ping")
+def ping():
+    return {"status": "healthy"}
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    host = os.getenv("FLASH_HOST", "localhost")
+    port = int(os.getenv("FLASH_PORT", 8888))
+    logger.info(f"Starting Flash server on {host}:{port}")
+
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+    )
diff --git a/02_ml_inference/01_text_generation/requirements.txt b/02_ml_inference/01_text_generation/requirements.txt
new file mode 100644
index 0000000..ed55547
--- /dev/null
+++ b/02_ml_inference/01_text_generation/requirements.txt
@@ -0,0 +1,4 @@
+tetra_rp
+torch
+transformers
+accelerate
\ No newline at end of file
diff --git a/02_ml_inference/README.md b/02_ml_inference/README.md
index cd41fed..8e192e8 100644
--- a/02_ml_inference/README.md
+++ b/02_ml_inference/README.md
@@ -4,58 +4,66 @@ Deploy machine learning models as production-ready APIs.
 Learn how to serve LLMs

 ## Examples

-### 01_text_generation _(coming soon)_
-LLM inference API with streaming support.
+### 01_text_generation
+
+LLM chat inference API (serverless GPU) using Hugging Face `transformers.pipeline`.

 **What you'll learn:**
-- Loading and serving LLMs (Llama, Mistral, etc.)
-- Streaming text generation
-- Model quantization for efficiency
-- Memory management for large models
+
+- Loading a gated Llama model with `HF_TOKEN` (Hugging Face auth)
+- Serving a simple chat endpoint (`POST /01_text_generation/gpu/llm`)

 **Models covered:**
-- Llama 3, Llama 3.1, Llama 3.2
-- Mistral, Mixtral
-- Qwen, Phi, Gemma
-### 02_image_generation _(coming soon)_
+
+- `meta-llama/Llama-3.2-1B-Instruct`
+
+### 02_image_generation _(coming soon)_
+
 Stable Diffusion image generation API.

 **What you'll learn:**
+
 - Loading Stable Diffusion models
 - Optimizing inference with diffusers
 - Handling image uploads and downloads
 - Model caching strategies

 **Models covered:**
+
 - Stable Diffusion 1.5, 2.1, XL
 - SDXL Turbo
 - ControlNet integration

-### 03_embeddings _(coming soon)_
+### 03_embeddings _(coming soon)_
+
 Text embedding API for semantic search and RAG.

 **What you'll learn:**
+
 - Serving embedding models
 - Batch processing for efficiency
 - Integrating with vector databases
 - Dimensionality reduction

 **Models covered:**
+
 - sentence-transformers
 - OpenAI-compatible embeddings
 - Multilingual models

-### 04_multimodal _(coming soon)_
+### 04_multimodal _(coming soon)_
+
 Vision-language models (CLIP, LLaVA, etc.).

 **What you'll learn:**
+
 - Serving vision-language models
 - Image+text processing
 - Zero-shot classification
 - Visual question answering

 **Models covered:**
+
 - CLIP
 - LLaVA
 - BLIP-2
@@ -63,6 +71,7 @@ Vision-language models (CLIP, LLaVA, etc.).
 ## Architecture Patterns

 All examples demonstrate:
+
 - Model loading and caching
 - Efficient batching
 - Error handling
@@ -72,6 +81,7 @@ All examples demonstrate:
 ## GPU Selection

 Examples include GPU recommendations:
+
 - **RTX 4090 (24GB)**: Most consumer models
 - **L40/RTX 6000 Ada (48GB)**: Larger models
 - **A100 (80GB)**: Largest models, multi-GPU
@@ -79,6 +89,7 @@ Examples include GPU recommendations:
 ## Next Steps

 After exploring ML inference:
+
 - Learn [03_advanced_workers](../03_advanced_workers/) for optimization
 - Study [04_scaling_performance](../04_scaling_performance/) for production
 - Build complete apps in [06_real_world](../06_real_world/)
diff --git a/README.md b/README.md
index ecece6d..6377d43 100644
--- a/README.md
+++ b/README.md
@@ -33,11 +33,11 @@ export RUNPOD_API_KEY=your_key_here
 # echo "RUNPOD_API_KEY=your_key_here" > .env

 # Option A: Run all examples from the unified app (recommended)
-flash run
+uv run flash run

 # Option B: Run individual examples
 cd 01_getting_started/01_hello_world
-flash run
+uv run flash run

 # Visit http://localhost:8888/docs
 ```
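
To smoke-test the new endpoint end to end, here is a minimal client sketch (not part of the diff). It assumes the app above is already running locally on the default port 8888, e.g. via `uv run flash run` from `02_ml_inference/01_text_generation`, with `RUNPOD_API_KEY` and `HF_TOKEN` available in its environment; only the Python standard library is used.

```python
# Minimal local smoke test for POST /gpu/llm (standard library only).
# Assumes the FastAPI app from this diff is already running on localhost:8888.
import json
import urllib.request

payload = {
    "message": "What is a GPU?",
    "max_new_tokens": 256,
}

req = urllib.request.Request(
    "http://localhost:8888/gpu/llm",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

# The response mirrors the dict returned by `gpu_hello`: the assistant's reply is in
# "message", and details about the GPU that served the request are in "gpu_info".
with urllib.request.urlopen(req) as resp:
    result = json.loads(resp.read().decode("utf-8"))

print(result["message"])
print(result["gpu_info"])
```

Expect the first request to be slow, since a worker has to cold-start and download the model before it can generate.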