Universal LLM interface layer for Python. One API, 16 backends, sync & async.
```bash
pip install vv-llm
```
OpenAI | Anthropic | DeepSeek | Gemini | Qwen | Groq | Mistral | Moonshot | MiniMax | Yi | ZhiPuAI | Baichuan | StepFun | xAI | Ernie | Local
Also supports Azure OpenAI, Vertex AI, and AWS Bedrock deployments.
Configure endpoints and models once via `settings.load`:

```python
from vv_llm.settings import settings

settings.load({
    "VERSION": "2",
    "endpoints": [
        {
            "id": "openai-default",
            "api_base": "https://api.openai.com/v1",
            "api_key": "sk-...",
        }
    ],
    "backends": {
        "openai": {
            "models": {
                "gpt-4o": {
                    "id": "gpt-4o",
                    "endpoints": ["openai-default"],
                }
            }
        }
    }
})
```

Create a client and request a completion:

```python
from vv_llm.chat_clients import create_chat_client, BackendType
client = create_chat_client(BackendType.OpenAI, model="gpt-4o")
resp = client.create_completion([
    {"role": "user", "content": "Explain RAG in one sentence"}
])
print(resp.content)
```

Stream a response token by token:

```python
for chunk in client.create_stream([
    {"role": "user", "content": "Write a haiku"}
]):
    if chunk.content:
        print(chunk.content, end="")
```

The async client mirrors the sync API:

```python
import asyncio
from vv_llm.chat_clients import create_async_chat_client, BackendType
async def main():
    client = create_async_chat_client(BackendType.OpenAI, model="gpt-4o")
    resp = await client.create_completion([
        {"role": "user", "content": "hello"}
    ])
    print(resp.content)

asyncio.run(main())
```

Embedding and rerank backends are configured the same way; custom HTTP rerank APIs can be described declaratively with request/response mappings:

```python
from vv_llm.settings import settings
settings.load({
"VERSION": "2",
"endpoints": [
{
"id": "siliconflow",
"api_base": "https://api.siliconflow.cn/v1",
"api_key": "sk-...",
}
],
"backends": {},
"embedding_backends": {
"siliconflow": {
"models": {
"BAAI/bge-large-zh-v1.5": {
"id": "BAAI/bge-large-zh-v1.5",
"endpoints": ["siliconflow"],
"protocol": "openai_embeddings",
}
}
}
},
"rerank_backends": {
"siliconflow": {
"models": {
"BAAI/bge-reranker-v2-m3": {
"id": "BAAI/bge-reranker-v2-m3",
"endpoints": ["siliconflow"],
"protocol": "custom_json_http",
"request_mapping": {
"method": "POST",
"path": "/rerank",
"body_template": {
"model": "${model_id}",
"query": "${query}",
"documents": "${documents}",
},
},
"response_mapping": {
"results_path": "$.results[*]",
"field_map": {
"index": "$.index",
"relevance_score": "$.relevance_score",
},
},
}
}
}
},
})from vv_llm.embedding_clients import create_embedding_client
from vv_llm.rerank_clients import create_rerank_client
embedding_client = create_embedding_client("siliconflow", model="BAAI/bge-large-zh-v1.5")
embedding_resp = embedding_client.create_embeddings(input="hello world")
print(len(embedding_resp.data[0].embedding))
rerank_client = create_rerank_client("siliconflow", model="BAAI/bge-reranker-v2-m3")
rerank_resp = rerank_client.rerank(
    query="Apple",
    documents=["apple", "banana", "fruit", "vegetable"],
)
print(rerank_resp.results[0].index, rerank_resp.results[0].relevance_score)
```

Async embedding and rerank clients follow the same pattern:

```python
import asyncio
from vv_llm.embedding_clients import create_async_embedding_client
from vv_llm.rerank_clients import create_async_rerank_client
async def main():
    embedding_client = create_async_embedding_client("siliconflow", model="BAAI/bge-large-zh-v1.5")
    rerank_client = create_async_rerank_client("siliconflow", model="BAAI/bge-reranker-v2-m3")
    emb = await embedding_client.create_embeddings(input=["a", "b"])
    rr = await rerank_client.rerank(query="Apple", documents=["apple", "banana"])
    print(len(emb.data), len(rr.results))

asyncio.run(main())
```

Features:

- Unified interface — the same `create_completion`/`create_stream` API across all providers
- Embedding & rerank — unified sync/async retrieval clients with normalized outputs
- Type-safe factory — `create_chat_client(BackendType.X)` returns the correct client type
- Multi-endpoint — configure multiple endpoints per backend with random selection and failover (see the sketch after this list)
- Tool calling — normalized tool/function calling across providers
- Multimodal — text + image inputs where supported
- Thinking/reasoning — access chain-of-thought from Claude, DeepSeek Reasoner, etc.
- Token counting — per-model tokenizers (tiktoken, deepseek-tokenizer, qwen-tokenizer)
- Rate limiting — RPM/TPM controls with memory, Redis, or DiskCache backends
- Context length control — automatic message truncation to fit model limits
- Prompt caching — Anthropic prompt caching support
- Retry with backoff — configurable retry logic for transient failures
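
A minimal sketch of multi-endpoint failover, extending the settings schema shown above. The second endpoint's `id` and `api_base` are illustrative placeholders, not real defaults:

```python
from vv_llm.settings import settings

# Two endpoints registered for one backend. Per the feature list, vv-llm
# selects an endpoint at random and fails over when a request fails.
settings.load({
    "VERSION": "2",
    "endpoints": [
        {"id": "openai-primary", "api_base": "https://api.openai.com/v1", "api_key": "sk-..."},
        {"id": "openai-backup", "api_base": "https://my-proxy.example.com/v1", "api_key": "sk-..."},
    ],
    "backends": {
        "openai": {
            "models": {
                "gpt-4o": {
                    "id": "gpt-4o",
                    # Listing both endpoint ids enables selection + failover.
                    "endpoints": ["openai-primary", "openai-backup"],
                }
            }
        }
    },
})
```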
Utility helpers are importable from `vv_llm.chat_clients`:

```python
from vv_llm.chat_clients import format_messages, get_token_counts, get_message_token_counts
```

| Function | Description |
|---|---|
| `format_messages` | Normalize multimodal/tool messages across formats |
| `get_token_counts` | Count tokens for a text string |
| `get_message_token_counts` | Count tokens for a message list |
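
A quick usage sketch of the token-counting helpers. The `model` keyword is an assumption inferred from the per-model tokenizers listed above; check the actual signatures before relying on it:

```python
from vv_llm.chat_clients import get_token_counts, get_message_token_counts

# Count tokens in a raw string (tokenizer is chosen per model,
# e.g. tiktoken for GPT models). `model=` is assumed here.
n = get_token_counts("Explain RAG in one sentence", model="gpt-4o")

# Count tokens across a whole message list, e.g. before
# checking a request against the model's context limit.
messages = [{"role": "user", "content": "Explain RAG in one sentence"}]
m = get_message_token_counts(messages, model="gpt-4o")
print(n, m)
```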
Optional extras:

```bash
pip install 'vv-llm[redis]'      # Redis rate limiting
pip install 'vv-llm[diskcache]'  # DiskCache rate limiting
pip install 'vv-llm[server]'     # FastAPI token server
pip install 'vv-llm[vertex]'     # Google Vertex AI
pip install 'vv-llm[bedrock]'    # AWS Bedrock
```

Project layout:

```
src/vv_llm/
    chat_clients/       # Per-backend clients + factory
    embedding_clients/  # Embedding clients + factory
    rerank_clients/     # Rerank clients + factory
    retrieval_clients/  # Shared retrieval client internals
    settings/           # Configuration management
    types/              # Type definitions & enums
    utilities/          # Rate limiting, retry, media processing, token counting
    server/             # Optional token counting server
tests/unit/             # Unit tests
tests/live/             # Live integration tests (requires real API keys)
```
Development:

```bash
pdm install -d          # Install dev dependencies
pdm run lint            # Ruff linter
pdm run format-check    # Ruff format check
pdm run type-check      # Ty type checker
pdm run test            # Unit tests
pdm run test-live       # Live tests (needs real endpoints)
```

License: MIT