94 changes: 94 additions & 0 deletions python/openai/README.md
@@ -301,6 +301,98 @@ See the
[vLLM documentation](https://github.com/triton-inference-server/vllm_backend/blob/main/docs/llama_multi_lora_tutorial.md)
on how to serve a model with LoRA adapters.

### Embedding Models
Currently, the OpenAI-compatible frontend supports loading embedding models and serving the embeddings endpoint via the vLLM backend. See [vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html#embedding) for the full list of embedding models supported by vLLM.

1. Launch the container and install dependencies:
- Mounts `~/.cache/huggingface` so downloaded models are re-used across runs, containers, etc.
- Sets the [`HF_TOKEN`](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hftoken) environment variable to
access gated models; make sure this is set in your local environment if needed.

```bash
docker run -it --net=host --gpus all --rm \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-e HF_TOKEN \
nvcr.io/nvidia/tritonserver:25.10-vllm-python-py3
```

2. Launch the OpenAI-compatible Triton Inference Server:
```bash
cd /opt/tritonserver/python/openai

# NOTE: Embeddings endpoint does not require "--tokenizer"
python3 openai_frontend/main.py --model-repository tests/vllm_embedding_models
```

<details>
<summary>Example output</summary>

```
...
+------------------+---------+--------+
| Model | Version | Status |
+------------------+---------+--------+
| all-MiniLM-L6-v2 | 1 | READY | <- Correct Model Loaded in Triton
+------------------+---------+--------+
...
Found model: name='all-MiniLM-L6-v2', backend='vllm'
[WARNING] Adding CORS for the following origins: ['http://localhost']
INFO: Started server process [133]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:9000 (Press CTRL+C to quit) <- OpenAI Frontend Started Successfully
```

</details>
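
Optionally, confirm the embedding model is exposed by the frontend before sending requests. This is just a quick sanity check using the `GET /v1/models` endpoint listed later in this document:

```bash
curl -s http://localhost:9000/v1/models | jq
```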

3. Send a `/v1/embeddings` request:
- Note: the use of `jq` is optional, but it provides nicely formatted output for JSON responses.
```bash
MODEL="all-MiniLM-L6-v2"
curl -s http://localhost:9000/v1/embeddings \
-H 'Content-Type: application/json' \
-d '{
"model": "'${MODEL}'",
"input": "The food was delicious and the waiter...",
"dimensions": 10,
"encoding_format": "float"
}' | jq
```

<details>
<summary>Example output</summary>

```json
{
"object": "list",
"data": [
{
"object": "embedding",
"embedding": [
-0.1914404183626175,
0.4000193178653717,
0.058502197265625,
0.18909454345703125,
-0.4690297544002533,
0.004936536308377981,
0.45893096923828125,
-0.31141534447669983,
0.18299102783203125,
-0.4907582700252533
],
"index": 0
}
],
"model": "all-MiniLM-L6-v2",
"usage": {
"prompt_tokens": 12,
"total_tokens": 12
}
}
```

</details>
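
The same request can also be sent with the official `openai` Python client, since the frontend speaks the OpenAI API. The following is a minimal sketch; it assumes the `openai` Python package is installed on the client side and that the frontend is listening on port 9000 as above:

```python
from openai import OpenAI

# The curl example above sends no credentials, so a placeholder API key is
# enough to satisfy the client constructor here.
client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

response = client.embeddings.create(
    model="all-MiniLM-L6-v2",
    input="The food was delicious and the waiter...",
    dimensions=10,
    encoding_format="float",
)

# With encoding_format="float" the embedding is a plain list of floats; with
# "base64" the frontend instead returns the float32 buffer base64-encoded
# (see the _get_embedding helper added in this PR).
print(response.data[0].embedding)
print(response.usage)
```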

## TensorRT-LLM

0. Prepare your model repository for a TensorRT-LLM model, build the engine, etc. You can try any of the following options:
@@ -655,6 +747,8 @@ Use the `--openai-restricted-api` command-line argument to configure endpoint re
- **inference**: Chat completions and text completions endpoints
- `POST /v1/chat/completions`
- `POST /v1/completions`
- **embedding**: Embedding endpoint
- `POST /v1/embeddings`
- **model-repository**: Model listing and information endpoints
- `GET /v1/models`
- `GET /v1/models/{model_name}`
10 changes: 9 additions & 1 deletion python/openai/openai_frontend/engine/engine.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -34,6 +34,8 @@
CreateChatCompletionResponse,
CreateCompletionRequest,
CreateCompletionResponse,
CreateEmbeddingRequest,
CreateEmbeddingResponse,
Model,
)

@@ -92,3 +94,9 @@ def completion(
If request.stream is False, this returns a CreateCompletionResponse.
"""
pass

def embedding(self, request: CreateEmbeddingRequest) -> CreateEmbeddingResponse:
"""
Returns a CreateEmbeddingResponse.
"""
pass
147 changes: 132 additions & 15 deletions python/openai/openai_frontend/engine/triton_engine.py
@@ -27,6 +27,7 @@

from __future__ import annotations

import base64
import json
import time
import uuid
@@ -38,18 +39,24 @@
Callable,
Dict,
List,
Literal,
Optional,
Tuple,
Union,
)

import numpy as np
import tritonserver
from engine.engine import LLMEngine
from engine.utils.chat import load_chat_template, parse_chat_messages
from engine.utils.tokenizer import get_tokenizer
from engine.utils.tool_call_parsers import ToolCallParser, ToolParserManager
from engine.utils.triton import (
_create_trtllm_inference_request,
_create_vllm_inference_request,
RequestKind,
_create_trtllm_embedding_request,
_create_trtllm_generate_request,
_create_vllm_embedding_request,
_create_vllm_generate_request,
_get_output,
_get_usage_from_response,
_get_vllm_lora_names,
@@ -73,6 +80,9 @@
CreateChatCompletionStreamResponse,
CreateCompletionRequest,
CreateCompletionResponse,
CreateEmbeddingRequest,
CreateEmbeddingResponse,
EmbeddingObject,
FinishReason,
Function1,
Function2,
@@ -97,7 +107,8 @@ class TritonModelMetadata:
# Time that model was loaded by Triton
create_time: int
# Conversion format between OpenAI and Triton requests
request_converter: Callable
inference_request_converter: Callable
embedding_request_converter: Callable


class TritonLLMEngine(LLMEngine):
@@ -189,7 +200,7 @@ async def chat(

# Convert to Triton request format and perform inference
responses = metadata.model.async_infer(
metadata.request_converter(
metadata.inference_request_converter(
metadata.model, prompt, request, lora_name, self.default_max_tokens
)
)
@@ -232,7 +243,9 @@ async def chat(
backend=metadata.backend,
)

usage = _get_usage_from_response(response, metadata.backend)
usage = _get_usage_from_response(
response, metadata.backend, RequestKind.GENERATION
)

return CreateChatCompletionResponse(
id=request_id,
@@ -311,7 +324,7 @@ async def completion(

# Convert to Triton request format and perform inference
responses = metadata.model.async_infer(
metadata.request_converter(
metadata.inference_request_converter(
metadata.model,
request.prompt,
request,
@@ -334,7 +347,9 @@
response = responses[0]
text = _get_output(response)

usage = _get_usage_from_response(response, metadata.backend)
usage = _get_usage_from_response(
response, metadata.backend, RequestKind.GENERATION
)

choice = Choice(
finish_reason=FinishReason.stop,
@@ -352,6 +367,57 @@ async def completion(
usage=usage,
)

async def embedding(
self, request: CreateEmbeddingRequest
) -> CreateEmbeddingResponse:
# Validate request and convert to Triton format
model_name, _ = self._get_model_and_lora_name(request.model)
metadata = self.model_metadata.get(model_name)
self._validate_embedding_request(request, metadata)

# Convert to Triton request format and perform inference
responses = metadata.model.async_infer(
metadata.embedding_request_converter(
metadata.model,
request,
)
)

# Response validation with decoupled models in mind
responses = [response async for response in responses]
_validate_triton_responses_non_streaming(responses)
response = responses[0]

# Extract embedding from response (currently stored as JSON string in text_output)
embedding_json = _get_output(response)
embedding_list = json.loads(embedding_json)

usage = _get_usage_from_response(
response, metadata.backend, RequestKind.EMBEDDING
)

embedding = self._get_embedding(embedding_list, request.encoding_format)
embedding_obj = EmbeddingObject(
embedding=embedding, index=0, object="embedding"
)

return CreateEmbeddingResponse(
object="list",
data=[embedding_obj],
model=request.model,
usage=usage,
)

@staticmethod
def _get_embedding(
embedding: List[float], encoding_format: Literal["float", "base64"]
) -> Union[list[float], str]:
if encoding_format == "float":
return embedding
else:
embedding_bytes = np.array(embedding, dtype="float32").tobytes()
return base64.b64encode(embedding_bytes).decode("utf-8")

# TODO: This behavior should be tested further
def _get_first_response_role(
self, conversation: List[Dict], add_generation_prompt: bool, default_role: str
@@ -362,18 +428,24 @@ def _get_first_response_role(
return conversation[-1]["role"]

# TODO: Expose explicit flag to catch edge cases
def _determine_request_converter(self, backend: str):
def _determine_request_converter(self, backend: str, request_type: RequestKind):
# Allow manual override of backend request format if provided by user
if self.backend:
backend = self.backend

# Request conversion from OpenAI format to backend-specific format
if backend == "vllm":
Contributor:

wouldn't this be safer as below?

if backend == 'trtllm':
  # do something
elif backend == 'vllm':
  # do something else
else:
  raise ValueError(f'Unknown backend "{backend}" provided.')

Contributor Author:

# Explicitly handle ensembles to avoid any runtime validation errors
if not backend and model.config()["platform"] == "ensemble":
backend = "ensemble"
print(f"Found model: {name=}, {backend=}")
lora_names = None
if self.backend == "vllm" or backend == "vllm":
lora_names = _get_vllm_lora_names(
self.server.options.model_repository, name, model.version
)
metadata = TritonModelMetadata(
name=name,
backend=backend,
model=model,
tokenizer=self.tokenizer,
lora_names=lora_names,
create_time=self.create_time,
inference_request_converter=self._determine_request_converter(
backend, RequestKind.GENERATION
),
embedding_request_converter=self._determine_request_converter(
backend, RequestKind.EMBEDDING
),
)

backend can be ensemble.

Contributor:

makes sense.

when backend == "ensemble" then we hit this code:

        if request_type == RequestKind.GENERATION:
            return _create_trtllm_generate_request
        else:
            return _create_trtllm_embedding_request

is that desirable?

also, adding the switch-like statement future-proofs the function.

Contributor Author:

    # Use TRT-LLM format as default for everything else. This could be
    # an ensemble, a python or BLS model, a TRT-LLM backend model, etc.

return _create_vllm_inference_request
if request_type == RequestKind.GENERATION:
return _create_vllm_generate_request
else:
return _create_vllm_embedding_request

# Use TRT-LLM format as default for everything else. This could be
# an ensemble, a python or BLS model, a TRT-LLM backend model, etc.
return _create_trtllm_inference_request
if request_type == RequestKind.GENERATION:
return _create_trtllm_generate_request
else:
return _create_trtllm_embedding_request

def _get_model_and_lora_name(self, request_model_name: str):
if self.lora_separator is None or len(self.lora_separator) == 0:
@@ -418,7 +490,12 @@ def _get_model_metadata(self) -> Dict[str, TritonModelMetadata]:
tokenizer=self.tokenizer,
lora_names=lora_names,
create_time=self.create_time,
request_converter=self._determine_request_converter(backend),
inference_request_converter=self._determine_request_converter(
backend, RequestKind.GENERATION
),
embedding_request_converter=self._determine_request_converter(
backend, RequestKind.EMBEDDING
),
)
model_metadata[name] = metadata

@@ -671,8 +748,15 @@ def _validate_chat_request(
if not metadata.backend:
raise Exception("Unknown backend")

if not metadata.request_converter:
raise Exception(f"Unknown request format for model: {request.model}")
if not metadata.inference_request_converter:
raise Exception(
f"Unknown inference request format for model: {request.model}"
)

if not metadata.embedding_request_converter:
raise Exception(
f"Unknown embedding request format for model: {request.model}"
)

if (
metadata.lora_names is not None
@@ -807,8 +891,15 @@ def _validate_completion_request(
if not metadata.backend:
raise Exception("Unknown backend")

if not metadata.request_converter:
raise Exception(f"Unknown request format for model: {request.model}")
if not metadata.inference_request_converter:
raise Exception(
f"Unknown inference request format for model: {request.model}"
)

if not metadata.embedding_request_converter:
raise Exception(
f"Unknown embedding request format for model: {request.model}"
)

if (
metadata.lora_names is not None
@@ -853,6 +944,32 @@ def _validate_completion_request(
"`stream_options.include_usage` is currently only supported for the vLLM backend"
)

def _validate_embedding_request(
self,
request: CreateEmbeddingRequest,
metadata: TritonModelMetadata,
):
"""
Validates an embedding request to align with currently supported features.
"""

# Reject missing internal information needed to do inference
if not metadata:
raise Exception(f"Unknown model: {request.model}")

if not metadata.backend:
raise Exception("Unknown backend")

if not metadata.inference_request_converter:
raise Exception(
f"Unknown inference request format for model: {request.model}"
)

if not metadata.embedding_request_converter:
raise Exception(
f"Unknown embedding request format for model: {request.model}"
)

def _should_stream_with_auto_tool_parsing(
self, request: CreateChatCompletionRequest
):