diff --git a/inference-platforms/README.md b/inference-platforms/README.md index af38c60..18ecc7e 100644 --- a/inference-platforms/README.md +++ b/inference-platforms/README.md @@ -19,6 +19,43 @@ Elastic Stack. * [AgC](AgC) - with [OpenTelemetry export][AgC] * [vLLM](vllm) - with [OpenTelemetry POC][vllm] configuration +## MCP Agent flow + +[agent.py](agent.py) uses the [OpenAI Agents SDK][openai-agents] to search for +flights via [Kiwi's MCP server][kiwi-mcp], proxied through an inference +platform like [Envoy AI Gateway](aigw). + +```mermaid +sequenceDiagram + participant Agent + participant Gateway as AI Gateway + participant LLM as LLM Server + participant MCP as MCP Server + + Agent ->> Gateway: user: "Use the search-flight tool to search for flights from New York to Los Angeles on 18/03/2026" + Gateway ->> LLM: ChatCompletion + activate LLM + LLM ->> Gateway: tool_call: search-flight({origin: "JFK", destination: "LAX", departureDate: "18/03/2026"}) + deactivate LLM + Gateway ->> Agent: + activate Agent + + Agent ->> Gateway: tools/call: search-flight + Gateway ->> MCP: tools/call: search-flight + activate MCP + MCP -->> Gateway: {flights: [{price: 177, route: "JFK→ATL→LAX"}, ...]} + deactivate MCP + Gateway -->> Agent: + deactivate Agent + + Agent ->> Gateway: [user, assistant, tool: {flights}] + Gateway ->> LLM: ChatCompletion + activate LLM + LLM ->> Gateway: "The cheapest flight is JFK → ATL → LAX for €177..." 
+ deactivate LLM + Gateway ->> Agent: +``` + If you use Elastic Stack, an example would look like this in Kibana: ![Kibana screenshot](./kibana-trace.jpg) @@ -114,3 +151,5 @@ To start and use Ollama, do the following: [uv]: https://docs.astral.sh/uv/getting-started/installation/ [ollama-dl]: https://ollama.com/download [otel-tui]: https://github.com/ymtdzzz/otel-tui +[openai-agents]: https://github.com/openai/openai-agents-python +[kiwi-mcp]: https://mcp.kiwi.com diff --git a/inference-platforms/agent.py b/inference-platforms/agent.py index f5032a1..91f1f4e 100644 --- a/inference-platforms/agent.py +++ b/inference-platforms/agent.py @@ -41,10 +41,11 @@ async def run_agent(tools: list[Tool], model_name: str, use_responses: bool): tools=tools, ) - next_week = (datetime.now() + timedelta(weeks=1)).strftime("%Y-%m-%d") + # Small models can't convert between date formats that may be required by tools so this format needs to be precise + next_week = (datetime.now() + timedelta(weeks=1)).strftime("%d/%m/%Y") result = await Runner.run( starting_agent=agent, - input=f"Give me the best flight from New York to Kota Kinabalu on {next_week}", + input=f"Use the search-flight tool to search for flights from New York to Los Angeles on {next_week}", run_config=RunConfig(workflow_name="flight search"), ) print(result.final_output) diff --git a/inference-platforms/aigw/README.md b/inference-platforms/aigw/README.md index 4f29685..43cb008 100644 --- a/inference-platforms/aigw/README.md +++ b/inference-platforms/aigw/README.md @@ -30,6 +30,8 @@ Start Ollama and your OpenTelemetry Collector via this repository's [README](../ ## Run Envoy AI Gateway +### Run with Docker + ```bash docker compose up --force-recreate --pull always --remove-orphans --wait -d ``` @@ -40,6 +42,20 @@ Clean up when finished, like this: docker compose down ``` +### Run with Go + +Download [shdotenv](https://github.com/ko1nksm/shdotenv) to load `env.local` when running. 
+ +```bash +curl -O -L https://github.com/ko1nksm/shdotenv/releases/download/v0.14.0/shdotenv +chmod +x ./shdotenv +``` + +Run `aigw` from source, with environment variables loaded from `env.local`, like this: +```bash +./shdotenv -e env.local go run github.com/envoyproxy/ai-gateway/cmd/aigw@latest run --mcp-json '{"mcpServers":{"kiwi":{"type":"http","url":"https://mcp.kiwi.com"}}}' +``` + ## Call Envoy AI Gateway with python Once Envoy AI Gateway is running, use [uv][uv] to make an OpenAI request via @@ -51,6 +67,11 @@ Once Envoy AI Gateway is running, use [uv][uv] to make an OpenAI request via OPENAI_BASE_URL=http://localhost:1975/v1 uv run --exact -q --env-file env.local ../chat.py ``` +Or, for the OpenAI Responses API: +```bash +OPENAI_BASE_URL=http://localhost:1975/v1 uv run --exact -q --env-file env.local ../chat.py --use-responses-api +``` + ### MCP Agent ```bash @@ -60,10 +81,9 @@ OPENAI_BASE_URL=http://localhost:1975/v1 MCP_URL=http://localhost:1975/mcp uv ru ## Notes Here are some constraints about the Envoy AI Gateway implementation: -* Until [this][openai-responses] resolves, don't use `--use-responses-api`. +* Access log integration currently requires the OTLP gRPC transport (`OTEL_EXPORTER_OTLP_PROTOCOL=grpc`). 
--- [docs]: https://aigateway.envoyproxy.io/docs/cli/ [openinference]: https://github.com/Arize-ai/openinference/tree/main/spec [uv]: https://docs.astral.sh/uv/getting-started/installation/ -[openai-responses]: https://github.com/envoyproxy/ai-gateway/issues/980 diff --git a/inference-platforms/aigw/docker-compose.yml b/inference-platforms/aigw/docker-compose.yml index eb597d6..8ff9b7a 100644 --- a/inference-platforms/aigw/docker-compose.yml +++ b/inference-platforms/aigw/docker-compose.yml @@ -36,7 +36,7 @@ services: environment: - OTEL_SERVICE_NAME=aigw - OPENAI_BASE_URL=http://host.docker.internal:11434/v1 - - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318 + - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317 ports: - "1975:1975" # OpenAI compatible endpoint at /v1, MCP server at /mcp configs: diff --git a/inference-platforms/aigw/env.local b/inference-platforms/aigw/env.local index 6cf23b7..ad65c9a 100644 --- a/inference-platforms/aigw/env.local +++ b/inference-platforms/aigw/env.local @@ -10,10 +10,8 @@ MCP_HEADERS= # OpenTelemetry configuration OTEL_SERVICE_NAME=openai-agent -OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 -OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf -OTEL_TRACES_EXPORTER=otlp -OTEL_METRICS_EXPORTER=otlp +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc # Reduce trace and metrics export delay for demo purposes OTEL_BSP_SCHEDULE_DELAY=100 diff --git a/inference-platforms/chat.py b/inference-platforms/chat.py index 9d290e3..69c7aec 100644 --- a/inference-platforms/chat.py +++ b/inference-platforms/chat.py @@ -45,7 +45,7 @@ def main(): response = client.responses.create( model=model, input=messages[0]["content"], temperature=0, extra_body=extra_body ) - print(response.output[0].content[0].text) + print(response.output_text) else: chat_completion = client.chat.completions.create( model=model, messages=messages, temperature=0, extra_body=extra_body diff --git 
a/inference-platforms/kibana-trace.jpg b/inference-platforms/kibana-trace.jpg index 7bd1dac..c00d06e 100644 Binary files a/inference-platforms/kibana-trace.jpg and b/inference-platforms/kibana-trace.jpg differ