Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ frontend:
@echo "Starting Vite frontend on http://localhost:5173"
cd agent/frontend && npm run dev

# Launch the dashboard UI dev server (npm/Vite) from the dashboard-ui directory.
# .PHONY marks "dashboard" as a command, not a file target, so it always runs.
.PHONY: dashboard
dashboard:
@echo "Starting dashboard..."
cd dashboard-ui && npm run dev

# Run backend and frontend dev servers concurrently; -j2 lets the two
# sub-make invocations execute in parallel instead of sequentially.
all:
@echo "Starting both backend and frontend..."
@make -j2 backend frontend
Expand Down
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,40 @@ asyncio.run(test_agent())
- **Flask** - Frontend web interface
- **Google Cloud Vision API** - Document photo classification
- **Google Maps API** - Location geocoding for clinic search
- **OpenTelemetry** - Distributed tracing and metrics instrumentation
- **Prometheus** - Metrics collection and storage
- **React Dashboard** - Real-time metrics visualization

## Observability & Metrics

The platform includes comprehensive observability features:

### Metrics Collection
- **Total Cost**: Track LLM usage costs in real time
- **Token Usage**: Monitor input/output tokens across all agents
- **Tool Calls**: Count and analyze tool invocations
- **Execution Duration**: Measure agent response times

### Quick Start
```bash
# Start all services (Prometheus, Backend, Dashboard)
./start.sh

# Stop all services
./stop.sh
```

### Metrics Endpoints
- `http://localhost:8000/metrics` - Prometheus metrics endpoint
- `http://localhost:8000/api/metrics/summary` - Aggregated metrics API
- `http://localhost:8000/api/metrics/by-agent` - Per-agent breakdown
- `http://localhost:8000/api/metrics/time-series` - Historical data

### Dashboard Access
- **Live Metrics Dashboard**: `http://localhost:5173` (see "Live Metrics" tab)
- **Prometheus UI**: `http://localhost:9090`

For detailed setup instructions, see [METRICS_SETUP.md](./METRICS_SETUP.md)

## Testing

Expand Down
45 changes: 45 additions & 0 deletions agent/backend/agents/orchestrator/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,27 @@
import json
import logging
import sys
import time
from dotenv import load_dotenv
from google.adk.agents import Agent
from google.adk.artifacts.in_memory_artifact_service import InMemoryArtifactService
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService
from google.genai import types
from opentelemetry import trace


from agent.backend.agents.drivers_license.agent import drivers_license_agent
from agent.backend.agents.scheduler.agent import scheduler_agent
from agent.backend.agents.orchestrator.prompt import PROMPT
from agent.backend.types.types import AgentCallRequest, AgentCallResponse, FunctionPayload
from agent.backend.telemetry import (
total_cost_counter,
total_tokens_counter,
tool_calls_counter,
execution_duration_histogram,
tracer
)

# Configure logging to stdout
logging.basicConfig(
Expand Down Expand Up @@ -61,14 +70,27 @@
user_id_to_session_id = {}


@tracer.start_as_current_span("call_agent")
async def call_agent(req: AgentCallRequest) -> AgentCallResponse:
"""Executes one turn of the agent with a query and full chat context."""
logger.info("="*60)
logger.info("call_agent function invoked")
logger.info(f"Question: {req.question}")
logger.info(f"Session ID: {req.session_id}")

start_time = time.time()
labels = {
"agent_id": ORCHESTRATOR_AGENT.name,
"model": ORCHESTRATOR_AGENT.model,
"customer_id": req.session_id or "unknown"
}

try:
span = trace.get_current_span()
if span.is_recording():
span.set_attribute("question", req.question)
span.set_attribute("session_id", req.session_id or "unknown")

assert req.session_id, "Session ID must be provided"

user_id = req.session_id
Expand Down Expand Up @@ -119,6 +141,25 @@ async def call_agent(req: AgentCallRequest) -> AgentCallResponse:
author = event.author
logger.debug(f"Event from author: {author}")

# Update labels with author if available
current_labels = labels.copy()
if author:
current_labels["agent_id"] = author

# Instrument tokens and cost
if hasattr(event, "usage_metadata") and event.usage_metadata:
input_tokens = event.usage_metadata.prompt_token_count or 0
output_tokens = event.usage_metadata.candidates_token_count or 0
total = event.usage_metadata.total_token_count or (input_tokens + output_tokens)

total_tokens_counter.add(total, current_labels)

# Cost calculation (approximate for Gemini 2.5 Flash)
# Input: $0.075 / 1M tokens
# Output: $0.30 / 1M tokens
cost = (input_tokens * 0.075 / 1_000_000) + (output_tokens * 0.30 / 1_000_000)
total_cost_counter.add(cost, current_labels)

logger.debug("Extracting function calls and responses from event")
function_calls = [
e.function_call for e in event.content.parts if e.function_call
Expand All @@ -134,6 +175,7 @@ async def call_agent(req: AgentCallRequest) -> AgentCallResponse:
full_response += text_response

for func_call in function_calls:
tool_calls_counter.add(1, {**current_labels, "function_name": func_call.name}) # type: ignore
logger.info(f"FUNC CALLS: [{author}]: {func_call.name}({json.dumps(func_call.args)})")

for func_resp in function_responses:
Expand Down Expand Up @@ -170,6 +212,9 @@ async def call_agent(req: AgentCallRequest) -> AgentCallResponse:
logger.info("call_agent execution completed successfully")
logger.info("="*60)

duration = time.time() - start_time
execution_duration_histogram.record(duration, labels)

return response
except Exception as e:
logger.error(f"Error in call_agent: {str(e)}", exc_info=True)
Expand Down
Empty file.
Loading