Skip to content

Commit 4460732

Browse files
committed
add local changes
1 parent eb7dd00 commit 4460732

13 files changed

Lines changed: 907 additions & 65 deletions

Makefile

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,101 @@
11
# Directories scanned by the lint/format/typecheck helper targets.
# Simple (`:=`) assignment: the value is static text, so expand it once at parse
# time instead of on every reference (recursive `=` buys nothing here).
PYTHON_DIRS := tests examples scripts eval_protocol
# Python runner; override on the command line (e.g. `make PY=python3`) if uv is unavailable.
PY ?= uv run python

.PHONY: clean build dist upload test lint typecheck format release sync-docs version tag-version show-version bump-major bump-minor bump-patch full-release quick-release

## -----------------------------
## Local Langfuse + LiteLLM E2E
## -----------------------------

.PHONY: local-install local-langfuse-up local-langfuse-up-local local-langfuse-wait local-litellm-up local-litellm-smoke local-adapter-smoke local-generate-traces local-generate-chinook local-eval local-eval-fireworks-only local-quick-run
11+
# Install this package editable with the `langfuse` extra so the adapter imports resolve.
local-install:
	uv pip install -e ".[langfuse]"
13+
14+
# 1) Start Langfuse per official docs (run from Langfuse repo). Here we just export env.
# NOTE(review): the assignments below live in the recipe's own shell, so nothing is
# actually exported to the caller's environment — this target only prints the effective
# defaults for copy/paste. Confirm that is the intent before relying on it.
local-langfuse-up:
	@echo "Ensure you started Langfuse via docker compose as per docs."
	@echo "Docs: https://langfuse.com/self-hosting/deployment/docker-compose"
	@echo "Exporting LANGFUSE env vars for SDK..."
	LANGFUSE_PUBLIC_KEY=$${LANGFUSE_PUBLIC_KEY:-local}; \
	LANGFUSE_SECRET_KEY=$${LANGFUSE_SECRET_KEY:-local}; \
	LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
	printf "LANGFUSE_PUBLIC_KEY=%s\nLANGFUSE_SECRET_KEY=%s\nLANGFUSE_HOST=%s\n" $$LANGFUSE_PUBLIC_KEY $$LANGFUSE_SECRET_KEY $$LANGFUSE_HOST
23+
24+
# Start Langfuse using local compose file (detached; UI served on http://localhost:3000).
local-langfuse-up-local:
	docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d
27+
28+
# Wait until Langfuse UI responds.
# Polls up to 60 times with a 2s sleep (~2 minutes). HTTP 200 or 302 counts as "up"
# (302 presumably being a login redirect — confirm against the deployed version).
# The whole recipe is one continued shell line, so `exit 0`/`exit 1` end the recipe.
local-langfuse-wait:
	LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
	echo "Waiting for $$LANGFUSE_HOST ..."; \
	for i in $$(seq 1 60); do \
		code=$$(curl -s -o /dev/null -w "%{http_code}" $$LANGFUSE_HOST); \
		if [ "$$code" = "200" ] || [ "$$code" = "302" ]; then echo "Langfuse is up (HTTP $$code)"; exit 0; fi; \
		sleep 2; \
	done; \
	echo "Langfuse did not become ready in time."; exit 1
38+
39+
# 2) Start LiteLLM router (requires litellm installed). Keep foreground.
# Defaults the proxy key to "local-demo-key", echoes it for the smoke test,
# and runs the router in the foreground on port 4000 (Ctrl-C to stop).
local-litellm-up:
	LITELLM_API_KEY=$${LITELLM_API_KEY:-local-demo-key}; \
	printf "LITELLM_API_KEY=%s\n" $$LITELLM_API_KEY; \
	LITELLM_API_KEY=$$LITELLM_API_KEY uv run litellm --config examples/local_langfuse_litellm_ollama/litellm-config.yaml --port 4000
44+
45+
# 2b) Smoke test LiteLLM endpoints: list models, then issue one chat completion.
# Requires LITELLM_API_KEY in the environment (see local-litellm-up).
# The trailing `| cat` after `head` was a no-op (useless use of cat) and is removed;
# `head` already writes straight to stdout.
local-litellm-smoke:
	@test -n "$$LITELLM_API_KEY" || (echo "LITELLM_API_KEY not set" && exit 1)
	curl -s -H "Authorization: Bearer $$LITELLM_API_KEY" http://127.0.0.1:4000/v1/models | head -n 5
	curl -s \
		-H "Authorization: Bearer $$LITELLM_API_KEY" \
		-H "Content-Type: application/json" \
		http://127.0.0.1:4000/v1/chat/completions \
		-d '{"model":"ollama/llama3.1","messages":[{"role":"user","content":"Say hi"}]}' \
		| head -n 40
55+
56+
# 3) Seed one trace into Langfuse
# NOTE(review): no seeding rule follows this comment — the seed target appears to have
# been removed in this change. Use local-generate-chinook to populate traces instead.

# 4) Adapter smoke test (fetch 1 row)
# Probes the Langfuse UI first (200/302 = reachable), then runs an inline Python
# snippet through $(PY) that pulls a single evaluation row via the Langfuse adapter.
# Keys default to "local" when unset; the whole recipe is one continued shell line.
local-adapter-smoke:
	LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
	code=$$(curl -s -o /dev/null -w "%{http_code}" $$LANGFUSE_HOST); \
	if [ "$$code" != "200" ] && [ "$$code" != "302" ]; then \
		echo "Langfuse not reachable at $$LANGFUSE_HOST (HTTP $$code). Start it per docs."; \
		exit 1; \
	fi; \
	LANGFUSE_PUBLIC_KEY=$${LANGFUSE_PUBLIC_KEY:-local}; \
	LANGFUSE_SECRET_KEY=$${LANGFUSE_SECRET_KEY:-local}; \
	LANGFUSE_PUBLIC_KEY=$$LANGFUSE_PUBLIC_KEY LANGFUSE_SECRET_KEY=$$LANGFUSE_SECRET_KEY LANGFUSE_HOST=$$LANGFUSE_HOST \
	$(PY) -c "from eval_protocol.adapters.langfuse import create_langfuse_adapter; a=create_langfuse_adapter(); rows=a.get_evaluation_rows(limit=1, sample_size=1); print('Fetched rows:', len(rows))"
70+
71+
# Generate realistic traces into Langfuse (Chinook) using Fireworks models.
# The recipe here was a byte-for-byte duplicate of local-generate-chinook, so this
# target now delegates to it — same commands run, but the two can no longer drift apart.
local-generate-traces: local-generate-chinook
76+
77+
# Force-run Chinook generator with stub DB and Langfuse observe.
# Requires FIREWORKS_API_KEY. Extras are installed best-effort (`|| true` keeps the
# target going if they are already present or the install fails); CHINOOK_USE_STUB_DB=1
# avoids any external database.
local-generate-chinook:
	@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
	uv pip install -e ".[pydantic,fireworks,chinook]" >/dev/null || true
	CHINOOK_USE_STUB_DB=1 uv run pytest tests/chinook/langfuse/generate_traces.py -q
82+
83+
# NOTE(review): the fallback generator that needed no external DBs appears to have been
# removed — use local-generate-chinook with CHINOOK_USE_STUB_DB=1 instead.
84+
85+
# 5) Run the local evaluation test (uses Fireworks as judge; requires FIREWORKS_API_KEY)
# Routes rollouts through the local LiteLLM/Ollama setup configured via env vars.
local-eval:
	@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
	uv run pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q
89+
90+
# Run evaluation by calling Fireworks directly (skip LiteLLM router)
# Same flow as local-eval but rollouts go straight to the Fireworks API.
local-eval-fireworks-only:
	@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
	uv run pytest eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py -k test_llm_judge_fireworks_only -q
94+
95+
# One-shot: assumes Langfuse is already up externally and LiteLLM already running in another shell.
# Fix: the previous first prerequisite, `local-seed-langfuse`, is not defined anywhere
# in this Makefile (nor listed in .PHONY), so `make local-quick-run` aborted with
# "No rule to make target 'local-seed-langfuse'". Seed via the Chinook generator instead.
local-quick-run: local-generate-chinook local-adapter-smoke local-eval
	@echo "Done. Check Langfuse UI for scores."
98+
499

5100
# Remove Python build artifacts (wheels, sdists, egg metadata).
clean:
	rm -rf build/ dist/ *.egg-info/

README.md

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,66 @@ With hundreds of models and configs, you need objective data to choose the right
1818
- **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
1919
- **Local UI**: Pivot/table views for real-time analysis
2020

21-
## ⚡ Quickstart (no labels needed)
21+
## ⚡ Quickstart (local traces + local models)
22+
23+
This end-to-end walkthrough starts a local Langfuse (Docker Compose), seeds application traces, then runs a model picker with a Fireworks-based judge against your local models (Ollama or llama.cpp). See `examples/local_langfuse_litellm_ollama/README.md` for a full guide.
24+
25+
### 1) Start Langfuse locally (compose file included)
26+
27+
```bash
28+
# From repo root
29+
docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d
30+
export LANGFUSE_HOST=http://localhost:3000
31+
export LANGFUSE_PUBLIC_KEY=... # create in Langfuse UI
32+
export LANGFUSE_SECRET_KEY=...
33+
export LANGFUSE_ENVIRONMENT=local
34+
```
35+
36+
Open `http://localhost:3000` and confirm the UI loads.
37+
38+
### 2) Seed traces (PydanticAgent, no external DB required)
39+
40+
```bash
41+
export FIREWORKS_API_KEY=...
42+
export CHINOOK_USE_STUB_DB=1
43+
make -C . local-generate-chinook
44+
```
45+
46+
Optionally verify the adapter can fetch rows:
47+
48+
```bash
49+
make -C . local-adapter-smoke
50+
```
51+
52+
### 3) Evaluate with local models
53+
54+
Ollama only, direct (bypass LiteLLM):
55+
56+
```bash
57+
export DIRECT_OLLAMA=1
58+
export OLLAMA_BASE_URL=http://127.0.0.1:11434
59+
export OLLAMA_MODELS='ollama/llama3.1' # comma-separated to compare multiple
60+
export FIREWORKS_API_KEY=...
61+
# Optional debug to verify calls and logging
62+
export EP_DEBUG=1
63+
pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q
64+
```
65+
66+
Optional: via LiteLLM router (Ollama/llama.cpp):
67+
68+
```bash
69+
export LITELLM_API_KEY=local-demo-key
70+
litellm --config examples/local_langfuse_litellm_ollama/litellm-config.yaml --port 4000
71+
export LITELLM_BASE_URL=http://127.0.0.1:4000
72+
export OLLAMA_MODELS='ollama/llama3.1,ollama/llama3.2:1b'
73+
# Optional debug to verify router calls and logging
74+
export EP_DEBUG=1
75+
pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q
76+
```
77+
78+
The pytest output includes local links for a leaderboard and row-level traces at `http://localhost:8000`.
79+
80+
## Basic AHA judge example (remote APIs)
2281

2382
Install with your tracing platform extras and set API keys:
2483

@@ -104,6 +163,12 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
104163
uv add eval-protocol
105164
```
106165

166+
## 🧑‍💻 Developer notes
167+
168+
- The `eval-protocol logs` command currently may show no rows in some local setups even when Langfuse traces exist; use the local UI links printed by pytest and the Langfuse UI to inspect results. We’re tracking improvements to unify local logs with external trace sources.
169+
- For Langfuse seeding, prefer `tests/chinook/langfuse/generate_traces.py` with `CHINOOK_USE_STUB_DB=1` to avoid external DBs.
170+
- To compare multiple local models, set `OLLAMA_MODELS` (comma-separated) or use the LiteLLM config for mix-and-match backends.
171+
107172
## 📚 Resources
108173

109174
- **[Documentation](https://evalprotocol.io)** – Guides and API reference

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import time
55
from typing import List
66

7-
from litellm import acompletion
87
from typing import Dict
98

109
from eval_protocol.dataset_logger import default_logger
@@ -67,10 +66,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
6766

6867
_litellm = importlib.import_module("litellm")
6968
acompletion = getattr(_litellm, "acompletion")
69+
if os.getenv("EP_DEBUG", "0").strip() == "1":
70+
try:
71+
dbg_model = request_params.get("model")
72+
dbg_base = request_params.get("base_url")
73+
print(
74+
f"[EP-Debug] LiteLLM call: model={dbg_model}, base_url={dbg_base}, tools={'yes' if 'tools' in request_params else 'no'}"
75+
)
76+
except Exception:
77+
pass
7078
response = await acompletion(**request_params)
71-
7279
assistant_content = response.choices[0].message.content or ""
7380
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
81+
usage = {
82+
"prompt_tokens": response.usage.prompt_tokens,
83+
"completion_tokens": response.usage.completion_tokens,
84+
"total_tokens": response.usage.total_tokens,
85+
}
7486

7587
converted_tool_calls = None
7688
if tool_calls:
@@ -112,16 +124,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
112124
]
113125

114126
row.execution_metadata.usage = CompletionUsage(
115-
prompt_tokens=response.usage.prompt_tokens,
116-
completion_tokens=response.usage.completion_tokens,
117-
total_tokens=response.usage.total_tokens,
127+
prompt_tokens=usage["prompt_tokens"],
128+
completion_tokens=usage["completion_tokens"],
129+
total_tokens=usage["total_tokens"],
118130
)
119131

120132
row.messages = messages
121133

122134
row.execution_metadata.duration_seconds = time.perf_counter() - start_time
123135

124136
default_logger.log(row)
137+
if os.getenv("EP_DEBUG", "0").strip() == "1":
138+
try:
139+
print(
140+
f"[EP-Debug] Logged row to EP: rollout_id={row.execution_metadata.rollout_id}, invoc_id={row.execution_metadata.invocation_id}, msg_count={len(row.messages)}"
141+
)
142+
except Exception:
143+
pass
125144
return row
126145

127146
semaphore = config.semaphore

eval_protocol/quickstart/llm_judge.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
33
"""
44

5+
import os
56
from typing import Optional
67

78
from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
@@ -85,6 +86,15 @@ async def aha_judge(
8586
# Upload score to adapter if provided
8687
if adapter and row.evaluation_result and row.evaluation_result.is_score_valid:
8788
model_name = row.input_metadata.completion_params.get("model", "unknown_model")
88-
adapter.upload_score(row, model_name)
89+
try:
90+
if os.getenv("EP_DEBUG", "0").strip() == "1":
91+
print(
92+
f"[EP-Debug] Uploading score to Langfuse: model={model_name}, score={row.evaluation_result.score}"
93+
)
94+
adapter.upload_score(row, model_name)
95+
if os.getenv("EP_DEBUG", "0").strip() == "1":
96+
print("[EP-Debug] Upload score success")
97+
except Exception as e:
98+
print(f"[EP-Debug] Upload score failed: {repr(e)}")
8999

90100
return row
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""Evaluate Langfuse traces with Fireworks-only rollout (no LiteLLM router).
2+
3+
This uses SingleTurnRolloutProcessor to call Fireworks directly via the
4+
litellm client (not the proxy server) and then runs the AHA judge (also on
5+
Fireworks by default). Scores are pushed back to Langfuse.
6+
"""
7+
8+
from datetime import datetime
9+
import os
10+
11+
import pytest
12+
13+
from eval_protocol import (
14+
DynamicDataLoader,
15+
EvaluationRow,
16+
SingleTurnRolloutProcessor,
17+
aha_judge,
18+
create_langfuse_adapter,
19+
evaluation_test,
20+
multi_turn_assistant_to_ground_truth,
21+
)
22+
23+
24+
def langfuse_fireworks_data_generator() -> list[EvaluationRow]:
25+
adapter = create_langfuse_adapter()
26+
return adapter.get_evaluation_rows(
27+
environment=os.getenv("LANGFUSE_ENVIRONMENT", "local"),
28+
limit=int(os.getenv("LANGFUSE_LIMIT", "100")),
29+
sample_size=int(os.getenv("LANGFUSE_SAMPLE_SIZE", "20")),
30+
include_tool_calls=bool(int(os.getenv("LANGFUSE_INCLUDE_TOOL_CALLS", "1"))),
31+
sleep_between_gets=float(os.getenv("LANGFUSE_SLEEP", "0.5")),
32+
max_retries=int(os.getenv("LANGFUSE_MAX_RETRIES", "3")),
33+
from_timestamp=None,
34+
to_timestamp=datetime.utcnow(),
35+
)
36+
37+
38+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
39+
@pytest.mark.skipif(
40+
not os.getenv("FIREWORKS_API_KEY"),
41+
reason="Requires FIREWORKS_API_KEY",
42+
)
43+
@pytest.mark.parametrize(
44+
"completion_params",
45+
[
46+
{
47+
"model": os.getenv("FIREWORKS_COMPLETION_MODEL", "accounts/fireworks/models/kimi-k2-instruct"),
48+
"api_key": os.getenv("FIREWORKS_API_KEY"),
49+
"base_url": os.getenv("FIREWORKS_BASE_URL", "https://api.fireworks.ai/inference/v1"),
50+
"temperature": float(os.getenv("FIREWORKS_TEMPERATURE", "0.2")),
51+
"max_tokens": int(os.getenv("FIREWORKS_MAX_TOKENS", "2048")),
52+
},
53+
],
54+
)
55+
@evaluation_test(
56+
data_loaders=DynamicDataLoader(
57+
generators=[langfuse_fireworks_data_generator],
58+
preprocess_fn=multi_turn_assistant_to_ground_truth,
59+
),
60+
rollout_processor=SingleTurnRolloutProcessor(),
61+
max_concurrent_evaluations=int(os.getenv("FIREWORKS_MAX_CONCURRENCY", "2")),
62+
)
63+
async def test_llm_judge_fireworks_only(row: EvaluationRow) -> EvaluationRow:
64+
adapter = create_langfuse_adapter()
65+
return await aha_judge(row, adapter=adapter)

0 commit comments

Comments
 (0)