Skip to content

Commit 171b002

Browse files
author
Dylan Huang
authored
aggregated metrics part 6 (#55)
* add --port arg to ep logs * Fix WebSocketManager to reset broadcast task after cancellation * simple tests work * TODO: TestLogsServer * TODO: TestLogsServerIntegration * TODO: test HTML injection - also test TestAsyncWebSocketOperations * add logs server tests * add port parameter tests * use gpt-oss-120b to avoid rate limits * point to port 8000 for dev * woops * fix "uvicorn eval_protocol.utils.logs_server:create_app --factory --reload" * use gpt-oss-120b since less rate limiting (#57) * Aggregated metrics part 7 (#58) * use gpt-oss-120b for less rate limits and faster tests * fix typeerror * Refactor LogsServer event handling and improve integration tests - Moved event_bus.start_listening() to the correct location in LogsServer to ensure it starts listening during the broadcast loop. - Updated integration tests to use multiprocessing for server startup and improved health check validation. - Enhanced test_create_app_factory to be asynchronous and added necessary imports for better clarity. * Enhance test_create_app_factory to verify LogsServer start_loops call - Updated the test_create_app_factory to mock and assert that the start_loops method of LogsServer is called during app creation. - Ensured the test remains asynchronous and maintains clarity in its assertions. * fix * use active logger
1 parent d5ebd03 commit 171b002

9 files changed

Lines changed: 83 additions & 76 deletions

File tree

eval_protocol/utils/logs_server.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,6 @@ def __init__(
248248

249249
# Subscribe to events and start listening for cross-process events
250250
event_bus.subscribe(self._handle_event)
251-
event_bus.start_listening()
252251

253252
logger.info(f"LogsServer initialized on {host}:{port}")
254253

@@ -288,6 +287,12 @@ def _handle_event(self, event_type: str, data: Any) -> None:
288287
data = EvaluationRow(**data)
289288
self.websocket_manager.broadcast_row_upserted(data)
290289

290+
def start_loops(self):
291+
"""Start the broadcast loop and evaluation watcher."""
292+
self.websocket_manager.start_broadcast_loop()
293+
self.evaluation_watcher.start()
294+
event_bus.start_listening()
295+
291296
async def run_async(self):
292297
"""
293298
Run the logs server asynchronously with file watching.
@@ -300,11 +305,7 @@ async def run_async(self):
300305
logger.info(f"Serving files from: {self.build_dir}")
301306
logger.info("WebSocket endpoint available at /ws")
302307

303-
# Start the broadcast loop
304-
self.websocket_manager.start_broadcast_loop()
305-
306-
# Start the evaluation watcher
307-
self.evaluation_watcher.start()
308+
self.start_loops()
308309

309310
config = uvicorn.Config(
310311
self.app,
@@ -336,24 +337,26 @@ def run(self):
336337

337338
def create_app(host: str = "localhost", port: int = 8000, build_dir: Optional[str] = None) -> FastAPI:
338339
"""
339-
Factory function to create a FastAPI app instance.
340+
Factory function to create a FastAPI app instance and start the server with async loops.
340341
341-
This allows uvicorn to call it with parameters and avoids top-level variable instantiation.
342+
This creates a LogsServer instance and starts it in a background thread to ensure
343+
all async loops (WebSocket broadcast, evaluation watching) are running.
342344
343345
Args:
344346
host: Host to bind to
345347
port: Port to bind to
346348
build_dir: Optional custom build directory path
347349
348350
Returns:
349-
FastAPI app instance
351+
FastAPI app instance with server running in background
350352
"""
351353
if build_dir is None:
352354
build_dir = os.path.abspath(
353355
os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "vite-app", "dist")
354356
)
355357

356358
server = LogsServer(host=host, port=port, build_dir=build_dir)
359+
server.start_loops()
357360
return server.app
358361

359362

examples/aime2025_chat_completion/tests/test_aime2025.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
6060
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
6161
rollout_processor=default_single_turn_rollout_processor,
6262
aggregation_method="mean",
63-
threshold_of_success=None,
63+
passed_threshold=None,
6464
num_runs=2,
6565
max_dataset_rows=2,
6666
max_concurrent_rollouts=4,

examples/gpqa/tests/test_gpqa.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def _load_gpqa_messages_from_csv() -> List[List[Message]]:
6969
], # default to low effort; override via CLI plugin
7070
rollout_processor=default_single_turn_rollout_processor,
7171
aggregation_method="mean",
72-
threshold_of_success=None,
72+
passed_threshold=None,
7373
num_runs=8,
7474
mode="pointwise",
7575
)

examples/healthbench/tests/test_evaluation.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
1-
from typing import Dict, List
2-
31
import json
2+
from typing import Dict, List
43

54
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
6-
from eval_protocol.pytest.evaluation_test import evaluation_test
75
from eval_protocol.pytest.default_single_turn_rollout_process import (
86
default_single_turn_rollout_processor,
97
)
8+
from eval_protocol.pytest.evaluation_test import evaluation_test
109

11-
12-
SYSTEM_PROMPT = (
13-
"You are a clinician assistant. Provide safe, accurate guidance."
14-
)
10+
SYSTEM_PROMPT = "You are a clinician assistant. Provide safe, accurate guidance."
1511

1612

1713
# Inline two small HealthBench-like samples and attach tiny rubrics in-memory
@@ -56,7 +52,7 @@
5652
rollout_input_params=[{"temperature": 0.2, "max_tokens": 512}],
5753
rollout_processor=default_single_turn_rollout_processor,
5854
aggregation_method="mean",
59-
threshold_of_success=None,
55+
passed_threshold=None,
6056
num_runs=1,
6157
max_dataset_rows=2,
6258
mode="pointwise",
@@ -91,5 +87,3 @@ def test_healthbench_pointwise(row: EvaluationRow) -> EvaluationRow:
9187
},
9288
)
9389
return row
94-
95-

tests/pytest/test_markdown_highlighting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
2828
@evaluation_test(
2929
input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
3030
dataset_adapter=markdown_dataset_to_evaluation_row,
31-
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
31+
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
3232
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
3333
passed_threshold=0.5,
3434
rollout_processor=default_single_turn_rollout_processor,

tests/pytest/test_pytest_function_calling.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
from typing import Any, Dict, List
3+
34
from eval_protocol.models import EvaluationRow
45
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
56
from eval_protocol.rewards.function_calling import exact_tool_match_reward
@@ -19,7 +20,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu
1920

2021
@evaluation_test(
2122
input_dataset=["tests/pytest/data/function_calling.jsonl"],
22-
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
23+
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
2324
mode="pointwise",
2425
dataset_adapter=function_calling_to_evaluation_row,
2526
rollout_processor=default_single_turn_rollout_processor,

tests/test_logs_server.py

Lines changed: 36 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pathlib import Path
77
from unittest.mock import AsyncMock, MagicMock, Mock, patch
88

9+
import httpx
910
import psutil
1011
import pytest
1112
from fastapi import FastAPI
@@ -332,10 +333,14 @@ async def test_handle_event(self, temp_build_dir):
332333
# The event should be queued for broadcasting
333334
assert not server.websocket_manager._broadcast_queue.empty()
334335

335-
def test_create_app_factory(self, temp_build_dir):
336+
@pytest.mark.asyncio
337+
async def test_create_app_factory(self, temp_build_dir):
336338
"""Test the create_app factory function."""
337-
app = create_app(build_dir=str(temp_build_dir))
338-
assert isinstance(app, FastAPI)
339+
with patch("eval_protocol.utils.logs_server.LogsServer.start_loops") as mock_start_loops:
340+
app = create_app(build_dir=str(temp_build_dir))
341+
assert isinstance(app, FastAPI)
342+
# Verify that start_loops was called
343+
mock_start_loops.assert_called_once()
339344

340345
def test_serve_logs_convenience_function(self, temp_build_dir):
341346
"""Test the serve_logs convenience function."""
@@ -475,13 +480,11 @@ def test_health_endpoint(self, temp_build_dir_with_files):
475480
assert data["status"] == "ok"
476481

477482
@pytest.mark.asyncio
478-
async def test_server_runs_on_specific_port(self, temp_build_dir_with_files):
479-
"""Integration test: verify that LogsServer actually runs on the specified port (async requests)."""
483+
async def test_server_runs_on_specific_port(self):
484+
"""Integration test: verify that LogsServer runs on specified port and handles port parameters correctly."""
485+
import multiprocessing
480486
import socket
481487

482-
import httpx
483-
484-
# Find an available port for testing
485488
def find_free_port():
486489
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
487490
s.bind(("", 0))
@@ -491,54 +494,34 @@ def find_free_port():
491494

492495
test_port = find_free_port()
493496

494-
# Create and start server in background
495-
server = LogsServer(build_dir=str(temp_build_dir_with_files), port=test_port)
496-
497-
# Start server in background task
498-
server_task = asyncio.create_task(server.run_async())
499-
500-
try:
501-
# Wait longer for server to start and be ready
502-
await asyncio.sleep(3)
503-
504-
async with httpx.AsyncClient() as client:
505-
# Test that we can actually connect to the server on the specified port
506-
response = await client.get(f"http://localhost:{test_port}/", timeout=10)
507-
assert response.status_code == 200
508-
assert "Test" in response.text
497+
# Start server with dynamic port and build_dir
498+
server_process = multiprocessing.Process(target=serve_logs, kwargs={"port": test_port}, daemon=True)
499+
server_process.start()
509500

510-
# Test the health endpoint
511-
response = await client.get(f"http://localhost:{test_port}/health", timeout=10)
512-
assert response.status_code == 200
513-
data = response.json()
514-
assert data["status"] == "ok"
515-
516-
finally:
517-
# Clean up
518-
server_task.cancel()
501+
# Wait for server to be ready
502+
for _ in range(20):
519503
try:
520-
await server_task
521-
except asyncio.CancelledError:
504+
response = httpx.get(f"http://localhost:{test_port}/health", timeout=1)
505+
if response.status_code == 200:
506+
break
507+
except httpx.RequestError:
522508
pass
523-
524-
def test_serve_logs_port_parameter_integration(self, temp_build_dir_with_files):
525-
"""Integration test: verify that serve_logs function actually works with port parameter."""
526-
# This test verifies that serve_logs creates LogsServer with the correct port
527-
# without actually starting the server
528-
test_port = 9999
529-
530-
# Use a different approach - mock the LogsServer class and verify the port parameter
531-
with patch("eval_protocol.utils.logs_server.LogsServer") as mock_logs_server_class:
532-
mock_server_instance = Mock()
533-
mock_logs_server_class.return_value = mock_server_instance
534-
535-
# Call serve_logs with specific port
536-
serve_logs(port=test_port)
537-
538-
# Verify that LogsServer was created with the correct port
539-
mock_logs_server_class.assert_called_once_with(port=test_port)
540-
# Verify that the run method was called on the instance
541-
mock_server_instance.run.assert_called_once()
509+
await asyncio.sleep(1)
510+
511+
async with httpx.AsyncClient() as client:
512+
# Test health endpoint
513+
response = await client.get(f"http://localhost:{test_port}/health", timeout=10)
514+
assert response.status_code == 200
515+
data = response.json()
516+
assert data["status"] == "ok"
517+
518+
# Clean up server
519+
if server_process.is_alive():
520+
server_process.terminate()
521+
server_process.join(timeout=2)
522+
if server_process.is_alive():
523+
server_process.kill()
524+
server_process.join(timeout=1)
542525

543526

544527
@pytest.mark.asyncio

vite-app/src/config.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,21 @@ export const discoverServerConfig = async (): Promise<void> => {
4242
return;
4343
}
4444

45+
// Check if we're in Vite development mode
46+
if (import.meta.env.DEV) {
47+
// In dev mode, use localhost:8000
48+
config.websocket.host = 'localhost';
49+
config.websocket.port = '8000';
50+
config.websocket.protocol = 'ws';
51+
52+
config.api.host = 'localhost';
53+
config.api.port = '8000';
54+
config.api.protocol = 'http';
55+
56+
console.log('Using Vite dev config (localhost:8000):', config);
57+
return;
58+
}
59+
4560
// Fallback: Try to discover server configuration from the current location
4661
const currentHost = window.location.hostname;
4762
const currentPort = window.location.port;

vite-app/src/typings.d.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,14 @@ declare module '*.png' {
77
const content: string;
88
export default content;
99
}
10+
/// <reference types="vite/client" />
11+
12+
interface ImportMetaEnv {
13+
readonly DEV: boolean
14+
readonly PROD: boolean
15+
readonly MODE: string
16+
}
17+
18+
interface ImportMeta {
19+
readonly env: ImportMetaEnv
20+
}

0 commit comments

Comments
 (0)