eval-protocol
diff --git a/eval_protocol/adapters/fireworks_tracing.py b/eval_protocol/adapters/fireworks_tracing.py
Lines changed: 9 additions & 2 deletions
diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py
Lines changed: 3 additions & 1 deletion
diff --git a/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py b/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py
Lines changed: 4 additions & 2 deletions
diff --git a/eval_protocol/event_bus/sqlite_event_bus_database.py b/eval_protocol/event_bus/sqlite_event_bus_database.py
Lines changed: 48 additions & 4 deletions
diff --git a/eval_protocol/mcp/mcp_multi_client.py b/eval_protocol/mcp/mcp_multi_client.py
Lines changed: 0 additions & 3 deletions
diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py
Lines changed: 5 additions & 1 deletion
diff --git a/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py b/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py
Lines changed: 27 additions & 21 deletions
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
Lines changed: 11 additions & 7 deletions
@@ -253,17 +253,24 @@ def __init__(
         project_id: Optional[str] = None,
         base_url: str = "https://tracing.fireworks.ai",
         timeout: int = 300,
+        api_key: Optional[str] = None,
     ):
         """Initialize the Fireworks Tracing adapter.
 
         Args:
             project_id: Optional project ID. If not provided, uses the default project configured on the server.
             base_url: The base URL of the tracing proxy (default: https://tracing.fireworks.ai)
             timeout: Request timeout in seconds (default: 300)
+            api_key: Optional API key. If not provided, falls back to FIREWORKS_API_KEY environment variable.
         """
         self.project_id = project_id
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
+        self._api_key = api_key
+
+    def _get_api_key(self) -> Optional[str]:
+        """Return the instance-level API key when it is truthy, else the FIREWORKS_API_KEY environment variable (None if unset)."""
+        return self._api_key or os.environ.get("FIREWORKS_API_KEY")
 
     def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
         """Fetch logs from Fireworks tracing gateway /logs endpoint.
@@ -276,7 +283,7 @@ def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -
         from ..common_utils import get_user_agent
 
         headers = {
-            "Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}",
+            "Authorization": f"Bearer {self._get_api_key()}",
             "User-Agent": get_user_agent(),
         }
         params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
@@ -407,7 +414,7 @@ def get_evaluation_rows(
         from ..common_utils import get_user_agent
 
         headers = {
-            "Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}",
+            "Authorization": f"Bearer {self._get_api_key()}",
             "User-Agent": get_user_agent(),
         }
 
 
@@ -284,8 +284,10 @@ def main():
         from dotenv import load_dotenv
 
         # .env.dev for development-specific overrides, .env for general
+        # Use explicit paths to avoid find_dotenv() searching up the directory tree
+        # and potentially finding a different .env file (e.g., in some other repo)
         load_dotenv(dotenv_path=Path(".") / ".env.dev", override=True)
-        load_dotenv(override=True)
+        load_dotenv(dotenv_path=Path(".") / ".env", override=True)
     except ImportError:
         pass
 
 
@@ -7,6 +7,7 @@
 from eval_protocol.event_bus.sqlite_event_bus_database import (
     SQLITE_HARDENED_PRAGMAS,
     check_and_repair_database,
+    connect_with_retry,
     execute_with_sqlite_retry,
 )
 from eval_protocol.models import EvaluationRow
@@ -42,9 +43,10 @@ class EvaluationRow(BaseModel):  # type: ignore
 
         self._EvaluationRow = EvaluationRow
 
-        self._db.connect()
+        # Connect with retry logic that properly handles pragma execution failures
+        connect_with_retry(self._db)
         # Use safe=True to avoid errors when tables/indexes already exist
-        self._db.create_tables([EvaluationRow], safe=True)
+        execute_with_sqlite_retry(lambda: self._db.create_tables([EvaluationRow], safe=True))
 
     @property
     def db_path(self) -> str:
 
@@ -11,8 +11,8 @@
 
 
 # Retry configuration for database operations
-SQLITE_RETRY_MAX_TRIES = 5
-SQLITE_RETRY_MAX_TIME = 30  # seconds
+SQLITE_RETRY_MAX_TRIES = 10
+SQLITE_RETRY_MAX_TIME = 60  # seconds
 
 
 def _is_database_locked_error(e: Exception) -> bool:
@@ -55,6 +55,49 @@ def _execute() -> T:
     return _execute()
 
 
+def connect_with_retry(db: SqliteDatabase) -> None:
+    """
+    Connect to the database with retry logic, ensuring pragmas are always applied.
+
+    Peewee's connect() method sets the connection state *before* executing pragmas
+    (in _initialize_connection). If pragma execution fails with "database is locked",
+    the connection is marked as open but pragmas are not applied. Subsequent calls
+    to connect(reuse_if_open=True) would see the connection as already open and
+    skip pragma execution entirely.
+
+    Each attempt closes any open connection first, and again if connect() raises, so every retry starts fresh.
+
+    Args:
+        db: The SqliteDatabase instance to connect (an already-open connection is closed and reopened)
+    Raises:
+        OperationalError: re-raised at once for non-lock errors, or after the retry budget is exhausted
+    """
+
+    @backoff.on_exception(
+        backoff.expo,
+        OperationalError,
+        max_tries=SQLITE_RETRY_MAX_TRIES,
+        max_time=SQLITE_RETRY_MAX_TIME,
+        giveup=lambda e: not _is_database_locked_error(e),  # retry lock errors only
+        jitter=backoff.full_jitter,
+    )
+    def _connect() -> None:
+        try:
+            # Close any open (possibly half-initialized) connection first so that
+            # connect() opens a fresh one and runs the pragmas again
+            if not db.is_closed():
+                db.close()
+            db.connect()
+        except OperationalError:
+            # If this attempt fails (e.g., during pragma execution), close whatever
+            # was left open, then re-raise so the backoff decorator can decide
+            if not db.is_closed():
+                db.close()
+            raise
+
+    _connect()
+
+
 # SQLite pragmas for hardened concurrency safety
 SQLITE_HARDENED_PRAGMAS = {
     "journal_mode": "wal",  # Write-Ahead Logging for concurrent reads/writes
@@ -181,9 +224,10 @@ class Event(BaseModel):  # type: ignore
             processed = BooleanField(default=False)  # Track if event has been processed
 
         self._Event = Event
-        self._db.connect()
+        # Connect with retry logic that properly handles pragma execution failures
+        connect_with_retry(self._db)
         # Use safe=True to avoid errors when tables already exist
-        self._db.create_tables([Event], safe=True)
+        execute_with_sqlite_retry(lambda: self._db.create_tables([Event], safe=True))
 
     def publish_event(self, event_type: str, data: Any, process_id: str) -> None:
         """Publish an event to the database."""
 
@@ -13,7 +13,6 @@ class FunctionLike(BaseModel):
     parameters: Any = None
 
 
-from dotenv import load_dotenv
 from mcp import ClientSession, StdioServerParameters
 from mcp.client.stdio import stdio_client
 from mcp.client.streamable_http import streamablehttp_client
@@ -26,8 +25,6 @@ class FunctionLike(BaseModel):
     MCPMultiClientConfiguration,
 )
 
-load_dotenv()  # load environment variables from .env
-
 
 class MCPMultiClient:
     """
 
@@ -22,6 +22,7 @@
 from openai.types import CompletionUsage
 from eval_protocol.pytest.rollout_processor import RolloutProcessor
 from eval_protocol.pytest.types import Dataset, RolloutProcessorConfig
+from eval_protocol.pytest.utils import normalize_fireworks_model_for_litellm
 from pydantic import BaseModel
 from typing import Optional
 
@@ -251,8 +252,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             """Process a single row with agent rollout."""
             start_time = time.perf_counter()
 
+            # Normalize Fireworks model names for LiteLLM routing
+            completion_params = normalize_fireworks_model_for_litellm(row.input_metadata.completion_params) or {}
+            row.input_metadata.completion_params = completion_params
             agent = Agent(
-                model=row.input_metadata.completion_params["model"],
+                model=completion_params["model"],
                 row=row,
                 config_path=config.mcp_config_path,
                 logger=config.logger,
 
@@ -11,6 +11,7 @@
 from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest.rollout_processor import RolloutProcessor
 from eval_protocol.pytest.types import RolloutProcessorConfig
+from eval_protocol.pytest.utils import normalize_fireworks_model_for_litellm
 
 from eval_protocol.pytest.default_agent_rollout_processor import Agent
 from klavis import Klavis
@@ -30,15 +31,15 @@ def __init__(
         self.server_name = server_name
         self.initialize_data_factory = initialize_data_factory
         self.klavis_client = Klavis(api_key=os.environ.get("KLAVIS_API_KEY"))
-        
+
     def _init_sandbox(self) -> CreateSandboxResponse:
         try:
             server_name_enum = SandboxMcpServer(self.server_name)
             return self.klavis_client.sandbox.create_sandbox(server_name=server_name_enum)
         except Exception as e:
             logger.error(f"Error creating sandbox: {str(e)}", exc_info=True)
             raise
-    
+
     @staticmethod
     def create_mcp_config(server_url: str, server_key: str = "main", auth_token: str | None = None) -> str:
         """Create a temporary MCP config file and return its path."""
@@ -47,26 +48,24 @@ def create_mcp_config(server_url: str, server_key: str = "main", auth_token: str
                 server_key: {
                     "url": server_url,
                     "transport": "streamable_http",
-                    **({"authorization": f"Bearer {auth_token}"} if auth_token else {})
+                    **({"authorization": f"Bearer {auth_token}"} if auth_token else {}),
                 }
             }
         }
-        
+
         # Create a temp file that persists for the session
         fd, path = tempfile.mkstemp(suffix=".json", prefix="mcp_config_")
-        with os.fdopen(fd, 'w') as f:
+        with os.fdopen(fd, "w") as f:
             json.dump(config, f)
         return path
 
-    def __call__(
-        self, rows: List[EvaluationRow], config: RolloutProcessorConfig
-    ) -> List[asyncio.Task[EvaluationRow]]:
+    def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
         """Process evaluation rows with Klavis sandbox lifecycle management"""
         semaphore = config.semaphore
 
         async def process_row(row: EvaluationRow) -> EvaluationRow:
             """Process a single row with complete sandbox lifecycle"""
-            
+
             start_time = time.perf_counter()
             agent: Agent | None = None
             temp_config_path: str | None = None
@@ -88,25 +87,32 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                         if row.input_metadata is not None
                         else None
                     )
-                
+
                 if init_data:
-                    logger.info(f"Initializing {self.server_name} sandbox {sandbox.sandbox_id}")
+                    logger.info(f"Initializing {self.server_name} sandbox {sandbox.sandbox_id}")  # pyright: ignore[reportOptionalMemberAccess]
                     initialize_method = getattr(
-                        self.klavis_client.sandbox, f"initialize_{sandbox.server_name.value}_sandbox"
+                        self.klavis_client.sandbox,
+                        f"initialize_{sandbox.server_name.value}_sandbox",  # pyright: ignore[reportOptionalMemberAccess]
                     )
-                    init_response = initialize_method(sandbox_id=sandbox.sandbox_id, **init_data)
+                    init_response = initialize_method(sandbox_id=sandbox.sandbox_id, **init_data)  # pyright: ignore[reportOptionalMemberAccess]
                     logger.info(f"Initialization response: {init_response}")
-                    
+
                 # Step 2: Create temporary MCP config with sandbox URL
                 temp_config_path = self.create_mcp_config(
-                    server_url=sandbox.server_url, server_key=sandbox.server_name.value
+                    server_url=sandbox.server_url,  # pyright: ignore[reportOptionalMemberAccess]
+                    server_key=sandbox.server_name.value,  # pyright: ignore[reportOptionalMemberAccess]
                 )
                 logger.info(f"MCP config created: {temp_config_path}")
 
                 # Step 3: Run agent with sandbox MCP server
-                logger.info(f"Running agent for row {row.execution_metadata.rollout_id} with {self.server_name} sandbox")
+                logger.info(
+                    f"Running agent for row {row.execution_metadata.rollout_id} with {self.server_name} sandbox"
+                )
+                # Normalize Fireworks model names for LiteLLM routing
+                completion_params = normalize_fireworks_model_for_litellm(row.input_metadata.completion_params) or {}
+                row.input_metadata.completion_params = completion_params
                 agent = Agent(
-                    model=row.input_metadata.completion_params["model"],
+                    model=completion_params["model"],
                     row=row,
                     config_path=temp_config_path,
                     logger=config.logger,
@@ -124,16 +130,16 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 logger.info(f"Agent execution completed for row {row.execution_metadata.rollout_id}")
 
                 # Step 4: Export sandbox data
-                dump_method = getattr(self.klavis_client.sandbox, f"dump_{sandbox.server_name.value}_sandbox")
-                dump_response = dump_method(sandbox_id=sandbox.sandbox_id)
+                dump_method = getattr(self.klavis_client.sandbox, f"dump_{sandbox.server_name.value}_sandbox")  # pyright: ignore[reportOptionalMemberAccess]
+                dump_response = dump_method(sandbox_id=sandbox.sandbox_id)  # pyright: ignore[reportOptionalMemberAccess]
                 sandbox_data = dump_response.data
                 logger.info(f"Sandbox data: {sandbox_data}")
 
                 # Store sandbox data in row metadata for evaluation
                 if not row.execution_metadata.extra:
                     row.execution_metadata.extra = {}
                 row.execution_metadata.extra["sandbox_data"] = sandbox_data
-                row.execution_metadata.extra["sandbox_id"] = sandbox.sandbox_id
+                row.execution_metadata.extra["sandbox_id"] = sandbox.sandbox_id  # pyright: ignore[reportOptionalMemberAccess]
                 row.execution_metadata.extra["server_name"] = self.server_name
 
             except Exception as e:
@@ -149,7 +155,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                     await agent.mcp_client.cleanup()
                 if temp_config_path and os.path.exists(temp_config_path):
                     os.unlink(temp_config_path)
-                
+
                 # Release sandbox
                 if sandbox and sandbox.sandbox_id:
                     try:
 
@@ -14,6 +14,7 @@
 from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest.rollout_processor import RolloutProcessor
 from eval_protocol.pytest.types import RolloutProcessorConfig, ServerMode
+from eval_protocol.pytest.utils import normalize_fireworks_model_for_litellm
 
 
 class MCPServerManager:
@@ -280,17 +281,20 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
                         "Cannot retry without existing server/environments. Call with start_server=True first."
                     )
 
-        model_id = str((config.completion_params.get("model") if config.completion_params else None) or "gpt-4o-mini")
-        temperature = config.completion_params.get("temperature", 0.0)
-        max_tokens = config.completion_params.get("max_tokens", 4096)
+        # Normalize Fireworks model names for LiteLLM routing
+        completion_params = normalize_fireworks_model_for_litellm(config.completion_params) or {}
+        # Update all rows with normalized completion_params
+        for row in rows:
+            row.input_metadata.completion_params = completion_params
+        model_id = str(completion_params.get("model") or "gpt-4o-mini")
+        temperature = completion_params.get("temperature", 0.0)
+        max_tokens = completion_params.get("max_tokens", 4096)
 
         # Pass all other completion_params (e.g. stream=True) via kwargs
         other_params = {
-            k: v
-            for k, v in (config.completion_params or {}).items()
-            if k not in ["model", "temperature", "max_tokens", "extra_body"]
+            k: v for k, v in completion_params.items() if k not in ["model", "temperature", "max_tokens", "extra_body"]
         }
-        extra_body = config.completion_params.get("extra_body", {}) or {}
+        extra_body = completion_params.get("extra_body", {}) or {}
 
         self.policy = ep.LiteLLMPolicy(
             model_id=model_id,