Add Elasticsearch setup and configuration to RemoteRolloutProcessor

Dylan Huang · Dylan Huang · commit 9d66e5f47020 · 2025-09-30T19:56:05.000-07:00
- Introduced setup method in RemoteRolloutProcessor to initialize Elasticsearch if not disabled.
- Added ElasticSearchConfig model for managing Elasticsearch configuration.
- Implemented logic to parse environment variables from a .env file and start Elasticsearch if necessary.
- Updated evaluation_test to call rollout_processor.setup() for proper initialization.
- Modified RolloutProcessor to include a setup method for potential overrides in subclasses.
diff --git a/.gitignore b/.gitignore
@@ -239,5 +239,3 @@ package-lock.json
 package.json
 tau2-bench
 *.err
-
-elastic-start-local/
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -367,6 +367,8 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                         exception_handler_config=exception_handler_config,
                     )
 
+                    rollout_processor.setup()
+
                     async def execute_run(run_idx: int, config: RolloutProcessorConfig):
                         nonlocal all_results
 
diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
@@ -1,16 +1,23 @@
 import asyncio
+import subprocess
 import time
 from typing import Any, Dict, List, Optional, Callable
 
+from dotenv import load_dotenv
 import requests
 
+from eval_protocol.directory_utils import find_eval_protocol_dir
 from eval_protocol.models import EvaluationRow, Status
 from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
-from eval_protocol.types.remote_rollout_processor import InitRequest, RolloutMetadata
+from eval_protocol.types.remote_rollout_processor import ElasticSearchConfig, InitRequest, RolloutMetadata
 from .rollout_processor import RolloutProcessor
 from .types import RolloutProcessorConfig
+import logging
+
 import os
 
+logger = logging.getLogger(__name__)
+
 
 class RemoteRolloutProcessor(RolloutProcessor):
     """
@@ -27,6 +34,8 @@ def __init__(
         poll_interval: float = 1.0,
         timeout_seconds: float = 120.0,
         output_data_loader: Callable[[str], DynamicDataLoader],
+        disable_elastic_search: bool = False,
+        elastic_search_config: Optional[ElasticSearchConfig] = None,
     ):
         # Prefer constructor-provided configuration. These can be overridden via
         # config.kwargs at call time for backward compatibility.
@@ -37,6 +46,58 @@ def __init__(
         self._poll_interval = poll_interval
         self._timeout_seconds = timeout_seconds
         self._output_data_loader = output_data_loader
+        self._disable_elastic_search = disable_elastic_search
+        self._elastic_search_config = elastic_search_config
+
+    def setup(self) -> None:
+        """Prepare the Elasticsearch configuration attached to each init request.
+
+        Nothing happens when Elasticsearch is disabled. A configuration that was
+        passed to the constructor is kept untouched; only when none was given is
+        the local Elasticsearch instance started and its connection details
+        stored on the processor.
+
+        Raises:
+            RuntimeError: Propagated from ``_setup_elastic_search`` when the
+                local instance cannot be started or its ``.env`` file cannot be
+                parsed.
+        """
+        if self._disable_elastic_search:
+            logger.info("Elasticsearch is disabled, skipping setup")
+            return
+        if self._elastic_search_config is not None:
+            # Respect an explicitly supplied configuration instead of silently
+            # replacing it with the credentials of a freshly started local
+            # instance. This also keeps repeated setup() calls cheap.
+            logger.info("Elasticsearch configuration already provided, skipping local setup")
+            return
+        logger.info("Setting up Elasticsearch")
+        self._elastic_search_config = self._setup_elastic_search()
+        logger.info("Elasticsearch setup complete")
+
+    def _parse_elastic_env_file(self, env_file_path: str) -> ElasticSearchConfig:
+        """Build an ElasticSearchConfig from the variables in a .env file.
+
+        The file is loaded into the process environment with ``load_dotenv``
+        and ES_LOCAL_URL / ES_LOCAL_API_KEY are then read back from it.
+
+        Raises:
+            RuntimeError: If ``load_dotenv`` reports failure, or if either
+                variable is missing or empty afterwards.
+        """
+        if not load_dotenv(env_file_path):
+            raise RuntimeError("Failed to load .env file")
+
+        es_url = os.getenv("ES_LOCAL_URL")
+        es_api_key = os.getenv("ES_LOCAL_API_KEY")
+        if not (es_url and es_api_key):
+            raise RuntimeError("Failed to parse ES_LOCAL_API_KEY and ES_LOCAL_URL from .env file")
+
+        return ElasticSearchConfig(url=es_url, api_key=es_api_key)
+
+    def _setup_elastic_search(self) -> ElasticSearchConfig:
+        """Start a local Elasticsearch instance and return its connection config.
+
+        If an ``elastic-start-local`` directory already exists under the
+        eval-protocol directory, its ``start.sh`` is run. Otherwise the
+        Elasticsearch ``start-local`` installer is downloaded and executed with
+        the eval-protocol directory as working directory. In both cases the
+        configuration is read from ``elastic-start-local/.env`` afterwards.
+
+        Returns:
+            The URL and API key parsed from the ``.env`` file.
+
+        Raises:
+            RuntimeError: If the start script or the installer exits with a
+                non-zero status, or if the ``.env`` file cannot be parsed.
+        """
+        eval_protocol_dir = find_eval_protocol_dir()
+        elastic_start_local_dir = os.path.join(eval_protocol_dir, "elastic-start-local")
+        env_file_path = os.path.join(elastic_start_local_dir, ".env")
+
+        # An existing elastic-start-local directory means a previous install:
+        # (re)start it with its own start.sh and read the config from its .env.
+        if os.path.exists(elastic_start_local_dir):
+            from eval_protocol.utils.subprocess_utils import run_script_and_wait
+
+            run_script_and_wait(
+                script_name="start.sh",
+                working_directory=elastic_start_local_dir,
+                inherit_stdout=True,
+            )
+            return self._parse_elastic_env_file(env_file_path)
+
+        # First run: "curl -fsSL https://elastic.co/start-local | sh -s -- --esonly".
+        # subprocess.run drains the pipe while waiting; a bare Popen.wait() with
+        # stdout=PIPE can deadlock once the installer fills the OS pipe buffer.
+        # NOTE(review): presumably the installer creates elastic-start-local/ in
+        # the working directory - confirm against the start-local script.
+        completed = subprocess.run(
+            ["sh", "-c", "curl -fsSL https://elastic.co/start-local | sh -s -- --esonly"],
+            cwd=eval_protocol_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+        if completed.returncode != 0:
+            # Surface the installer output so the failure can be diagnosed.
+            logger.error(
+                "Elasticsearch start-local exited with code %s; output:\n%s",
+                completed.returncode,
+                completed.stdout,
+            )
+            raise RuntimeError("Failed to start Elasticsearch")
+
+        return self._parse_elastic_env_file(env_file_path)
 
     def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
         tasks: List[asyncio.Task[EvaluationRow]] = []
@@ -119,6 +180,7 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
                 tools=row.tools,
                 metadata=meta,
                 model_base_url=model_base_url,
+                elastic_search_config=self._elastic_search_config,
             )
 
             # Fire-and-poll
diff --git a/eval_protocol/pytest/rollout_processor.py b/eval_protocol/pytest/rollout_processor.py
@@ -10,6 +10,10 @@ class RolloutProcessor(ABC):
     Abstract base class for all rollout processor strategies.
     """
 
+    def setup(self) -> None:
+        """Hook for preparing resources; executed once per invocation.
+
+        The base implementation does nothing. Subclasses that need setup
+        override this method.
+        """
+        return None
+
     @abstractmethod
     def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) -> list[asyncio.Task[EvaluationRow]]:
         """Process evaluation rows and return async tasks. Must be implemented by subclasses."""
diff --git a/eval_protocol/types/remote_rollout_processor.py b/eval_protocol/types/remote_rollout_processor.py
@@ -7,6 +7,15 @@
 from eval_protocol.models import Message, Status
 
 
+class ElasticSearchConfig(BaseModel):
+    """
+    Configuration for Elasticsearch.
+
+    Holds the connection details of an Elasticsearch instance so they can be
+    carried in an ``InitRequest``.
+    """
+
+    # Address of the Elasticsearch instance.
+    url: str
+    # API key for the Elasticsearch instance.
+    api_key: str
+
+
 class RolloutMetadata(BaseModel):
     """Metadata for rollout execution."""
 
@@ -21,6 +30,7 @@ class InitRequest(BaseModel):
     """Request model for POST /init endpoint."""
 
     model: str
+    elastic_search_config: Optional[ElasticSearchConfig] = None
     messages: Optional[List[Message]] = None
     tools: Optional[List[Dict[str, Any]]] = None
 
diff --git a/eval_protocol/utils/subprocess_utils.py b/eval_protocol/utils/subprocess_utils.py
@@ -0,0 +1,118 @@
+"""Cross-platform subprocess utilities for running scripts and commands."""
+
+import os
+import platform
+import subprocess
+from typing import Optional
+
+
+def run_script_cross_platform(
+    script_name: str,
+    working_directory: str,
+    capture_output: bool = True,
+    print_output: bool = False,
+    inherit_stdout: bool = False,
+) -> subprocess.Popen:
+    """
+    Run a script in a cross-platform manner.
+
+    The script is started with ``working_directory`` as its current directory
+    on every platform.
+
+    Args:
+        script_name: Name of the script to run (e.g., "start.sh")
+        working_directory: Directory that contains the script and in which it runs
+        capture_output: Whether to capture stdout/stderr through a pipe
+        print_output: Whether to print captured output as it is produced
+        inherit_stdout: Whether to inherit stdout from the parent process
+            (takes precedence over ``capture_output``)
+
+    Returns:
+        subprocess.Popen object for the started process. When ``print_output``
+        and ``capture_output`` are both set, the script's output has already
+        been read to end-of-stream and printed before this function returns.
+
+    Raises:
+        FileNotFoundError: If the script does not exist in ``working_directory``
+    """
+    import shlex
+
+    script_path = os.path.join(working_directory, script_name)
+
+    if not os.path.exists(script_path):
+        raise FileNotFoundError(f"Script not found: {script_path}")
+
+    # Determine stdout/stderr handling
+    if inherit_stdout:
+        stdout = None  # Inherit from parent process
+        stderr = subprocess.STDOUT  # Send stderr to the same (inherited) stdout
+    elif capture_output:
+        stdout = subprocess.PIPE
+        stderr = subprocess.STDOUT  # Merge stderr into the captured stream
+    else:
+        stdout = None
+        stderr = None
+
+    if platform.system() == "Windows":
+        # On Windows, use cmd.exe to run the script
+        cmd = ["cmd.exe", "/c", script_name]
+        use_shell = False
+    else:
+        # On Unix-like systems, make executable and let the shell honour the shebang
+        os.chmod(script_path, 0o755)
+
+        # The path is made absolute because the shell starts in
+        # working_directory, and quoted so that spaces or shell metacharacters
+        # in the path cannot split or alter the command.
+        cmd = shlex.quote(os.path.abspath(script_path))
+        use_shell = True
+
+    process = subprocess.Popen(
+        cmd,
+        cwd=working_directory,
+        stdout=stdout,
+        stderr=stderr,
+        text=True,
+        shell=use_shell,
+    )
+
+    # Print output in real-time if requested (blocks until the stream closes)
+    if print_output and capture_output and process.stdout:
+        for line in process.stdout:
+            print(line, end="")
+
+    return process
+
+
+def run_script_and_wait(
+    script_name: str,
+    working_directory: str,
+    print_output: bool = False,
+    inherit_stdout: bool = False,
+    timeout: Optional[int] = None,
+) -> int:
+    """
+    Run a script and wait for it to complete.
+
+    Args:
+        script_name: Name of the script to run
+        working_directory: Directory to run the script in
+        print_output: Whether to print output in real-time
+        inherit_stdout: Whether to inherit stdout from parent process
+        timeout: Maximum time in seconds to wait for the script to complete
+
+    Returns:
+        Return code of the script (always 0, since failures raise)
+
+    Raises:
+        FileNotFoundError: If the script does not exist (raised by
+            ``run_script_cross_platform``)
+        RuntimeError: If the script exits with a non-zero return code
+        subprocess.TimeoutExpired: If the script times out
+    """
+    process = run_script_cross_platform(
+        script_name=script_name,
+        working_directory=working_directory,
+        # Only pipe the output when it will actually be read and printed.
+        capture_output=print_output and not inherit_stdout,
+        print_output=print_output,
+        inherit_stdout=inherit_stdout,
+    )
+
+    try:
+        returncode = process.wait(timeout=timeout)
+    except subprocess.TimeoutExpired:
+        process.kill()
+        # Reap the killed child so it does not linger as a zombie process.
+        process.wait()
+        raise
+
+    if returncode != 0:
+        raise RuntimeError(f"Script '{script_name}' failed with return code {returncode}")
+    return returncode

Original file line number	Diff line number	Diff line change
`@@ -367,6 +367,8 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] \| None, passed: bo`
`367`	`367`	`exception_handler_config=exception_handler_config,`
`368`	`368`	`)`
`369`	`369`
	`370`	`+ rollout_processor.setup()`
	`371`	`+`
`370`	`372`	`async def execute_run(run_idx: int, config: RolloutProcessorConfig):`
`371`	`373`	`nonlocal all_results`
`372`	`374`