Skip to content

Commit cc0abb2

Browse files
author
Dylan Huang
committed
Enhance singleton lock functionality and file locking in LocalFSDatasetLoggerAdapter
- Updated `is_process_running` to include a timeout parameter, allowing for more flexible process monitoring.
- Implemented file locking mechanisms in `LocalFSDatasetLoggerAdapter` to prevent race conditions during logging operations, ensuring data integrity when multiple processes access log files.
- Added methods for acquiring and releasing file locks, improving the robustness of the logging process.
1 parent 4ff7912 commit cc0abb2

File tree

4 files changed

+601
-70
lines changed

4 files changed

+601
-70
lines changed

eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py

Lines changed: 80 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
11
import json
22
import os
3-
import shutil
4-
import tempfile
3+
import time
54
from datetime import datetime, timezone
5+
from pathlib import Path
66
from typing import TYPE_CHECKING, List, Optional
77

88
from eval_protocol.common_utils import load_jsonl
99
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
1010
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir
11+
from eval_protocol.singleton_lock import acquire_singleton_lock, release_singleton_lock
1112

1213
if TYPE_CHECKING:
1314
from eval_protocol.models import EvaluationRow
1415

1516

1617
class LocalFSDatasetLoggerAdapter(DatasetLogger):
1718
"""
18-
Logger that stores logs in the local filesystem.
19+
Logger that stores logs in the local filesystem with file locking to prevent race conditions.
1920
"""
2021

2122
def __init__(self):
@@ -39,6 +40,44 @@ def current_jsonl_path(self) -> str:
3940
"""
4041
return os.path.join(self.datasets_dir, f"{self.current_date}.jsonl")
4142

43+
def _acquire_file_lock(self, file_path: str, timeout: float = 30.0) -> bool:
    """
    Acquire an inter-process lock for a specific file using the singleton
    lock mechanism, polling until the lock is obtained or ``timeout`` elapses.

    Args:
        file_path: Path to the file to lock.
        timeout: Maximum time to wait for lock acquisition in seconds.
            A value <= 0 still performs a single acquisition attempt.

    Returns:
        True if the lock was acquired, False if the timeout expired first.
    """
    # Lock name is derived from the basename and the lock file lives in the
    # file's own directory, so each file path maps to its own lock.
    lock_name = f"file_lock_{os.path.basename(file_path)}"
    base_dir = Path(os.path.dirname(file_path))

    # monotonic() is immune to wall-clock adjustments; always try at least
    # once so timeout=0 does not silently fail without an attempt.
    deadline = time.monotonic() + timeout
    while True:
        # acquire_singleton_lock returns None on success, or information
        # about the current holder when the lock is already taken.
        if acquire_singleton_lock(base_dir, lock_name) is None:
            return True
        if time.monotonic() >= deadline:
            # Lock remained held by another process for the whole window.
            return False
        time.sleep(0.1)  # brief back-off before retrying
69+
70+
def _release_file_lock(self, file_path: str) -> None:
    """
    Release the inter-process lock previously taken for ``file_path``.

    Args:
        file_path: Path to the file to unlock.
    """
    # Must mirror the name/base-dir derivation used by _acquire_file_lock.
    directory = Path(os.path.dirname(file_path))
    release_singleton_lock(directory, f"file_lock_{os.path.basename(file_path)}")
80+
4281
def log(self, row: "EvaluationRow") -> None:
4382
"""Log a row, updating existing row with same ID or appending new row."""
4483
row_id = row.input_metadata.row_id
@@ -49,25 +88,35 @@ def log(self, row: "EvaluationRow") -> None:
4988
if filename.endswith(".jsonl"):
5089
file_path = os.path.join(self.datasets_dir, filename)
5190
if os.path.exists(file_path):
52-
with open(file_path, "r") as f:
53-
lines = f.readlines()
54-
55-
# Find the line with matching ID
56-
for i, line in enumerate(lines):
91+
if self._acquire_file_lock(file_path):
5792
try:
58-
line_data = json.loads(line.strip())
59-
if line_data["input_metadata"]["row_id"] == row_id:
60-
# Update existing row
61-
lines[i] = row.model_dump_json(exclude_none=True) + os.linesep
62-
with open(file_path, "w") as f:
63-
f.writelines(lines)
64-
return
65-
except json.JSONDecodeError:
66-
continue
93+
with open(file_path, "r") as f:
94+
lines = f.readlines()
95+
96+
# Find the line with matching ID
97+
for i, line in enumerate(lines):
98+
try:
99+
line_data = json.loads(line.strip())
100+
if line_data["input_metadata"]["row_id"] == row_id:
101+
# Update existing row
102+
lines[i] = row.model_dump_json(exclude_none=True) + os.linesep
103+
with open(file_path, "w") as f:
104+
f.writelines(lines)
105+
return
106+
except json.JSONDecodeError:
107+
continue
108+
finally:
109+
self._release_file_lock(file_path)
67110

68111
# If no existing row found, append new row to current file
69-
with open(self.current_jsonl_path, "a") as f:
70-
f.write(row.model_dump_json(exclude_none=True) + os.linesep)
112+
if self._acquire_file_lock(self.current_jsonl_path):
113+
try:
114+
with open(self.current_jsonl_path, "a") as f:
115+
f.write(row.model_dump_json(exclude_none=True) + os.linesep)
116+
finally:
117+
self._release_file_lock(self.current_jsonl_path)
118+
else:
119+
raise RuntimeError(f"Failed to acquire lock for log file {self.current_jsonl_path}")
71120

72121
def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
73122
"""Read rows from all JSONL files in the datasets directory. Also
@@ -82,14 +131,18 @@ def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
82131
for filename in os.listdir(self.datasets_dir):
83132
if filename.endswith(".jsonl"):
84133
file_path = os.path.join(self.datasets_dir, filename)
85-
data = load_jsonl(file_path)
86-
for r in data:
87-
row = EvaluationRow(**r)
88-
if row.input_metadata.row_id not in existing_row_ids:
89-
existing_row_ids.add(row.input_metadata.row_id)
90-
else:
91-
raise ValueError(f"Duplicate Row ID {row.input_metadata.row_id} already exists")
92-
all_rows.append(row)
134+
if self._acquire_file_lock(file_path):
135+
try:
136+
data = load_jsonl(file_path)
137+
for r in data:
138+
row = EvaluationRow(**r)
139+
if row.input_metadata.row_id not in existing_row_ids:
140+
existing_row_ids.add(row.input_metadata.row_id)
141+
else:
142+
raise ValueError(f"Duplicate Row ID {row.input_metadata.row_id} already exists")
143+
all_rows.append(row)
144+
finally:
145+
self._release_file_lock(file_path)
93146

94147
if row_id:
95148
# Filter by row_id if specified

eval_protocol/pytest/eval_watcher.py

Lines changed: 18 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_dir
2424
from eval_protocol.logging_utils import get_logger
2525
from eval_protocol.models import EvaluationRow
26-
from eval_protocol.utils.singleton_lock import (
26+
from eval_protocol.singleton_lock import (
2727
acquire_singleton_lock,
2828
get_lock_file_paths,
2929
get_lock_holder_pid,
@@ -213,7 +213,7 @@ def _start_watcher_process(check_interval: float) -> Optional[int]:
213213
return None
214214

215215

216-
def ensure_singleton_watcher(check_interval: float = 2.0) -> bool:
216+
def ensure_singleton_watcher(check_interval: float = 2.0) -> Optional[int]:
    """
    Ensure the singleton EvaluationWatcher instance exists and is running.
    This function is OS-level global - only one watcher will run across all processes.

    Args:
        check_interval: How often to check for terminated processes (seconds)

    Returns:
        PID of the watcher process if it was started successfully, None if it failed to start
        or if a watcher is already running
    """
    # Check if a watcher is already running before attempting to start a new one
    if is_watcher_running():
        logger.info("🔍 Evaluation watcher is already running")
        return None

    # Start the watcher in a completely independent background process.
    # _start_watcher_process returns None when process creation fails, so
    # report that instead of logging a bogus "PID: None" success line.
    pid = _start_watcher_process(check_interval)
    if pid is None:
        logger.error("❌ Failed to start evaluation watcher: process creation failed")
        return None

    logger.info(f"🔍 Started evaluation watcher in independent background process (PID: {pid})")
    return pid
263237

264238

265239
def is_watcher_running() -> bool:
266240
"""Check if the evaluation watcher is currently running."""
267241
return is_lock_held(get_eval_protocol_dir(), LOCK_NAME)
268242

269243

270-
def get_watcher_pid() -> Optional[int]:
271-
"""Get the PID of the currently running evaluation watcher."""
272-
return get_lock_holder_pid(get_eval_protocol_dir(), LOCK_NAME)
244+
def get_watcher_pid(timeout: float = 10.0) -> Optional[int]:
    """
    Get the PID of the currently running evaluation watcher.

    Polls the lock holder for up to ``timeout`` seconds (not a fixed 10 s),
    returning as soon as a holder PID is observed.

    Args:
        timeout: Maximum time in seconds to keep polling. A value <= 0
            still performs a single lookup.

    Returns:
        The watcher's PID, or None if no watcher held the lock within the
        polling window.
    """
    interval = 0.1
    # monotonic() avoids surprises from wall-clock adjustments; looping with
    # a post-check guarantees at least one lookup even when timeout <= 0.
    deadline = time.monotonic() + timeout
    while True:
        pid = get_lock_holder_pid(get_eval_protocol_dir(), LOCK_NAME)
        if pid is not None:
            return pid
        if time.monotonic() >= deadline:
            return None
        time.sleep(interval)
273254

274255

275256
def stop_watcher() -> bool:
@@ -280,7 +261,7 @@ def stop_watcher() -> bool:
280261
return False
281262

282263
try:
283-
os.kill(pid, signal.SIGTERM)
264+
os.kill(pid, signal.SIGKILL)
284265
logger.info(f"🔍 Sent SIGTERM to evaluation watcher process {pid}")
285266
return True
286267
except OSError as e:

eval_protocol/singleton_lock.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"""
1414

1515
import os
16+
import time
1617
from pathlib import Path
1718
from typing import Optional, Tuple
1819

@@ -124,7 +125,7 @@ def release_singleton_lock(base_dir: Path, lock_name: str) -> None:
124125
pass
125126

126127

127-
def is_process_running(pid: int) -> bool:
128+
def is_process_running(pid: int, timeout: float = 10.0) -> bool:
    """
    Check whether a process is still running, polling for up to ``timeout`` seconds.

    The process is probed with ``os.kill(pid, 0)`` — signal 0 delivers nothing
    and only performs existence/permission checking. If the process disappears
    at any point during the polling window, False is returned immediately.
    NOTE(review): an OSError from a permission failure (EPERM) is treated the
    same as "not running", matching the original implementation.

    Args:
        pid: Process ID to check.
        timeout: How long (in seconds) to keep polling before concluding the
            process is still running. A value <= 0 performs a single probe.

    Returns:
        True if the process was alive for the whole polling window,
        False as soon as it is observed to be gone.
    """

    def _probe(target_pid: int) -> bool:
        try:
            os.kill(target_pid, 0)
            return True
        except OSError:
            return False

    # Always probe at least once so timeout=0 still yields a real answer
    # (the original returned True without ever checking in that case).
    if not _probe(pid):
        return False

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not _probe(pid):
            return False
        # Sleep between probes — the original busy-waited with no sleep and
        # pinned a CPU core for the full timeout whenever the process was alive.
        time.sleep(0.1)
    return True
142151

143152

144153
def is_lock_held(base_dir: Path, lock_name: str) -> bool:

0 commit comments

Comments
 (0)