@@ -77,7 +77,7 @@ def __init__(
7777 self .project_dir = project_dir
7878 self .root_dir = root_dir
7979 self .process : subprocess .Popen | None = None
80- self ._status : Literal ["stopped" , "running" , "paused" , "crashed" ] = "stopped"
80+ self ._status : Literal ["stopped" , "running" , "paused" , "crashed" , "pausing" , "paused_graceful" ] = "stopped"
8181 self .started_at : datetime | None = None
8282 self ._output_task : asyncio .Task | None = None
8383 self .yolo_mode : bool = False # YOLO mode for rapid prototyping
@@ -96,11 +96,11 @@ def __init__(
9696 self .lock_file = get_agent_lock_path (self .project_dir )
9797
9898 @property
99- def status (self ) -> Literal ["stopped" , "running" , "paused" , "crashed" ]:
99+ def status (self ) -> Literal ["stopped" , "running" , "paused" , "crashed" , "pausing" , "paused_graceful" ]:
100100 return self ._status
101101
102102 @status .setter
103- def status (self , value : Literal ["stopped" , "running" , "paused" , "crashed" ]):
103+ def status (self , value : Literal ["stopped" , "running" , "paused" , "crashed" , "pausing" , "paused_graceful" ]):
104104 old_status = self ._status
105105 self ._status = value
106106 if old_status != value :
@@ -330,6 +330,12 @@ async def _stream_output(self) -> None:
330330 for help_line in AUTH_ERROR_HELP .strip ().split ('\n ' ):
331331 await self ._broadcast_output (help_line )
332332
333+ # Detect graceful pause status transitions from orchestrator output
334+ if "All agents drained - paused." in decoded :
335+ self .status = "paused_graceful"
336+ elif "Resuming from graceful pause..." in decoded :
337+ self .status = "running"
338+
333339 await self ._broadcast_output (sanitized )
334340
335341 except asyncio .CancelledError :
@@ -377,7 +383,7 @@ async def start(
377383 Returns:
378384 Tuple of (success, message)
379385 """
380- if self .status in ("running" , "paused" ):
386+ if self .status in ("running" , "paused" , "pausing" , "paused_graceful" ):
381387 return False , f"Agent is already { self .status } "
382388
383389 if not self ._check_lock ():
@@ -526,6 +532,12 @@ async def stop(self) -> tuple[bool, str]:
526532
527533 self ._cleanup_stale_features ()
528534 self ._remove_lock ()
535+ # Clean up drain signal file if present
536+ try :
537+ from autoforge_paths import get_pause_drain_path
538+ get_pause_drain_path (self .project_dir ).unlink (missing_ok = True )
539+ except Exception :
540+ pass
529541 self .status = "stopped"
530542 self .process = None
531543 self .started_at = None
@@ -586,6 +598,47 @@ async def resume(self) -> tuple[bool, str]:
586598 logger .exception ("Failed to resume agent" )
587599 return False , f"Failed to resume agent: { e } "
588600
async def graceful_pause(self) -> tuple[bool, str]:
    """Request a graceful pause (drain mode).

    Creates a signal file that the orchestrator polls. Running agents
    finish their current work before the orchestrator enters a paused state.
    On success the status moves to ``"pausing"``.

    Returns:
        Tuple of (success, message). ``success`` is False when there is no
        live agent process in the ``"running"`` state, or when the signal
        file could not be written.
    """
    if not self.process or self.status != "running":
        return False, "Agent is not running"

    # A Popen handle outlives its child: poll() returns the exit code once
    # the process has terminated. Without this check a dead agent would get
    # a drain file and be reported as "pausing" even though nothing is left
    # to drain.
    if self.process.poll() is not None:
        return False, "Agent is not running"

    try:
        # Imported locally, matching how the rest of this file pulls in
        # autoforge_paths helpers.
        from autoforge_paths import get_pause_drain_path

        drain_path = get_pause_drain_path(self.project_dir)
        drain_path.parent.mkdir(parents=True, exist_ok=True)
        # The file's content is the PID of the agent process being paused.
        drain_path.write_text(str(self.process.pid))
        # Flip the status only after the signal file exists, so "pausing"
        # is never reported without a request on disk.
        self.status = "pausing"
        return True, "Graceful pause requested"
    except Exception as e:
        # Boundary handler: report the failure to the caller instead of
        # raising, and keep the traceback in the log.
        logger.exception("Failed to request graceful pause")
        return False, f"Failed to request graceful pause: {e}"
623+
async def graceful_resume(self) -> tuple[bool, str]:
    """Resume from a graceful pause by removing the drain signal file.

    Valid from both ``"pausing"`` (drain requested, not yet complete) and
    ``"paused_graceful"``. On success the status moves back to
    ``"running"``.

    Returns:
        Tuple of (success, message). ``success`` is False when the agent is
        not in a graceful pause state, when its process has already exited,
        or when the signal file could not be removed.
    """
    if not self.process or self.status not in ("pausing", "paused_graceful"):
        return False, "Agent is not in a graceful pause state"

    # A Popen handle outlives its child: poll() returns the exit code once
    # the process has terminated. Reporting "running" for a dead process
    # would be wrong, so refuse here and leave the status untouched for
    # healthcheck() to reconcile.
    if self.process.poll() is not None:
        return False, "Agent is not running"

    try:
        # Imported locally, matching how the rest of this file pulls in
        # autoforge_paths helpers.
        from autoforge_paths import get_pause_drain_path

        # missing_ok: the file may already have been removed, which is fine.
        get_pause_drain_path(self.project_dir).unlink(missing_ok=True)
        self.status = "running"
        return True, "Agent resumed from graceful pause"
    except Exception as e:
        # Boundary handler: report the failure to the caller instead of
        # raising, and keep the traceback in the log.
        logger.exception("Failed to resume from graceful pause")
        return False, f"Failed to resume: {e}"
641+
589642 async def healthcheck (self ) -> bool :
590643 """
591644 Check if the agent process is still alive.
@@ -601,8 +654,14 @@ async def healthcheck(self) -> bool:
601654 poll = self .process .poll ()
602655 if poll is not None :
603656 # Process has terminated
604- if self .status in ("running" , "paused" ):
657+ if self .status in ("running" , "paused" , "pausing" , "paused_graceful" ):
605658 self ._cleanup_stale_features ()
659+ # Clean up drain signal file if present
660+ try :
661+ from autoforge_paths import get_pause_drain_path
662+ get_pause_drain_path (self .project_dir ).unlink (missing_ok = True )
663+ except Exception :
664+ pass
606665 self .status = "crashed"
607666 self ._remove_lock ()
608667 return False
@@ -687,8 +746,14 @@ def cleanup_orphaned_locks() -> int:
687746 if not project_path .exists ():
688747 continue
689748
749+ # Clean up stale drain signal files
750+ from autoforge_paths import get_autoforge_dir , get_pause_drain_path
751+ drain_file = get_pause_drain_path (project_path )
752+ if drain_file .exists ():
753+ drain_file .unlink (missing_ok = True )
754+ logger .info ("Removed stale drain signal file for project '%s'" , name )
755+
690756 # Check both legacy and new locations for lock files
691- from autoforge_paths import get_autoforge_dir
692757 lock_locations = [
693758 project_path / ".agent.lock" ,
694759 get_autoforge_dir (project_path ) / ".agent.lock" ,
0 commit comments