diff --git a/CHANGELOG.md b/CHANGELOG.md index a1108f0..782066c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,18 @@ # Changelog -All notable changes to the Pose Spatial Studio project will be documented in this file. +All notable changes to the Computer Vision Demo project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [1.4.3] - 9 April 2026 + +### Fixed +- Graceful processor shutdown: `cleanup_processor` now waits for in-flight thread pool work to finish before destroying processor objects, preventing native GPU segfaults during rapid function switching +- Race condition in `switch_model`: block new frames with `_shutting_down` guard during drain-and-swap to prevent stale processor access +- Frontend function switching now awaits backend cleanup acknowledgment (`cleanup_complete` event) before proceeding, with 3s timeout fallback +- Added `beforeunload` handler to emit `cleanup_processor` on page refresh so backend cleans up gracefully +- Rapid function click debounce prevents overlapping cleanup/init cycles + ## [1.4.2] - 4 April 2026 ### Changed diff --git a/CRASHLOG.md b/CRASHLOG.md new file mode 100644 index 0000000..ed59a34 --- /dev/null +++ b/CRASHLOG.md @@ -0,0 +1,44 @@ +# Crash Log + +Backend server crash incidents and root cause analysis. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). + +## [2026-04-09] GPU Segfault — Rapid Processor Switching + +### Summary +Backend server crashed twice in ~1 minute during rapid GPU processor switching. Supervisord auto-restarted both times; server stabilized on third boot. + +### Timeline +| Time (UTC) | Event | +|------------|-------| +| 13:45:29 | Client `x_5wve` initialized YOLO Pose 2D (GPU) | +| 13:45:47 | YOLO cleaned up → race condition: `RuntimeError: Processor not initialized` (frame processed after cleanup) | +| 13:45:50 | YOLO re-initialized (GPU), cleaned up 1s later | +| 13:45:54 | Switched to Object Detection (GPU delegate) | +| ~13:47:00 | **CRASH #1** — process killed, no graceful shutdown logged | +| 13:47:03 | Supervisord restarted app (PID 344) | +| 13:47:39 | Hand Gesture processor: GPU delegate failed → CPU fallback | +| 13:47:57 | **CRASH #2** — process killed again, no shutdown log | +| 13:47:57 | Supervisord restarted app (PID 404) | +| 13:48:00 | MediaPipe pose: GPU failed → CPU fallback. Object Detection initialized then cleaned up | +| 13:48:23 | All processors cleaned up. Server stable since | + +### Root Cause +**Segfault in native GPU code** during rapid model load/unload cycles. + +- No Python traceback for either crash — process was killed by signal (SIGSEGV), not a Python exception +- Both crashes followed rapid GPU allocation/deallocation: YOLO → cleanup → YOLO → cleanup → Object Detection, all within ~25 seconds +- After second restart, MediaPipe GPU delegates failed and fell back to CPU; server has been stable since — confirms GPU native code as the culprit +- `nvidia-smi` showed no GPU processes after recovery (everything running on CPU) +- MediaPipe GPU delegate consistently fails on this container: `ImageCloneCalculator: GPU processing is disabled in build flags` — this is a build-level issue, not runtime + +### Environment +- GPU: NVIDIA GeForce RTX 5080 (16GB), Driver 580.95.05, CUDA 13.0 +- Deployed version: v1.2.2 (local is v1.4.0) +- Container: `pose-spatial-studio-backend`, managed by supervisord (`autorestart=true`, `startretries=10`) +- ONNX Runtime providers: TensorRT, CUDA, CPU + +### Related +- Race condition in YOLO processor cleanup (`Processor not initialized` after cleanup) may be fixed in local v1.4.0 +- MediaPipe GPU delegate is broken at the build level — processors fall back to CPU, which masks this crash path going forward diff --git a/README.md b/README.md index b5907bf..a1697d6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Pose Spatial Studio +# Computer Vision Demo Real-time pose estimation, object detection, and 3D avatar rendering with WebSocket streaming. diff --git a/backend/app.py b/backend/app.py index 8c120b4..8fa499a 100644 --- a/backend/app.py +++ b/backend/app.py @@ -40,16 +40,16 @@ def _get_gpu_info() -> dict: @asynccontextmanager async def lifespan(app: FastAPI): gpu_info = _get_gpu_info() - logger.info(f"Starting Pose Spatial Studio v1.2.2") + logger.info(f"Starting Computer Vision Demo v1.4.0") logger.info(f"Host: {config.HOST}:{config.PORT}") logger.info(f"GPU: {gpu_info['device']} | CUDA: {gpu_info['cuda_available']} | Providers: {gpu_info['providers']}") yield logger.info("Shutting down server") if websocket_handler: - websocket_handler.cleanup_all() + await websocket_handler.cleanup_all() app = FastAPI( - title="Pose Spatial Studio API", + title="Computer Vision Demo API", version="1.4.0", description="Real-time 3D pose estimation and visualization", lifespan=lifespan @@ -77,7 +77,7 @@ async def lifespan(app: FastAPI): @app.get("/") async def root(): return { - "message": "Pose Spatial Studio API", + "message": "Computer Vision Demo API", "version": "1.4.0", "status": "running" } diff --git a/backend/core/websocket_handler.py b/backend/core/websocket_handler.py index 9d187bb..79a347e 100644 --- a/backend/core/websocket_handler.py +++ b/backend/core/websocket_handler.py @@ -34,6 +34,8 @@ def __init__(self, sio): self._stream_metrics: Dict[str, Dict[str, Any]] = {} self._last_stats_log: float = 0 self._log_handlers: Dict[str, SocketIOLogHandler] = {} # sid -> handler + self._processor_done: Dict[str, asyncio.Event] = {} # processor_id -> Event (set when thread pool finishes) + self._shutting_down: set = set() # processor_ids currently being cleaned up logger.info(f"Thread pool initialized with {POSE_WORKERS} workers") self.setup_handlers() @@ -52,7 +54,7 @@ async def disconnect(sid): processors_to_cleanup = [pid for pid in self.processors.keys() if pid.startswith(f"{sid}_")] logger.debug(f"[DISC] Found {len(processors_to_cleanup)} processors to cleanup: {processors_to_cleanup}") for processor_id in processors_to_cleanup: - self.cleanup_processor(processor_id) + await self.cleanup_processor(processor_id) # Clean up log handler if subscribed handler = self._log_handlers.pop(sid, None) @@ -183,11 +185,8 @@ async def process_frame(sid, data): timestamp = data.get('timestamp_ms', 0) logger.debug(f"[FRAME] Received frame for {stream_id} t={timestamp}") - if processor_id not in self.processors: - logger.warning(f"[ERROR] Processor {processor_id} not found") - await self.sio.emit('error', { - 'message': 'Stream not initialized. Call initialize_stream first.' - }, room=sid) + if processor_id not in self.processors or processor_id in self._shutting_down: + logger.warning(f"[FRAME] Processor {processor_id} not found or shutting down") return frame = self._decode_frame(data.get('frame')) @@ -213,7 +212,8 @@ async def process_frame(sid, data): async def cleanup_processor(sid, data): stream_id = data.get('stream_id') processor_id = f"{sid}_{stream_id}" - self.cleanup_processor(processor_id) + await self.cleanup_processor(processor_id) + await self.sio.emit('cleanup_complete', {'stream_id': stream_id}, room=sid) @self.sio.event async def switch_model(sid, data): @@ -307,8 +307,8 @@ async def switch_model(sid, data): }, room=sid) return - # Drain pending frames and wait for in-flight processing to finish - # before swapping, to avoid using a cleaned-up processor + # Block new frames, drain pending, and wait for in-flight processing + self._shutting_down.add(processor_id) self._latest_frames.pop(processor_id, None) while processor_id in self._active_streams: await asyncio.sleep(0.01) @@ -317,6 +317,7 @@ async def switch_model(sid, data): if current_pose: current_pose.cleanup() pipeline['pose_processor'] = new_pose + self._shutting_down.discard(processor_id) logger.info(f"[SWITCH] Stream {stream_id} switched to {new_processor_type}") @@ -408,8 +409,13 @@ async def solve_ik(sid, data): async def _process_latest_frame(self, processor_id: str): """Process the latest frame for a stream, then check for newer ones.""" + done_event = asyncio.Event() + self._processor_done[processor_id] = done_event try: while processor_id in self._latest_frames: + if processor_id in self._shutting_down: + break + # Grab the latest frame and clear the buffer frame, timestamp, sid, stream_id = self._latest_frames.pop(processor_id) @@ -456,9 +462,13 @@ async def _process_latest_frame(self, processor_id: str): logger.error(f"Error in _process_latest_frame: {e}", exc_info=True) finally: self._active_streams.discard(processor_id) + done_event.set() def _run_pipeline_sync(self, processor_pipeline, frame, timestamp, processor_id=None): """Run the processor pipeline synchronously. Called from thread pool.""" + if processor_id and processor_id in self._shutting_down: + return frame, None + t0 = time.perf_counter() pose_data = None processed_frame = frame @@ -510,8 +520,24 @@ def _decode_frame(self, frame_data: str) -> np.ndarray: logger.error(f"Error decoding frame: {e}") return None - def cleanup_processor(self, processor_id: str): + async def cleanup_processor(self, processor_id: str): + if processor_id in self._shutting_down: + return # already being cleaned up + self._shutting_down.add(processor_id) + + # Stop new frames from being queued self._latest_frames.pop(processor_id, None) + + # Wait for in-flight thread pool work to finish + if processor_id in self._active_streams: + done_event = self._processor_done.get(processor_id) + if done_event: + try: + await asyncio.wait_for(done_event.wait(), timeout=5.0) + except asyncio.TimeoutError: + logger.warning(f"[CLEANUP] Timed out waiting for {processor_id} to drain") + + # Now safe to destroy processor objects self._active_streams.discard(processor_id) self._stream_metrics.pop(processor_id, None) if processor_id in self.processors: @@ -523,14 +549,17 @@ def cleanup_processor(self, processor_id: str): logger.debug(f"[CLEANUP] Remaining processors: {list(self.processors.keys())}") else: logger.debug(f"[CLEANUP] Processor {processor_id} not found (already cleaned up?)") + + self._processor_done.pop(processor_id, None) + self._shutting_down.discard(processor_id) - def cleanup_all(self): + async def cleanup_all(self): for _, handler in list(self._log_handlers.items()): logging.getLogger().removeHandler(handler) handler.close() self._log_handlers.clear() for processor_id in list(self.processors.keys()): - self.cleanup_processor(processor_id) + await self.cleanup_processor(processor_id) self._executor.shutdown(wait=False) def get_stats(self) -> Dict[str, Any]: diff --git a/frontend/index.html b/frontend/index.html index cf34b69..cf19bea 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -3,7 +3,7 @@
-