From 99c2c1ce92e876adf6d82cdf6f0a9d5dad128081 Mon Sep 17 00:00:00 2001 From: Ying Date: Thu, 9 Apr 2026 14:25:02 +0100 Subject: [PATCH] fix: graceful shutdown; instruction for camera --- CHANGELOG.md | 11 +++- CRASHLOG.md | 44 +++++++++++++ README.md | 2 +- backend/app.py | 8 +-- backend/core/websocket_handler.py | 53 +++++++++++---- frontend/index.html | 2 +- frontend/src/App.tsx | 2 +- frontend/src/components/Controls.tsx | 34 ++++++++-- frontend/src/components/View2D.tsx | 2 +- frontend/src/components/View3D.tsx | 2 +- frontend/src/hooks/useWebSocket.ts | 84 ++++++++++++++++-------- frontend/src/services/socketService.ts | 46 ++++++++++++- tests/README.md | 2 +- tests/package.json | 2 +- tests/playwright.config.ts | 2 +- tests/specs/avatar-voice-control.spec.ts | 2 +- tests/specs/pose-validation.spec.ts | 4 +- 17 files changed, 236 insertions(+), 66 deletions(-) create mode 100644 CRASHLOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md index a1108f0..782066c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,18 @@ # Changelog -All notable changes to the Pose Spatial Studio project will be documented in this file. +All notable changes to the Computer Vision Demo project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
+## [1.4.3] - 9 April 2026 + +### Fixed +- Graceful processor shutdown: `cleanup_processor` now waits for in-flight thread pool work to finish before destroying processor objects, preventing native GPU segfaults during rapid function switching +- Race condition in `switch_model`: block new frames with `_shutting_down` guard during drain-and-swap to prevent stale processor access +- Frontend function switching now awaits backend cleanup acknowledgment (`cleanup_complete` event) before proceeding, with 3s timeout fallback +- Added `beforeunload` handler to emit `cleanup_processor` on page refresh so backend cleans up gracefully +- Re-entrancy guard on function selection ignores clicks while a switch is already in flight, preventing overlapping cleanup/init cycles + ## [1.4.2] - 4 April 2026 ### Changed diff --git a/CRASHLOG.md b/CRASHLOG.md new file mode 100644 index 0000000..ed59a34 --- /dev/null +++ b/CRASHLOG.md @@ -0,0 +1,44 @@ +# Crash Log + +Backend server crash incidents and root cause analysis. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). + +## [2026-04-09] GPU Segfault — Rapid Processor Switching + +### Summary +Backend server crashed twice in ~1 minute during rapid GPU processor switching. Supervisord auto-restarted both times; server stabilized on third boot. 
+ +### Timeline +| Time (UTC) | Event | +|------------|-------| +| 13:45:29 | Client `x_5wve` initialized YOLO Pose 2D (GPU) | +| 13:45:47 | YOLO cleaned up → race condition: `RuntimeError: Processor not initialized` (frame processed after cleanup) | +| 13:45:50 | YOLO re-initialized (GPU), cleaned up 1s later | +| 13:45:54 | Switched to Object Detection (GPU delegate) | +| ~13:47:00 | **CRASH #1** — process killed, no graceful shutdown logged | +| 13:47:03 | Supervisord restarted app (PID 344) | +| 13:47:39 | Hand Gesture processor: GPU delegate failed → CPU fallback | +| 13:47:57 | **CRASH #2** — process killed again, no shutdown log | +| 13:47:57 | Supervisord restarted app (PID 404) | +| 13:48:00 | MediaPipe pose: GPU failed → CPU fallback. Object Detection initialized then cleaned up | +| 13:48:23 | All processors cleaned up. Server stable since | + +### Root Cause +**Segfault in native GPU code** during rapid model load/unload cycles. + +- No Python traceback for either crash — process was killed by signal (SIGSEGV), not a Python exception +- Both crashes followed rapid GPU allocation/deallocation: YOLO → cleanup → YOLO → cleanup → Object Detection, all within ~25 seconds +- After second restart, MediaPipe GPU delegates failed and fell back to CPU; server has been stable since — confirms GPU native code as the culprit +- `nvidia-smi` showed no GPU processes after recovery (everything running on CPU) +- MediaPipe GPU delegate consistently fails on this container: `ImageCloneCalculator: GPU processing is disabled in build flags` — this is a build-level issue, not runtime + +### Environment +- GPU: NVIDIA GeForce RTX 5080 (16GB), Driver 580.95.05, CUDA 13.0 +- Deployed version: v1.2.2 (local is v1.4.0) +- Container: `pose-spatial-studio-backend`, managed by supervisord (`autorestart=true`, `startretries=10`) +- ONNX Runtime providers: TensorRT, CUDA, CPU + +### Related +- Race condition in YOLO processor cleanup (`Processor not initialized` after cleanup) may be 
fixed in local v1.4.0 +- MediaPipe GPU delegate is broken at the build level — processors fall back to CPU, which masks this crash path going forward diff --git a/README.md b/README.md index b5907bf..a1697d6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Pose Spatial Studio +# Computer Vision Demo Real-time pose estimation, object detection, and 3D avatar rendering with WebSocket streaming. diff --git a/backend/app.py b/backend/app.py index 8c120b4..8fa499a 100644 --- a/backend/app.py +++ b/backend/app.py @@ -40,16 +40,16 @@ def _get_gpu_info() -> dict: @asynccontextmanager async def lifespan(app: FastAPI): gpu_info = _get_gpu_info() - logger.info(f"Starting Pose Spatial Studio v1.2.2") + logger.info(f"Starting Computer Vision Demo v1.4.0") logger.info(f"Host: {config.HOST}:{config.PORT}") logger.info(f"GPU: {gpu_info['device']} | CUDA: {gpu_info['cuda_available']} | Providers: {gpu_info['providers']}") yield logger.info("Shutting down server") if websocket_handler: - websocket_handler.cleanup_all() + await websocket_handler.cleanup_all() app = FastAPI( - title="Pose Spatial Studio API", + title="Computer Vision Demo API", version="1.4.0", description="Real-time 3D pose estimation and visualization", lifespan=lifespan @@ -77,7 +77,7 @@ async def lifespan(app: FastAPI): @app.get("/") async def root(): return { - "message": "Pose Spatial Studio API", + "message": "Computer Vision Demo API", "version": "1.4.0", "status": "running" } diff --git a/backend/core/websocket_handler.py b/backend/core/websocket_handler.py index 9d187bb..79a347e 100644 --- a/backend/core/websocket_handler.py +++ b/backend/core/websocket_handler.py @@ -34,6 +34,8 @@ def __init__(self, sio): self._stream_metrics: Dict[str, Dict[str, Any]] = {} self._last_stats_log: float = 0 self._log_handlers: Dict[str, SocketIOLogHandler] = {} # sid -> handler + self._processor_done: Dict[str, asyncio.Event] = {} # processor_id -> Event (set when thread pool finishes) + self._shutting_down: set = 
set() # processor_ids currently being cleaned up logger.info(f"Thread pool initialized with {POSE_WORKERS} workers") self.setup_handlers() @@ -52,7 +54,7 @@ async def disconnect(sid): processors_to_cleanup = [pid for pid in self.processors.keys() if pid.startswith(f"{sid}_")] logger.debug(f"[DISC] Found {len(processors_to_cleanup)} processors to cleanup: {processors_to_cleanup}") for processor_id in processors_to_cleanup: - self.cleanup_processor(processor_id) + await self.cleanup_processor(processor_id) # Clean up log handler if subscribed handler = self._log_handlers.pop(sid, None) @@ -183,11 +185,8 @@ async def process_frame(sid, data): timestamp = data.get('timestamp_ms', 0) logger.debug(f"[FRAME] Received frame for {stream_id} t={timestamp}") - if processor_id not in self.processors: - logger.warning(f"[ERROR] Processor {processor_id} not found") - await self.sio.emit('error', { - 'message': 'Stream not initialized. Call initialize_stream first.' - }, room=sid) + if processor_id not in self.processors or processor_id in self._shutting_down: + logger.warning(f"[FRAME] Processor {processor_id} not found or shutting down") return frame = self._decode_frame(data.get('frame')) @@ -213,7 +212,8 @@ async def process_frame(sid, data): async def cleanup_processor(sid, data): stream_id = data.get('stream_id') processor_id = f"{sid}_{stream_id}" - self.cleanup_processor(processor_id) + await self.cleanup_processor(processor_id) + await self.sio.emit('cleanup_complete', {'stream_id': stream_id}, room=sid) @self.sio.event async def switch_model(sid, data): @@ -307,8 +307,8 @@ async def switch_model(sid, data): }, room=sid) return - # Drain pending frames and wait for in-flight processing to finish - # before swapping, to avoid using a cleaned-up processor + # Block new frames, drain pending, and wait for in-flight processing + self._shutting_down.add(processor_id) self._latest_frames.pop(processor_id, None) while processor_id in self._active_streams: await 
asyncio.sleep(0.01) @@ -317,6 +317,7 @@ async def switch_model(sid, data): if current_pose: current_pose.cleanup() pipeline['pose_processor'] = new_pose + self._shutting_down.discard(processor_id) logger.info(f"[SWITCH] Stream {stream_id} switched to {new_processor_type}") @@ -408,8 +409,13 @@ async def solve_ik(sid, data): async def _process_latest_frame(self, processor_id: str): """Process the latest frame for a stream, then check for newer ones.""" + done_event = asyncio.Event() + self._processor_done[processor_id] = done_event try: while processor_id in self._latest_frames: + if processor_id in self._shutting_down: + break + # Grab the latest frame and clear the buffer frame, timestamp, sid, stream_id = self._latest_frames.pop(processor_id) @@ -456,9 +462,13 @@ async def _process_latest_frame(self, processor_id: str): logger.error(f"Error in _process_latest_frame: {e}", exc_info=True) finally: self._active_streams.discard(processor_id) + done_event.set() def _run_pipeline_sync(self, processor_pipeline, frame, timestamp, processor_id=None): """Run the processor pipeline synchronously. 
Called from thread pool.""" + if processor_id and processor_id in self._shutting_down: + return frame, None + t0 = time.perf_counter() pose_data = None processed_frame = frame @@ -510,8 +520,24 @@ def _decode_frame(self, frame_data: str) -> np.ndarray: logger.error(f"Error decoding frame: {e}") return None - def cleanup_processor(self, processor_id: str): + async def cleanup_processor(self, processor_id: str): + if processor_id in self._shutting_down: + return # already being cleaned up + self._shutting_down.add(processor_id) + + # Stop new frames from being queued self._latest_frames.pop(processor_id, None) + + # Wait for in-flight thread pool work to finish + if processor_id in self._active_streams: + done_event = self._processor_done.get(processor_id) + if done_event: + try: + await asyncio.wait_for(done_event.wait(), timeout=5.0) + except asyncio.TimeoutError: + logger.warning(f"[CLEANUP] Timed out waiting for {processor_id} to drain") + + # Now safe to destroy processor objects self._active_streams.discard(processor_id) self._stream_metrics.pop(processor_id, None) if processor_id in self.processors: @@ -523,14 +549,17 @@ def cleanup_processor(self, processor_id: str): logger.debug(f"[CLEANUP] Remaining processors: {list(self.processors.keys())}") else: logger.debug(f"[CLEANUP] Processor {processor_id} not found (already cleaned up?)") + + self._processor_done.pop(processor_id, None) + self._shutting_down.discard(processor_id) - def cleanup_all(self): + async def cleanup_all(self): for _, handler in list(self._log_handlers.items()): logging.getLogger().removeHandler(handler) handler.close() self._log_handlers.clear() for processor_id in list(self.processors.keys()): - self.cleanup_processor(processor_id) + await self.cleanup_processor(processor_id) self._executor.shutdown(wait=False) def get_stats(self) -> Dict[str, Any]: diff --git a/frontend/index.html b/frontend/index.html index cf34b69..cf19bea 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ 
-3,7 +3,7 @@ - <title>Pose Spatial Studio</title> + <title>Computer Vision Demo</title>
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index e7f8de6..436e5d7 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -55,7 +55,7 @@ function App() {
-

Pose Spatial Studio

+

Computer Vision Demo

{connected ? 'Connected' : 'Disconnected'} diff --git a/frontend/src/components/Controls.tsx b/frontend/src/components/Controls.tsx index 319f6b9..478013b 100644 --- a/frontend/src/components/Controls.tsx +++ b/frontend/src/components/Controls.tsx @@ -33,6 +33,7 @@ export function Controls({ connected, socket }: ControlsProps) { const comingSoonClicks = useRef(0); const comingSoonTimer = useRef>(undefined); + const switchingRef = useRef(false); const needsCamera = functionDef?.processorType !== null; @@ -43,14 +44,33 @@ export function Controls({ connected, socket }: ControlsProps) { } }, [activeFunction]); - const handleFunctionSelect = (fnId: string) => { - // If switching away from active stream, clean up - if (isStreamActive && socket) { - socket.emit('cleanup_processor', { stream_id: ACTIVE_STREAM_ID }); - setStreamActive(false); - setBackendResult(null); + const handleFunctionSelect = async (fnId: string) => { + if (switchingRef.current) return; + switchingRef.current = true; + try { + // If switching away from active stream, wait for backend cleanup + if (isStreamActive && socket) { + await new Promise((resolve) => { + const onComplete = (data: { stream_id: string }) => { + if (data.stream_id === ACTIVE_STREAM_ID) { + socket.off('cleanup_complete', onComplete); + resolve(); + } + }; + socket.on('cleanup_complete', onComplete); + socket.emit('cleanup_processor', { stream_id: ACTIVE_STREAM_ID }); + setTimeout(() => { + socket.off('cleanup_complete', onComplete); + resolve(); + }, 3000); + }); + setStreamActive(false); + setBackendResult(null); + } + selectFunction(fnId as any); + } finally { + switchingRef.current = false; } - selectFunction(fnId as any); }; const handleStart = async () => { diff --git a/frontend/src/components/View2D.tsx b/frontend/src/components/View2D.tsx index e01abb4..03534e5 100644 --- a/frontend/src/components/View2D.tsx +++ b/frontend/src/components/View2D.tsx @@ -53,7 +53,7 @@ export function View2D({ socket }: View2DProps) { color: 
'rgba(255,255,255,0.4)', fontSize: 14, }}> - Waiting for camera... + Choose a Streaming Source on the left panel (A Camera or a Video File)
) : ( - Waiting for camera... + Choose a Streaming Source on the left panel (A Camera or a Video File) )} diff --git a/frontend/src/hooks/useWebSocket.ts b/frontend/src/hooks/useWebSocket.ts index 185700c..7dbe4d8 100644 --- a/frontend/src/hooks/useWebSocket.ts +++ b/frontend/src/hooks/useWebSocket.ts @@ -22,41 +22,67 @@ export function useWebSocket(): UseWebSocketReturn { const flushActiveStream = () => { socket?.emit('flush_stream', { stream_id: ACTIVE_STREAM_ID }); useAppStore.getState().setBackendResult(null); - lastUpdateTime.current = 0; + lastUpdateTime.current = Date.now(); }; useEffect(() => { - const socketInstance = socketService.connect(); - setSocket(socketInstance); - - socketInstance.on('connect', () => { - console.log('[WS] Connected, sid:', socketInstance.id); - setConnected(true); - }); - socketInstance.on('disconnect', (reason) => { - console.warn('[WS] Disconnected, reason:', reason); - setConnected(false); - }); - - socketInstance.on('pose_result', (result: BackendResult) => { - try { - const age = Date.now() - result.timestamp_ms; - if (age > RESULT_TIMEOUT_MS) return; - - if (result.timestamp_ms >= lastUpdateTime.current) { - lastUpdateTime.current = result.timestamp_ms; - setBackendResult(result); + let cancelled = false; + + const init = async () => { + await socketService.resolveUrl(); + if (cancelled) return; + + const socketInstance = socketService.connect(); + setSocket(socketInstance); + + socketInstance.on('connect', () => { + console.log('[WS] Connected, sid:', socketInstance.id); + setConnected(true); + }); + socketInstance.on('disconnect', (reason) => { + console.warn('[WS] Disconnected, reason:', reason); + setConnected(false); + }); + + socketInstance.on('pose_result', (result: BackendResult) => { + try { + if (!useAppStore.getState().isStreamActive) return; + + const age = Date.now() - result.timestamp_ms; + if (age > RESULT_TIMEOUT_MS) return; + + if (result.timestamp_ms >= lastUpdateTime.current) { + lastUpdateTime.current = 
result.timestamp_ms; + setBackendResult(result); + } + } catch (err) { + console.error('[WS] Error in pose_result handler:', err); } - } catch (err) { - console.error('[WS] Error in pose_result handler:', err); - } - }); + }); + + socketInstance.on('error', (error) => + console.error('[WS] Socket error:', error), + ); - socketInstance.on('error', (error) => - console.error('[WS] Socket error:', error), - ); + const handleBeforeUnload = () => { + if (socketInstance.connected) { + socketInstance.emit('cleanup_processor', { stream_id: ACTIVE_STREAM_ID }); + } + }; + window.addEventListener('beforeunload', handleBeforeUnload); + cleanupBeforeUnload = handleBeforeUnload; + }; - return () => socketService.disconnect(); + let cleanupBeforeUnload: (() => void) | null = null; + init(); + + return () => { + cancelled = true; + if (cleanupBeforeUnload) { + window.removeEventListener('beforeunload', cleanupBeforeUnload); + } + socketService.disconnect(); + }; }, [setBackendResult]); return { socket, connected, flushActiveStream }; diff --git a/frontend/src/services/socketService.ts b/frontend/src/services/socketService.ts index a4bc49f..e0f48d7 100644 --- a/frontend/src/services/socketService.ts +++ b/frontend/src/services/socketService.ts @@ -4,12 +4,54 @@ import { io, Socket } from 'socket.io-client'; // Local dev: set in .env.local | Production: set in GitHub Actions secrets const SOCKET_URL = import.meta.env.VITE_BACKEND_URL; +// Dev-only fallback chain: local → staging → production +const DEV_FALLBACK_URLS = [ + 'http://localhost:49101', + 'https://pose-backend-staging.yingliu.site', + 'https://pose-backend.yingliu.site', +]; + +async function probeHealth(url: string, timeoutMs = 3000): Promise { + try { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + const res = await fetch(`${url}/health`, { signal: controller.signal }); + clearTimeout(timer); + return res.ok; + } catch { + return false; + } +} + class 
SocketService { private socket: Socket | null = null; + private resolvedUrl: string | null = null; + + /** Resolve the backend URL once (dev: fallback chain, prod: env var). */ + async resolveUrl(): Promise { + if (this.resolvedUrl) return this.resolvedUrl; + + if (import.meta.env.DEV) { + for (const url of DEV_FALLBACK_URLS) { + console.log(`[Socket] Trying ${url}...`); + if (await probeHealth(url)) { + console.log(`[Socket] Using backend: ${url}`); + this.resolvedUrl = url; + return url; + } + } + // All failed — fall back to default so reconnection can retry + console.warn('[Socket] No backend reachable, defaulting to', SOCKET_URL); + } + + this.resolvedUrl = SOCKET_URL; + return SOCKET_URL; + } - connect(): Socket { + connect(url?: string): Socket { if (!this.socket) { - this.socket = io(SOCKET_URL, { + const target = url ?? this.resolvedUrl ?? SOCKET_URL; + this.socket = io(target, { transports: ['websocket'], reconnection: true, reconnectionAttempts: 5, diff --git a/tests/README.md b/tests/README.md index 991a533..d431f94 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,6 +1,6 @@ # E2E Tests (Playwright) -Automated UI tests for the Pose Spatial Studio application. +Automated UI tests for the Computer Vision Demo application. 
## Structure diff --git a/tests/package.json b/tests/package.json index 17bdecf..278d292 100644 --- a/tests/package.json +++ b/tests/package.json @@ -1,7 +1,7 @@ { "name": "pose-spatial-studio-tests", "version": "1.1.0", - "description": "Playwright E2E tests for Pose Spatial Studio", + "description": "Playwright E2E tests for Computer Vision Demo", "private": true, "scripts": { "test": "playwright test", diff --git a/tests/playwright.config.ts b/tests/playwright.config.ts index 77063af..1b51ab7 100644 --- a/tests/playwright.config.ts +++ b/tests/playwright.config.ts @@ -1,7 +1,7 @@ import { defineConfig, devices } from '@playwright/test'; /** - * Playwright configuration for Pose Spatial Studio + * Playwright configuration for Computer Vision Demo * See https://playwright.dev/docs/test-configuration */ export default defineConfig({ diff --git a/tests/specs/avatar-voice-control.spec.ts b/tests/specs/avatar-voice-control.spec.ts index d03e981..956b6ed 100644 --- a/tests/specs/avatar-voice-control.spec.ts +++ b/tests/specs/avatar-voice-control.spec.ts @@ -15,7 +15,7 @@ test.describe('Avatar Voice Control', () => { }); test('should load app and show connected status', async ({ page }) => { - await expect(page).toHaveTitle(/Pose Spatial Studio/i); + await expect(page).toHaveTitle(/Computer Vision Demo/i); await expect(page.locator('.connection-status')).toBeVisible(); await expect(page.locator('text=Connected').first()).toBeVisible({ timeout: 15_000 }); }); diff --git a/tests/specs/pose-validation.spec.ts b/tests/specs/pose-validation.spec.ts index 42f4cef..bce0c71 100644 --- a/tests/specs/pose-validation.spec.ts +++ b/tests/specs/pose-validation.spec.ts @@ -2,7 +2,7 @@ import { test, expect } from '@playwright/test'; import path from 'path'; /** - * Pose Spatial Studio - Automated UI Testing + * Computer Vision Demo - Automated UI Testing * * Validates the stream creation UI, pose detection, and 3D rendering. 
* Selectors are aligned with frontend/src/components/Controls.tsx. @@ -18,7 +18,7 @@ test.describe('Pose Capture and Avatar Validation', () => { }); test('should load the application with correct title', async ({ page }) => { - await expect(page).toHaveTitle(/Pose Spatial Studio/i); + await expect(page).toHaveTitle(/Computer Vision Demo/i); }); test('should show connection status', async ({ page }) => {