
Commit e3a64e0

feat(diffusers): add Shutdown method to release GPU memory
Add Shutdown method to the diffusers backend that properly releases GPU memory when a model is unloaded. This enables dynamic model reloading with different configurations (e.g., switching LoRA adapters) without restarting the service.

The Shutdown method:

- Releases the pipeline, controlnet, and compel objects
- Clears CUDA cache with torch.cuda.empty_cache()
- Resets state flags (img2vid, txt2vid, ltx2_pipeline)

This works with LocalAI's existing /backend/shutdown API endpoint, which terminates the gRPC process. The explicit cleanup ensures GPU memory is properly released before process termination.

Tested with Qwen-Image (~95GB) on NVIDIA H20 GPUs.
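
For reference, unloading can then be triggered through that endpoint without restarting LocalAI. A minimal sketch, assuming LocalAI is listening on localhost:8080 and that /backend/shutdown accepts a JSON body with a "model" field (the model name below is hypothetical; check your LocalAI version's API docs):

import json
import urllib.request

# Ask LocalAI to shut down the backend serving the named model; this is what
# ultimately invokes the diffusers backend's Shutdown method over gRPC.
req = urllib.request.Request(
    "http://localhost:8080/backend/shutdown",
    data=json.dumps({"model": "qwen-image"}).encode("utf-8"),  # hypothetical model name
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.status, resp.read().decode("utf-8"))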
1 parent 3f48145 commit e3a64e0

1 file changed

Lines changed: 41 additions & 0 deletions

backend/python/diffusers/backend.py

@@ -443,6 +443,47 @@ def _load_pipeline(self, request, modelFile, fromSingleFile, torchType, variant)
     def Health(self, request, context):
         return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
 
+    def Shutdown(self, request, context):
+        """
+        Shutdown and release GPU memory for the loaded model.
+        This allows dynamic model reloading with different configurations (e.g., different LoRA adapters).
+        """
+        try:
+            print("Shutting down diffusers backend...", file=sys.stderr)
+
+            # Release pipeline
+            if hasattr(self, 'pipe') and self.pipe is not None:
+                del self.pipe
+                self.pipe = None
+
+            # Release controlnet
+            if hasattr(self, 'controlnet') and self.controlnet is not None:
+                del self.controlnet
+                self.controlnet = None
+
+            # Release compel
+            if hasattr(self, 'compel') and self.compel is not None:
+                del self.compel
+                self.compel = None
+
+            # Clear CUDA cache to release GPU memory
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+                print("CUDA cache cleared", file=sys.stderr)
+
+            # Reset state flags
+            self.img2vid = False
+            self.txt2vid = False
+            self.ltx2_pipeline = False
+            self.options = {}
+
+            print("Diffusers backend shutdown complete", file=sys.stderr)
+            return backend_pb2.Result(message="Model unloaded successfully", success=True)
+        except Exception as err:
+            print(f"Error during shutdown: {err}", file=sys.stderr)
+            return backend_pb2.Result(success=False, message=f"Shutdown error: {err}")
+
     def LoadModel(self, request, context):
         try:
             print(f"Loading model {request.Model}...", file=sys.stderr)
