Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LTX Desktop

LTX Desktop is an open-source desktop app for generating videos with LTX models — locally on supported Windows/Linux NVIDIA GPUs, with an API mode for unsupported hardware and macOS.
LTX Desktop is an open-source desktop app for generating videos with LTX models — locally on supported Windows/Linux NVIDIA GPUs and Apple Silicon Macs, with an API mode for unsupported hardware.

> **Status: Beta.** Expect breaking changes.
> Frontend architecture is under active refactor; large UI PRs may be declined for now (see [`CONTRIBUTING.md`](docs/CONTRIBUTING.md)).
Expand Down Expand Up @@ -34,7 +34,8 @@ LTX Desktop is an open-source desktop app for generating videos with LTX models
| Windows (no CUDA, <16GB VRAM, or unknown VRAM) | API-only | **LTX API key required** |
| Linux + CUDA GPU with **≥16GB VRAM** | Local generation | Downloads model weights locally |
| Linux (no CUDA, <16GB VRAM, or unknown VRAM) | API-only | **LTX API key required** |
| macOS (Apple Silicon builds) | API-only | **LTX API key required** |
| macOS + Apple Silicon with **≥15GB unified memory** | Local generation | Downloads model weights locally |
| macOS + Apple Silicon with <15GB unified memory | API-only | **LTX API key required** |

In API-only mode, available resolutions/durations may be limited to what the API supports.

Expand All @@ -55,9 +56,15 @@ In API-only mode, available resolutions/durations may be limited to what the API
- 16GB+ RAM (32GB recommended)
- Plenty of free disk space for model weights and outputs

### macOS (local generation)

- Apple Silicon (arm64) with **≥15GB unified memory**
- macOS 13+ (Ventura)
- **160GB+ free disk space** (for model weights, Python environment, and outputs)

### macOS (API-only)

- Apple Silicon (arm64)
- Apple Silicon (arm64) with <15GB unified memory
- macOS 13+ (Ventura)
- Stable internet connection

Expand Down Expand Up @@ -91,10 +98,10 @@ Text encoding: to generate videos you must configure text encoding:
The LTX API is used for:

- **Cloud text encoding and prompt enhancement** — **FREE**; text encoding is highly recommended to speed up inference and save memory
- API-based video generations (required on macOS and on unsupported Windows hardware) — paid
- API-based video generations (required on unsupported hardware and low-memory Apple Silicon Macs) — paid
- Retake — paid

An LTX API key is required in API-only mode, but optional on Windows/Linux local mode if you enable the Local Text Encoder.
An LTX API key is required in API-only mode, but optional on Windows/Linux/macOS local mode if you enable the Local Text Encoder.

Generate a FREE API key at the [LTX Console](https://console.ltx.video/). Text encoding is free; video generation API usage is paid. [Read more](https://ltx.io/model/model-blog/ltx-2-better-control-for-real-workflows).

Expand Down
33 changes: 25 additions & 8 deletions backend/handlers/health_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from handlers.pipelines_handler import PipelinesHandler
from logging_policy import log_background_exception
from services.interfaces import GpuInfo
from services.services_utils import get_device_type
from state.app_state_types import AppState, GpuSlot, StartupError, StartupLoading, StartupPending, StartupReady, VideoPipelineState, VideoPipelineWarmth

if TYPE_CHECKING:
Expand Down Expand Up @@ -108,14 +109,30 @@ def default_warmup(self) -> None:
self.set_startup_loading("Loading Fast pipeline", 30)
self._pipelines.load_gpu_pipeline("fast", should_warm=False)

self.set_startup_loading("Warming Fast pipeline", 60)
self._pipelines.warmup_pipeline("fast")
with self._lock:
match self.state.gpu_slot:
case GpuSlot(active_pipeline=VideoPipelineState() as state):
state.warmth = VideoPipelineWarmth.WARM
case _:
pass
if get_device_type(self.config.device) != "mps":
self.set_startup_loading("Warming Fast pipeline", 60)
with self._lock:
match self.state.gpu_slot:
case GpuSlot(active_pipeline=VideoPipelineState() as state):
state.warmth = VideoPipelineWarmth.WARMING
case _:
pass
try:
self._pipelines.warmup_pipeline("fast")
except Exception:
with self._lock:
match self.state.gpu_slot:
case GpuSlot(active_pipeline=VideoPipelineState() as state) if state.warmth == VideoPipelineWarmth.WARMING:
state.warmth = VideoPipelineWarmth.COLD
case _:
pass
raise
with self._lock:
match self.state.gpu_slot:
case GpuSlot(active_pipeline=VideoPipelineState() as state):
state.warmth = VideoPipelineWarmth.WARM
case _:
pass

zit_models_path = resolve_model_path(self.models_dir, self.config.model_download_specs,"zit")
zit_exists = zit_models_path.exists() and any(zit_models_path.iterdir())
Expand Down
22 changes: 21 additions & 1 deletion backend/handlers/pipelines_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import logging
import time
from threading import RLock
from typing import TYPE_CHECKING

Expand Down Expand Up @@ -256,6 +257,16 @@ def load_gpu_pipeline(self, model_type: VideoPipelineModelType, should_warm: boo
case _:
pass

if state is not None and state.warmth == VideoPipelineWarmth.WARMING:
while state.warmth == VideoPipelineWarmth.WARMING:
time.sleep(1.0)
with self._lock:
match self.state.gpu_slot:
case GpuSlot(active_pipeline=VideoPipelineState() as refreshed):
state = refreshed
case _:
state = None

if state is None:
self._evict_gpu_pipeline_for_swap()
state = self._create_video_pipeline(model_type)
Expand Down Expand Up @@ -377,6 +388,15 @@ def load_retake_pipeline(self, *, distilled: bool = True) -> RetakePipelineState
return state

def warmup_pipeline(self, model_type: VideoPipelineModelType) -> None:
state = self.load_gpu_pipeline(model_type, should_warm=False)
with self._lock:
match self.state.gpu_slot:
case GpuSlot(active_pipeline=VideoPipelineState() as existing_state):
state: VideoPipelineState | None = existing_state
case _:
state = None

if state is None or not self._pipeline_matches_model_type(model_type):
state = self.load_gpu_pipeline(model_type, should_warm=False)

warmup_path = self.config.outputs_dir / f"_warmup_{model_type}.mp4"
state.pipeline.warmup(output_path=str(warmup_path))
16 changes: 10 additions & 6 deletions backend/handlers/video_generation_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,8 @@ def get_9_16_size(res: str) -> tuple[int, int]:
seed = self._resolve_seed()

try:
self._pipelines.load_gpu_pipeline("fast", should_warm=False)
self._generation.start_generation(generation_id)

output_path = self.generate_video(
generation_id=generation_id,
prompt=req.prompt,
image=image,
height=height,
Expand All @@ -164,6 +162,7 @@ def get_9_16_size(res: str) -> tuple[int, int]:

def generate_video(
self,
generation_id: str,
prompt: str,
image: Image.Image | None,
height: int,
Expand All @@ -186,12 +185,13 @@ def generate_video(

total_steps = 8

self._generation.update_progress("loading_model", 5, 0, total_steps)
t_load_start = time.perf_counter()
pipeline_state = self._pipelines.load_gpu_pipeline("fast", should_warm=False)
t_load_end = time.perf_counter()
logger.info("[%s] Pipeline load: %.2fs", gen_mode, t_load_end - t_load_start)

self._generation.start_generation(generation_id)
self._generation.update_progress("loading_model", 5, 0, total_steps)
self._generation.update_progress("encoding_text", 10, 0, total_steps)

enhanced_prompt = prompt + self.config.camera_motion_prompts.get(camera_motion, "")
Expand Down Expand Up @@ -224,6 +224,10 @@ def generate_video(
height = round(height / 64) * 64
width = round(width / 64) * 64

def _on_denoising_step(current_step: int, denoising_total: int) -> None:
pct = 15 + int(75 * current_step / denoising_total)
self._generation.update_progress("inference", pct, current_step, denoising_total)

t_inference_start = time.perf_counter()
pipeline_state.pipeline.generate(
prompt=enhanced_prompt,
Expand All @@ -234,6 +238,7 @@ def generate_video(
frame_rate=fps,
images=images,
output_path=str(output_path),
progress_callback=_on_denoising_step,
)
t_inference_end = time.perf_counter()
logger.info("[%s] Inference: %.2fs", gen_mode, t_inference_end - t_inference_start)
Expand Down Expand Up @@ -286,6 +291,7 @@ def _generate_a2v(
try:
a2v_state = self._pipelines.load_a2v_pipeline()
self._generation.start_generation(generation_id)
self._generation.update_progress("loading_model", 5, 0, 11)

enhanced_prompt = req.prompt + self.config.camera_motion_prompts.get(req.cameraMotion, "")
neg = req.negativePrompt if req.negativePrompt else self.config.default_negative_prompt
Expand All @@ -306,8 +312,6 @@ def _generate_a2v(
a2v_enhance = a2v_use_api and a2v_settings.prompt_enhancer_enabled_i2v
else:
a2v_enhance = a2v_use_api and a2v_settings.prompt_enhancer_enabled_t2v

self._generation.update_progress("loading_model", 5, 0, total_steps)
self._generation.update_progress("encoding_text", 10, 0, total_steps)
self._text.prepare_text_encoding(enhanced_prompt, enhance_prompt=a2v_enhance)
self._generation.update_progress("inference", 15, 0, total_steps)
Expand Down
8 changes: 8 additions & 0 deletions backend/ltx2_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@
del _safetensors_loader_fix
import services.patches.safetensors_metadata_fix as _safetensors_metadata_fix # pyright: ignore[reportUnusedImport] # Remove once safetensors supports read-only mmap
del _safetensors_metadata_fix
import services.patches.mps_layer_streaming_fix as _mps_layer_streaming_fix # pyright: ignore[reportUnusedImport] # Remove once ltx-core adds MPS awareness to _LayerStore
del _mps_layer_streaming_fix
import services.patches.mps_gpu_model_fix as _mps_gpu_model_fix # pyright: ignore[reportUnusedImport] # Remove once ltx-pipelines adds MPS awareness to gpu_model
del _mps_gpu_model_fix
import services.patches.mps_vocoder_fix as _mps_vocoder_fix # pyright: ignore[reportUnusedImport] # Remove once ltx-core adds MPS awareness to VocoderWithBWE
del _mps_vocoder_fix
import services.patches.mps_chunked_attention_fix as _mps_chunked_attention_fix # pyright: ignore[reportUnusedImport] # Remove once ltx-core ships a memory-efficient attention path for MPS
del _mps_chunked_attention_fix

from state.app_settings import AppSettings

Expand Down
4 changes: 3 additions & 1 deletion backend/runtime_config/runtime_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
def decide_force_api_generations(system: str, cuda_available: bool, vram_gb: int | None) -> bool:
"""Return whether API-only generation must be forced for this runtime."""
if system == "Darwin":
return True
if vram_gb is None:
return True
return vram_gb < 15

if system in ("Windows", "Linux"):
if not cuda_available:
Expand Down
9 changes: 7 additions & 2 deletions backend/services/a2v_pipeline/ltx_a2v_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from api_types import ImageConditioningInput
from services.ltx_pipeline_common import default_tiling_config, encode_video_output, video_chunks_number
from services.services_utils import AudioOrNone, TilingConfigType, device_supports_fp8
from services.services_utils import AudioOrNone, TilingConfigType, device_supports_fp8, get_device_type


class LTXa2vPipeline:
Expand Down Expand Up @@ -45,6 +45,11 @@ def __init__(
device=device,
quantization=QuantizationPolicy.fp8_cast() if device_supports_fp8(device) else None,
)
# MPS does not support CUDA streams or pin_memory(), so keep prefetch_count
# minimal — 1, near-synchronous layer streaming — rather than None (no
# streaming: loads the full transformer into GPU memory at once, which
# causes OOM on large generations). NOTE(review): this comment previously said
# "0"; the code sets 1 — confirm which value the MPS patch actually requires.
# The mps_layer_streaming_fix patch makes this streaming mode safe on MPS.
self._streaming_prefetch_count: int | None = 1 if get_device_type(device) == "mps" else 2

def _run_inference(
self,
Expand Down Expand Up @@ -74,7 +79,7 @@ def _run_inference(
audio_start_time=audio_start_time,
audio_max_duration=audio_max_duration,
tiling_config=tiling_config,
streaming_prefetch_count=2,
streaming_prefetch_count=self._streaming_prefetch_count,
)

@torch.inference_mode()
Expand Down
90 changes: 76 additions & 14 deletions backend/services/fast_video_pipeline/ltx_fast_video_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,58 @@

from __future__ import annotations

from collections.abc import Iterator
from collections.abc import Callable, Iterator
from contextlib import contextmanager
import os
from typing import Final, cast
from typing import Any, Final, cast

import torch

from api_types import ImageConditioningInput
from services.ltx_pipeline_common import default_tiling_config, encode_video_output, video_chunks_number
from services.services_utils import AudioOrNone, TilingConfigType, device_supports_fp8
from services.services_utils import AudioOrNone, TilingConfigType, device_supports_fp8, get_device_type

# Stage 1: 8 denoising steps, Stage 2: 3 denoising steps.
_STAGE1_STEPS = 8
_STAGE2_STEPS = 3
_TOTAL_DENOISING_STEPS = _STAGE1_STEPS + _STAGE2_STEPS

StepCallback = Callable[[int, int], None] # (current_step, total_steps)


@contextmanager
def _tqdm_progress_interceptor(callback: StepCallback) -> Iterator[None]:
    """Temporarily patch tqdm in ``ltx_pipelines.utils.samplers`` to report progress.

    The denoising loops in samplers.py use tqdm directly with no external
    callback hook. While this context manager is active, tqdm there is replaced
    by a thin wrapper that invokes ``callback(current_step, total_steps)`` after
    each completed iteration, with the step count accumulated across both
    denoising stages. The original tqdm is always restored on exit, even if
    inference raises.

    NOTE(review): the patch mutates module-global state; concurrent generations
    in the same interpreter would see each other's wrapper — assumes one
    generation at a time (confirm against the handler's locking).
    """
    import ltx_pipelines.utils.samplers as _samplers_module

    original_tqdm = _samplers_module.tqdm
    completed_steps = 0  # cumulative over Stage 1 + Stage 2 denoising loops

    class _ProgressTqdm:
        def __init__(self, iterable: Any = None, **kwargs: Any) -> None:
            self._items = list(iterable) if iterable is not None else []
            self._tqdm = original_tqdm(self._items, **kwargs)

        def __iter__(self) -> Iterator[Any]:
            nonlocal completed_steps
            for item in self._tqdm:
                yield item
                completed_steps += 1
                callback(completed_steps, _TOTAL_DENOISING_STEPS)

        def __len__(self) -> int:
            return len(self._items)

        def __getattr__(self, name: str) -> Any:
            # Forward any other tqdm API the samplers might touch
            # (update, close, set_description, ...) to the real instance,
            # so the wrapper never raises AttributeError where tqdm wouldn't.
            return getattr(self._tqdm, name)

    try:
        _samplers_module.tqdm = _ProgressTqdm  # type: ignore[attr-defined]
        yield
    finally:
        _samplers_module.tqdm = original_tqdm  # type: ignore[attr-defined]


class LTXFastVideoPipeline:
Expand Down Expand Up @@ -39,6 +82,11 @@ def __init__(self, checkpoint_path: str, gemma_root: str | None, upsampler_path:
self._upsampler_path = upsampler_path
self._device = device
self._quantization = QuantizationPolicy.fp8_cast() if device_supports_fp8(device) else None
# MPS does not support CUDA streams or pin_memory(), so keep prefetch_count
# minimal — 1, near-synchronous layer streaming — rather than None (no
# streaming: loads the full transformer into GPU memory at once, which
# causes OOM on large generations). NOTE(review): this comment previously said
# "0"; the code sets 1 — confirm which value the MPS patch actually requires.
# The mps_layer_streaming_fix patch makes this streaming mode safe on MPS.
self._streaming_prefetch_count: int | None = 1 if get_device_type(device) == "mps" else 2

self.pipeline = DistilledPipeline(
distilled_checkpoint_path=checkpoint_path,
Expand Down Expand Up @@ -71,7 +119,7 @@ def _run_inference(
frame_rate=frame_rate,
images=[_LtxImageInput(img.path, img.frame_idx, img.strength) for img in images],
tiling_config=tiling_config,
streaming_prefetch_count=2,
streaming_prefetch_count=self._streaming_prefetch_count,
)

@torch.inference_mode()
Expand All @@ -85,18 +133,32 @@ def generate(
frame_rate: float,
images: list[ImageConditioningInput],
output_path: str,
progress_callback: StepCallback | None = None,
) -> None:
tiling_config = default_tiling_config()
video, audio = self._run_inference(
prompt=prompt,
seed=seed,
height=height,
width=width,
num_frames=num_frames,
frame_rate=frame_rate,
images=images,
tiling_config=tiling_config,
)
if progress_callback is not None:
with _tqdm_progress_interceptor(progress_callback):
video, audio = self._run_inference(
prompt=prompt,
seed=seed,
height=height,
width=width,
num_frames=num_frames,
frame_rate=frame_rate,
images=images,
tiling_config=tiling_config,
)
else:
video, audio = self._run_inference(
prompt=prompt,
seed=seed,
height=height,
width=width,
num_frames=num_frames,
frame_rate=frame_rate,
images=images,
tiling_config=tiling_config,
)
chunks = video_chunks_number(num_frames, tiling_config)
encode_video_output(video=video, audio=audio, fps=int(frame_rate), output_path=output_path, video_chunks_number_value=chunks)

Expand Down
Loading