From d1b19be6d93c8a23f3a5906fa6dd00b0c1d743d8 Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Sun, 26 Apr 2026 21:35:19 +0800
Subject: [PATCH 01/13] Add Mac metal MLX inference

---
 mlx_infer/__init__.py   |   6 +
 mlx_infer/aggregator.py | 445 +++++++++++++++++++++++++++++++++
 mlx_infer/demo.py       | 253 +++++++++++++++++++
 mlx_infer/heads.py      | 532 ++++++++++++++++++++++++++++++++++++++++
 mlx_infer/layers.py     | 325 ++++++++++++++++++++++++
 mlx_infer/model.py      | 245 ++++++++++++++++++
 mlx_infer/rope.py       | 193 +++++++++++++++
 mlx_infer/weights.py    | 221 +++++++++++++++++
 8 files changed, 2220 insertions(+)
 create mode 100644 mlx_infer/__init__.py
 create mode 100644 mlx_infer/aggregator.py
 create mode 100644 mlx_infer/demo.py
 create mode 100644 mlx_infer/heads.py
 create mode 100644 mlx_infer/layers.py
 create mode 100644 mlx_infer/model.py
 create mode 100644 mlx_infer/rope.py
 create mode 100644 mlx_infer/weights.py

diff --git a/mlx_infer/__init__.py b/mlx_infer/__init__.py
new file mode 100644
index 0000000..6732827
--- /dev/null
+++ b/mlx_infer/__init__.py
@@ -0,0 +1,6 @@
+"""MLX inference package for GCTStream on Apple Silicon."""
+
+from .model import GCTStreamMLX
+from .weights import load_checkpoint
+
+__all__ = ["GCTStreamMLX", "load_checkpoint"]
diff --git a/mlx_infer/aggregator.py b/mlx_infer/aggregator.py
new file mode 100644
index 0000000..2a8a811
--- /dev/null
+++ b/mlx_infer/aggregator.py
@@ -0,0 +1,445 @@
+"""
+MLX Streaming Aggregator.
+
+Mirrors AggregatorBase + AggregatorStream from lingbot_map/aggregator/.
+Architecture:
+  DINOv2 ViT-L backbone (ViTMLX)
+  → frame blocks  (TransformerBlock + 2D RoPE, per-frame)
+  → global blocks (StreamingBlock + KV cache, cross-frame causal)
+
+Special tokens: camera [1] + register [4] + scale [1] = patch_start_idx = 6
+Selected output indices (block groups): [4, 11, 17, 23]
+Output list shape: [B, S, P, 2C] per element (frame + global concatenated).
+"""
+
+from typing import Optional, List, Tuple, Dict, Any
+import numpy as np
+import mlx.core as mx
+import mlx.nn as nn
+
+from .layers import LayerNorm, TransformerBlock, StreamingBlock
+from .rope import RotaryEmbedding2D, PositionGetter2D
+
+# ImageNet normalisation constants (same as PyTorch side)
+_RESNET_MEAN = [0.485, 0.456, 0.406]
+_RESNET_STD  = [0.229, 0.224, 0.225]
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def slice_expand_and_flatten(token: mx.array, B: int, S: int,
+                              first_num_frame: int = 1) -> mx.array:
+    """
+    Expand a [1, 2, N, C] special-token parameter to [B*S, N, C].
+
+    The first variant (index 0) is used for the first `first_num_frame` frames;
+    the second variant (index 1) is used for the remaining frames.
+    """
+    N, C = token.shape[2], token.shape[3]
+    if first_num_frame > 1:
+        t_first = mx.broadcast_to(token[:, :1], (B, first_num_frame, N, C))
+        t_rest  = mx.broadcast_to(token[:, 1:], (B, S - first_num_frame, N, C))
+    else:
+        t_first = mx.broadcast_to(token[:, :1], (B, 1, N, C))
+        t_rest  = mx.broadcast_to(token[:, 1:], (B, S - 1, N, C))
+    return mx.concatenate([t_first, t_rest], axis=1).reshape(B * S, N, C)
+
+
+# ---------------------------------------------------------------------------
+# DINOv2 ViT-L backbone
+# ---------------------------------------------------------------------------
+
+class ViTMLX(nn.Module):
+    """DINOv2 ViT-L patch-embedding backbone (channel-last).
+
+    Accepts images in NHWC format [B, H, W, 3].
+    Returns x_norm_patchtokens: [B, N_patch, embed_dim].
+
+    Weight loading note: PyTorch stores the patch projection weight as
+    Conv2d [O, I, kH, kW]; MLX expects [O, kH, kW, I].  The weights.py
+    conversion handles this transposition automatically.
+    """
+
+    def __init__(
+        self,
+        img_size: int = 518,
+        patch_size: int = 14,
+        embed_dim: int = 1024,
+        depth: int = 24,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        num_register_tokens: int = 4,
+        init_values: float = 1.0,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.num_register_tokens = num_register_tokens
+        num_patches = (img_size // patch_size) ** 2
+
+        # Patch projection (channel-last Conv2d)
+        self.patch_embed = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+        # Learnable tokens and positional embedding
+        # pos_embed covers CLS slot + all patch positions (registers have no pos_embed)
+        self.cls_token      = mx.zeros((1, 1, embed_dim))
+        self.register_tokens = mx.zeros((1, num_register_tokens, embed_dim))
+        self.pos_embed      = mx.zeros((1, num_patches + 1, embed_dim))  # +1 for CLS
+
+        # Transformer blocks (standard ViT, no RoPE — uses absolute pos_embed)
+        # DINOv2 ViT-L uses qk_norm=False in its backbone blocks (unlike the
+        # aggregator's own frame/global blocks which do use qk_norm=True).
+        self.blocks = [
+            TransformerBlock(
+                embed_dim, num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=True,
+                proj_bias=True,
+                ffn_bias=True,
+                qk_norm=False,
+                init_values=init_values,
+            )
+            for _ in range(depth)
+        ]
+
+        self.norm = LayerNorm(embed_dim)
+
+    def _interpolate_pos_embed(self, h: int, w: int) -> mx.array:
+        """Interpolate patch positional embeddings to (h, w) patch grid.
+
+        Matches DINOv2's interpolate_pos_encoding: bicubic resize of the MxM
+        learnt grid to h×w using scipy (CPU, cached per shape).
+        """
+        M = int(round(np.sqrt(self.pos_embed.shape[1] - 1)))
+        if h == M and w == M:
+            return self.pos_embed
+        key = (h, w)
+        if not hasattr(self, "_pos_embed_cache"):
+            self._pos_embed_cache = {}
+        if key not in self._pos_embed_cache:
+            from scipy.ndimage import zoom
+            patch_pe = np.array(self.pos_embed[0, 1:])      # [M*M, C]
+            patch_pe = patch_pe.reshape(M, M, -1)            # [M, M, C]
+            scale_h = h / M
+            scale_w = w / M
+            patch_pe = zoom(patch_pe, (scale_h, scale_w, 1), order=3)  # bicubic
+            patch_pe = patch_pe.reshape(1, h * w, -1)        # [1, h*w, C]
+            cls_pe = np.array(self.pos_embed[0:1, :1])       # [1, 1, C]
+            full_pe = np.concatenate([cls_pe, patch_pe], axis=1)
+            self._pos_embed_cache[key] = mx.array(full_pe.astype(np.float32))
+        return self._pos_embed_cache[key]
+
+    def __call__(self, x: mx.array) -> mx.array:
+        """x: [B, H, W, 3] → patch tokens [B, N_patch, embed_dim]."""
+        B = x.shape[0]
+        C = x.shape[-1]  # 3
+
+        # Patch embed: Conv2d [B, H, W, 3] → [B, H', W', embed_dim] → [B, N, embed_dim]
+        tokens = self.patch_embed(x)
+        h, w = tokens.shape[1], tokens.shape[2]
+        tokens = tokens.reshape(B, h * w, -1)                  # [B, N_patch, C]
+
+        # Add patch positional embeddings (interpolating if needed for non-square input)
+        pos_embed = self._interpolate_pos_embed(h, w)
+        tokens = tokens + pos_embed[:, 1:]
+
+        # Prepend CLS token (with its positional embedding)
+        cls = mx.broadcast_to(self.cls_token, (B, 1, tokens.shape[-1]))
+        cls = cls + pos_embed[:, :1]
+
+        # Insert register tokens (no positional embedding)
+        regs = mx.broadcast_to(self.register_tokens, (B, self.num_register_tokens, tokens.shape[-1]))
+
+        # Layout: [CLS, registers, patches]
+        tokens = mx.concatenate([cls, regs, tokens], axis=1)   # [B, 1+R+N, C]
+
+        for block in self.blocks:
+            tokens = block(tokens)
+
+        tokens = self.norm(tokens)
+
+        # Return only patch tokens (skip CLS + register prefix)
+        return tokens[:, 1 + self.num_register_tokens:]        # [B, N_patch, C]
+
+
+# ---------------------------------------------------------------------------
+# Streaming Aggregator
+# ---------------------------------------------------------------------------
+
+class AggregatorMLX(nn.Module):
+    """Streaming causal aggregator for GCTStream (MLX version).
+
+    Architecture mirrors AggregatorStream (use_sdpa backend):
+      - ViTMLX backbone → patch tokens
+      - Special tokens (camera + register + scale) prepended
+      - aa_block_num groups of [frame_block, global_block]
+      - Global blocks share a per-block KV cache for causal streaming
+
+    Parameters
+    ----------
+    img_size, patch_size, embed_dim, depth, num_heads:
+        Match the PyTorch checkpoint (ViT-L: 518, 14, 1024, 24, 16).
+    aa_block_size:
+        Number of frame/global blocks per alternating-attention group (default 1).
+    num_register_tokens:
+        DINOv2 register tokens (default 4).
+    kv_cache_sliding_window, kv_cache_scale_frames:
+        KV cache eviction policy (match PyTorch defaults: 64, 8).
+    """
+
+    def __init__(
+        self,
+        img_size: int = 518,
+        patch_size: int = 14,
+        embed_dim: int = 1024,
+        depth: int = 24,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        num_register_tokens: int = 4,
+        aa_block_size: int = 1,
+        rope_freq: float = 100.0,
+        qkv_bias: bool = True,
+        proj_bias: bool = True,
+        ffn_bias: bool = True,
+        qk_norm: bool = True,
+        init_values: float = 0.01,
+        kv_cache_sliding_window: int = 64,
+        kv_cache_scale_frames: int = 8,
+        kv_cache_keep_special: bool = True,
+    ):
+        super().__init__()
+        assert depth % aa_block_size == 0
+        self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.depth = depth
+        self.aa_block_size = aa_block_size
+        self.aa_block_num = depth // aa_block_size
+        self.num_register_tokens = num_register_tokens
+
+        # Image normalisation buffers (not trainable)
+        self._mean = mx.array(_RESNET_MEAN, dtype=mx.float32).reshape(1, 1, 1, 3)
+        self._std  = mx.array(_RESNET_STD,  dtype=mx.float32).reshape(1, 1, 1, 3)
+
+        # DINOv2 ViT-L backbone
+        self.patch_embed = ViTMLX(
+            img_size=img_size, patch_size=patch_size, embed_dim=embed_dim,
+            depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
+            num_register_tokens=num_register_tokens,
+        )
+
+        # 2D RoPE (used in both frame and global blocks)
+        self.rope = RotaryEmbedding2D(base=rope_freq)
+        self.position_getter = PositionGetter2D()
+
+        block_kw = dict(
+            dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias, proj_bias=proj_bias, ffn_bias=ffn_bias,
+            qk_norm=qk_norm, init_values=init_values,
+        )
+
+        # Frame blocks: standard TransformerBlock (per-frame, no KV cache)
+        self.frame_blocks = [TransformerBlock(**block_kw) for _ in range(depth)]
+
+        # Global blocks: StreamingBlock with causal KV cache
+        self.global_blocks = [
+            StreamingBlock(
+                **block_kw,
+                sliding_window=kv_cache_sliding_window,
+                scale_frames=kv_cache_scale_frames,
+                keep_special=kv_cache_keep_special,
+            )
+            for _ in range(depth)
+        ]
+
+        # Special tokens: camera [1, 2, 1, C], register [1, 2, R, C], scale [1, 2, 1, C]
+        # shape [1, 2, N, C]: dim-1 indexes first-frame vs rest-of-frames variant
+        self.camera_token   = mx.zeros((1, 2, 1, embed_dim))
+        self.register_token = mx.zeros((1, 2, num_register_tokens, embed_dim))
+        self.scale_token    = mx.ones((1, 2, 1, embed_dim))
+
+        # patch_start_idx: camera(1) + register(num_register_tokens) + scale(1)
+        self.patch_start_idx = 1 + num_register_tokens + 1
+        self.num_special_tokens = self.patch_start_idx
+
+        # KV cache: one dict per global block, initialised lazily
+        self._kv_cache: Optional[List[Dict[str, Any]]] = None
+        self.total_frames_processed = 0
+
+    # ------------------------------------------------------------------
+    # KV cache management
+    # ------------------------------------------------------------------
+
+    def _init_kv_cache(self):
+        """Create fresh per-block KV cache dicts."""
+        self._kv_cache = [
+            {"k": None, "v": None, "k_special": None, "v_special": None,
+             "_skip_append": False}
+            for _ in range(self.depth)
+        ]
+        self.total_frames_processed = 0
+
+    def clean_kv_cache(self):
+        self._init_kv_cache()
+
+    def set_skip_append(self, skip: bool):
+        if self._kv_cache is not None:
+            for d in self._kv_cache:
+                d["_skip_append"] = skip
+
+    # ------------------------------------------------------------------
+    # Position embeddings
+    # ------------------------------------------------------------------
+
+    def _get_positions(self, B: int, S: int, H: int, W: int) -> mx.array:
+        """2D patch positions [B*S, P, 2] with offset=1 for special tokens."""
+        pph = H // self.patch_size
+        ppw = W // self.patch_size
+        pos = self.position_getter(B * S, pph, ppw)  # [B*S, N_patch, 2]
+        pos = pos + 1                                 # patches start at position index 1
+        # Special tokens sit at position (0, 0)
+        pos_special = mx.zeros((B * S, self.num_special_tokens, 2), dtype=mx.int32)
+        return mx.concatenate([pos_special, pos], axis=1)  # [B*S, P, 2]
+
+    # ------------------------------------------------------------------
+    # Special token preparation
+    # ------------------------------------------------------------------
+
+    def _prepare_special_tokens(
+        self, B: int, S: int, C: int, scale_frames: int,
+        in_streaming: bool = False, S_cached: int = 0,
+    ) -> mx.array:
+        """
+        Build [B*S, num_special_tokens, C] special token tensor.
+
+        In streaming mode (S_cached > 0) we expand to the full historical
+        length and slice the last S rows to match PyTorch behaviour.
+        """
+        S_true = S_cached + S if in_streaming else S
+        eff_scale = min(scale_frames, S_true)
+
+        if in_streaming and S_cached > 0:
+            cam   = slice_expand_and_flatten(self.camera_token,   B, S_true)[-S:]
+            reg   = slice_expand_and_flatten(self.register_token, B, S_true)[-S:]
+            scale = slice_expand_and_flatten(self.scale_token, B, S_true,
+                                              first_num_frame=eff_scale)[-S:]
+        else:
+            cam   = slice_expand_and_flatten(self.camera_token,   B, S)
+            reg   = slice_expand_and_flatten(self.register_token, B, S)
+            scale = slice_expand_and_flatten(self.scale_token, B, S,
+                                              first_num_frame=eff_scale)
+
+        return mx.concatenate([cam, reg, scale], axis=1)  # [B*S, N_sp, C]
+
+    # ------------------------------------------------------------------
+    # Forward
+    # ------------------------------------------------------------------
+
+    def __call__(
+        self,
+        images: mx.array,
+        selected_idx: Optional[List[int]] = None,
+        num_frame_for_scale: Optional[int] = None,
+        num_frame_per_block: int = 1,
+    ) -> Tuple[List[mx.array], int]:
+        """
+        Parameters
+        ----------
+        images: [B, S, 3, H, W] in [0, 1] range (PyTorch channel-first convention).
+        selected_idx: block group indices to include in the output list (None = all).
+        num_frame_for_scale: frames treated as scale frames (affects scale_token).
+        num_frame_per_block: frames processed as one block (1 for streaming).
+
+        Returns
+        -------
+        (output_list, patch_start_idx)
+        output_list: List of [B, S, P, 2*embed_dim] tensors.
+        """
+        B, S, _, H, W = images.shape
+        scale_frames = num_frame_for_scale if num_frame_for_scale is not None else 1
+
+        # Determine streaming state
+        in_streaming = (self._kv_cache is not None and
+                        self._kv_cache[0]["k"] is not None)
+        S_cached = 0
+        if in_streaming:
+            cached_k = self._kv_cache[0]["k"]  # [B, H, n_frames, T, D]
+            S_cached = cached_k.shape[2] if cached_k is not None else 0
+
+        # ---- Normalise images ----
+        # images: [B, S, 3, H, W] (channel-first)
+        # transpose to NHWC: [B*S, H, W, 3]
+        imgs = images.reshape(B * S, 3, H, W)
+        imgs = imgs.transpose(0, 2, 3, 1)                   # [B*S, H, W, 3]
+        mean = mx.array(_RESNET_MEAN, dtype=imgs.dtype).reshape(1, 1, 1, 3)
+        std  = mx.array(_RESNET_STD,  dtype=imgs.dtype).reshape(1, 1, 1, 3)
+        imgs = (imgs - mean) / std
+
+        # ---- DINOv2 patch embedding ----
+        patch_tokens = self.patch_embed(imgs)                # [B*S, N_patch, C]
+        C = patch_tokens.shape[-1]
+
+        # ---- Special tokens ----
+        special = self._prepare_special_tokens(
+            B, S, C, scale_frames,
+            in_streaming=in_streaming, S_cached=S_cached,
+        )                                                    # [B*S, N_sp, C]
+        tokens = mx.concatenate([special, patch_tokens], axis=1)  # [B*S, P, C]
+        P = tokens.shape[1]
+
+        # ---- 2D RoPE positions ----
+        pos = self._get_positions(B, S, H, W)                # [B*S, P, 2]
+        head_dim = self.embed_dim // self.num_heads          # 64 for ViT-L
+        cos, sin = self.rope.get_cos_sin(
+            None, pos, head_dim=head_dim)                    # [B*S, 1, P, 64]
+
+        # ---- Alternating frame / global attention ----
+        output_list: List[mx.array] = []
+        frame_idx = 0
+        global_idx = 0
+
+        # Reshape for global blocks: [B, S*P, C]
+        tokens_global = tokens.reshape(B, S * P, C)
+        # Reshape cos/sin for global attention: [B, 1, S*P, D]
+        cos_g = cos.reshape(B, 1, S * P, cos.shape[-1])
+        sin_g = sin.reshape(B, 1, S * P, sin.shape[-1])
+
+        for group in range(self.aa_block_num):
+            frame_outs = []
+            global_outs = []
+
+            # -- Frame attention (aa_block_size blocks, per-frame) --
+            for _ in range(self.aa_block_size):
+                # tokens as [B*S, P, C]; cos/sin [B*S, 1, P, D]
+                tokens_flat = tokens_global.reshape(B * S, P, C)
+                tokens_flat = self.frame_blocks[frame_idx](
+                    tokens_flat, rope_cos=cos, rope_sin=sin)
+                tokens_global = tokens_flat.reshape(B, S * P, C)
+                frame_outs.append(tokens_global.reshape(B, S, P, C))
+                frame_idx += 1
+
+            # -- Global (causal cross-frame) attention --
+            for _ in range(self.aa_block_size):
+                kv = self._kv_cache[global_idx] if self._kv_cache is not None else None
+                tokens_global = self.global_blocks[global_idx](
+                    tokens_global,
+                    kv_cache=kv,
+                    num_frame_per_block=num_frame_per_block,
+                    rope_cos=cos_g,
+                    rope_sin=sin_g,
+                    patch_start_idx=self.patch_start_idx,
+                )
+                global_outs.append(tokens_global.reshape(B, S, P, C))
+                global_idx += 1
+
+            # Collect output for this group
+            if selected_idx is None or group in selected_idx:
+                for fi, gi in zip(frame_outs, global_outs):
+                    output_list.append(mx.concatenate([fi, gi], axis=-1))  # [B, S, P, 2C]
+
+        # Update frame counter (only on keyframe path, skip_append=False)
+        if self._kv_cache is not None and not self._kv_cache[0].get("_skip_append", False):
+            self.total_frames_processed += S
+
+        return output_list, self.patch_start_idx
diff --git a/mlx_infer/demo.py b/mlx_infer/demo.py
new file mode 100644
index 0000000..0fc2f3c
--- /dev/null
+++ b/mlx_infer/demo.py
@@ -0,0 +1,253 @@
+"""
+MLX inference demo for GCTStreamMLX.
+
+Usage:
+    python mlx_infer/demo.py --checkpoint model.pt --images /path/to/images --output out.npz
+
+The script mirrors demo.py but uses the MLX model for GPU-accelerated inference
+on Apple Silicon (unified memory, no Metal OOM accumulation).
+"""
+
+import argparse
+import os
+import sys
+import glob
+import time
+from pathlib import Path
+
+import numpy as np
+import mlx.core as mx
+
+# Make parent directory importable when run as script
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from mlx_infer.model import GCTStreamMLX
+from mlx_infer.weights import load_checkpoint
+
+
+# ---------------------------------------------------------------------------
+# Image loading helpers
+# ---------------------------------------------------------------------------
+
+def _load_images_from_dir(image_dir: str, img_size: int = 518,
+                          patch_size: int = 14) -> np.ndarray:
+    """Load images from a directory using the same crop preprocessing as demo.py.
+
+    Matches PyTorch demo's load_and_preprocess_images(mode="crop"):
+    - Resize so width = img_size, maintaining aspect ratio
+    - Center-crop height to img_size if taller than img_size
+    - Round height to nearest patch_size multiple
+
+    Returns [S, 3, H, W] float32 in [0, 1].
+    """
+    from PIL import Image
+
+    paths = sorted(
+        glob.glob(os.path.join(image_dir, "*.jpg"))
+        + glob.glob(os.path.join(image_dir, "*.png"))
+    )
+    if not paths:
+        raise ValueError(f"No jpg/png images found in {image_dir}")
+
+    frames = []
+    for p in paths:
+        img = Image.open(p).convert("RGB")
+        w, h = img.size
+        new_w = img_size
+        new_h = round(h * (new_w / w) / patch_size) * patch_size
+        img = img.resize((new_w, new_h), Image.BICUBIC)
+        if new_h > img_size:
+            start_y = (new_h - img_size) // 2
+            arr = np.array(img, dtype=np.float32)[start_y:start_y + img_size]
+        else:
+            arr = np.array(img, dtype=np.float32)
+        frames.append(arr / 255.0)
+
+    arr = np.stack(frames, axis=0)          # [S, H, W, 3]
+    arr = arr.transpose(0, 3, 1, 2)         # [S, 3, H, W]
+    return arr
+
+
+# ---------------------------------------------------------------------------
+# Post-processing: MLX outputs → pred_dict for PointCloudViewer
+# ---------------------------------------------------------------------------
+
+def postprocess(predictions: dict, image_hw: tuple) -> dict:
+    """Convert MLX inference outputs to the pred_dict format expected by PointCloudViewer.
+
+    Parameters
+    ----------
+    predictions : dict
+        Raw output from inference_streaming. Tensors have shape [B, S, ...].
+        B=1 is squeezed away here.
+    image_hw : tuple
+        (H, W) of the input images (needed for FoV → intrinsics conversion).
+
+    Returns
+    -------
+    pred_dict : dict with numpy arrays ready for PointCloudViewer.
+    """
+    import torch
+    from lingbot_map.utils.pose_enc import pose_encoding_to_extri_intri
+    from lingbot_map.utils.geometry import closed_form_inverse_se3_general
+
+    H, W = image_hw
+
+    # Convert MLX → numpy, squeeze batch dim (B=1)
+    def to_np(x):
+        return np.asarray(x.astype(mx.float32))[0]   # [B, S, ...] → [S, ...]
+
+    pose_enc_np = to_np(predictions["pose_enc"])       # [S, 9]
+    images_np   = to_np(predictions["images"])         # [S, 3, H, W]
+
+    # pose_enc → w2c extrinsics + intrinsics (via torch)
+    pose_t = torch.from_numpy(pose_enc_np).float().unsqueeze(0)   # [1, S, 9]
+    extrinsic_t, intrinsic_t = pose_encoding_to_extri_intri(pose_t, (H, W))
+    # extrinsic_t: [1, S, 3, 4] w2c  →  convert to c2w
+    ext4 = torch.zeros(*extrinsic_t.shape[:-2], 4, 4)
+    ext4[..., :3, :4] = extrinsic_t
+    ext4[..., 3, 3] = 1.0
+    ext4_c2w = closed_form_inverse_se3_general(ext4)             # [1, S, 4, 4] c2w
+    extrinsic_np = ext4_c2w[0, :, :3, :4].numpy()               # [S, 3, 4] c2w
+    intrinsic_np = intrinsic_t[0].numpy()                        # [S, 3, 3]
+
+    pred_dict = {
+        "images":    images_np,      # [S, 3, H, W]
+        "extrinsic": extrinsic_np,   # [S, 3, 4] c2w
+        "intrinsic": intrinsic_np,   # [S, 3, 3]
+    }
+
+    if "depth" in predictions:
+        pred_dict["depth"]      = to_np(predictions["depth"])       # [S, H, W, 1]
+        pred_dict["depth_conf"] = to_np(predictions["depth_conf"])  # [S, H, W]
+
+    if "world_points" in predictions:
+        pred_dict["world_points"]      = to_np(predictions["world_points"])       # [S, H, W, 3]
+        pred_dict["world_points_conf"] = to_np(predictions["world_points_conf"])  # [S, H, W]
+
+    return pred_dict
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    parser = argparse.ArgumentParser(description="GCTStream MLX inference demo")
+    parser.add_argument("--checkpoint", required=True,
+                        help="Path to PyTorch .pt checkpoint")
+    parser.add_argument("--images", required=True,
+                        help="Directory of input images (sorted alphabetically)")
+    parser.add_argument("--output", default="mlx_output.npz",
+                        help="Output file (.npz)")
+    parser.add_argument("--img-size", type=int, default=518,
+                        help="Resize images to this square size")
+    parser.add_argument("--scale-frames", type=int, default=8,
+                        help="Number of initial scale frames (Phase 1)")
+    parser.add_argument("--keyframe-interval", type=int, default=1,
+                        help="Keyframe interval (1 = every frame)")
+    parser.add_argument("--dtype", choices=["float32", "float16"], default="float32",
+                        help="Compute dtype (float16 is faster but less stable)")
+    parser.add_argument("--max-frames", type=int, default=None,
+                        help="Only process first N frames (default: all)")
+    # Visualization
+    parser.add_argument("--no-vis", action="store_true",
+                        help="Skip 3D visualization server (just save .npz)")
+    parser.add_argument("--port", type=int, default=8080,
+                        help="Viser visualization server port")
+    parser.add_argument("--conf-threshold", type=float, default=1.5,
+                        help="Confidence threshold for point cloud filtering")
+    parser.add_argument("--downsample-factor", type=int, default=10,
+                        help="Point cloud downsample factor")
+    parser.add_argument("--point-size", type=float, default=0.00001,
+                        help="Initial point size in the 3D viewer")
+    args = parser.parse_args()
+
+    # ---- Build model ----
+    print("Building GCTStreamMLX model...")
+    model = GCTStreamMLX(
+        img_size=args.img_size,
+        patch_size=14,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        num_register_tokens=4,
+        kv_cache_sliding_window=64,
+        kv_cache_scale_frames=args.scale_frames,
+        camera_num_iterations=4,
+        enable_depth=True,
+        enable_point=False,
+    )
+
+    # ---- Load weights ----
+    print(f"Loading checkpoint: {args.checkpoint}")
+    load_checkpoint(model, args.checkpoint, verbose=True)
+
+    # ---- Load images ----
+    print(f"Loading images from: {args.images}")
+    images_np = _load_images_from_dir(args.images, img_size=args.img_size)
+    print(f"Loaded {images_np.shape[0]} frames at {images_np.shape[2]}×{images_np.shape[3]}")
+
+    if args.max_frames is not None:
+        images_np = images_np[:args.max_frames]
+        print(f"Trimmed to {images_np.shape[0]} frames")
+
+    images = mx.array(images_np)   # [S, 3, H, W]
+
+    if args.dtype == "float16":
+        images = images.astype(mx.float16)
+        model.apply(lambda x: x.astype(mx.float16) if mx.is_array(x) else x)
+
+    # ---- Streaming inference ----
+    print("Running streaming inference...")
+    t0 = time.perf_counter()
+
+    predictions = model.inference_streaming(
+        images,
+        num_scale_frames=args.scale_frames,
+        keyframe_interval=args.keyframe_interval,
+    )
+
+    mx.eval(predictions)
+    elapsed = time.perf_counter() - t0
+    S = images_np.shape[0]
+    print(f"Inference done: {S} frames in {elapsed:.1f}s  ({S/elapsed:.1f} fps)")
+
+    # ---- Save outputs ----
+    out = {}
+    for k, v in predictions.items():
+        if k == "images":
+            continue
+        out[k] = np.asarray(v)
+    np.savez(args.output, **out)
+    print(f"Saved predictions to {args.output}")
+    for k, v in out.items():
+        print(f"  {k}: {v.shape} {v.dtype}")
+
+    # ---- Visualization ----
+    if args.no_vis:
+        return
+
+    try:
+        from lingbot_map.vis import PointCloudViewer
+    except ImportError:
+        print("viser not installed. Install with: pip install lingbot-map[vis]")
+        return
+
+    print("Post-processing for visualization...")
+    H, W = images_np.shape[2], images_np.shape[3]
+    pred_dict = postprocess(predictions, (H, W))
+
+    print(f"Launching 3D viewer at http://localhost:{args.port}")
+    viewer = PointCloudViewer(
+        pred_dict=pred_dict,
+        port=args.port,
+        vis_threshold=args.conf_threshold,
+        downsample_factor=args.downsample_factor,
+        point_size=args.point_size,
+    )
+    viewer.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mlx_infer/heads.py b/mlx_infer/heads.py
new file mode 100644
index 0000000..9e7f849
--- /dev/null
+++ b/mlx_infer/heads.py
@@ -0,0 +1,532 @@
+"""
+MLX prediction heads.
+
+CameraHeadMLX — iterative pose refinement with KV cache.
+DPTHeadMLX    — dense prediction (depth / world points) via DPT architecture.
+
+Both mirror their PyTorch equivalents in lingbot_map/heads/.
+"""
+
+from typing import Optional, List, Tuple, Dict, Any
+import math
+import numpy as np
+import mlx.core as mx
+import mlx.nn as nn
+
+from .layers import LayerNorm, MLP, StreamingBlock
+
+
+# ---------------------------------------------------------------------------
+# Activations (mirrors head_act.py)
+# ---------------------------------------------------------------------------
+
+def activate_pose(pred: mx.array,
+                  trans_act: str = "linear",
+                  quat_act: str = "linear",
+                  fl_act: str = "relu") -> mx.array:
+    T    = _base_act(pred[..., :3], trans_act)
+    quat = _base_act(pred[..., 3:7], quat_act)
+    fl   = _base_act(pred[..., 7:],  fl_act)
+    return mx.concatenate([T, quat, fl], axis=-1)
+
+
+def _base_act(x: mx.array, act_type: str) -> mx.array:
+    if act_type == "linear":
+        return x
+    elif act_type == "inv_log":
+        return mx.sign(x) * (mx.expm1(mx.abs(x)))
+    elif act_type == "exp":
+        return mx.exp(x)
+    elif act_type == "relu":
+        return mx.maximum(x, 0)
+    raise ValueError(f"Unknown act_type: {act_type}")
+
+
+def activate_head(out: mx.array,
+                  activation: str = "inv_log",
+                  conf_activation: str = "expp1") -> Tuple[mx.array, mx.array]:
+    """out: [B, H, W, C] (channel-last) → (pts3d, conf)."""
+    # out layout: [..., :-1] are xyz-like values, [..., -1] is confidence logit
+    xyz  = out[..., :-1]
+    conf = out[..., -1]
+
+    if activation == "inv_log":
+        pts3d = mx.sign(xyz) * mx.expm1(mx.abs(xyz))
+    elif activation == "exp":
+        pts3d = mx.exp(xyz)
+    elif activation == "norm_exp":
+        d = mx.sqrt(mx.sum(xyz * xyz, axis=-1, keepdims=True)).clip(1e-8)
+        pts3d = (xyz / d) * mx.expm1(d)
+    elif activation == "relu":
+        pts3d = mx.maximum(xyz, 0)
+    elif activation == "linear":
+        pts3d = xyz
+    else:
+        raise ValueError(f"Unknown activation: {activation}")
+
+    if conf_activation == "expp1":
+        conf_out = 1 + mx.exp(conf)
+    elif conf_activation == "expp0":
+        conf_out = mx.exp(conf)
+    elif conf_activation == "sigmoid":
+        conf_out = mx.sigmoid(conf)
+    else:
+        raise ValueError(f"Unknown conf_activation: {conf_activation}")
+
+    return pts3d, conf_out
+
+
+# ---------------------------------------------------------------------------
+# Camera head
+# ---------------------------------------------------------------------------
+
+def _modulate(x: mx.array, shift: mx.array, scale: mx.array) -> mx.array:
+    return x * (1 + scale) + shift
+
+
+class CameraHeadMLX(nn.Module):
+    """Iterative camera pose refinement head (causal streaming version).
+
+    Corresponds to CameraCausalHead in lingbot_map/heads/camera_head.py.
+
+    The trunk is a list of StreamingBlock layers so the same KV-cache
+    eviction logic from the aggregator global blocks applies here too.
+    """
+
+    def __init__(
+        self,
+        dim_in: int = 2048,
+        trunk_depth: int = 4,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        init_values: float = 0.01,
+        num_iterations: int = 4,
+        kv_cache_sliding_window: int = 64,
+        kv_cache_scale_frames: int = 8,
+        kv_cache_keep_special: bool = True,
+    ):
+        super().__init__()
+        self.dim_in = dim_in
+        self.trunk_depth = trunk_depth
+        self.num_iterations = num_iterations
+        self.target_dim = 9   # absT_quaR_FoV
+
+        # Trunk: causal transformer blocks (same eviction policy as aggregator)
+        self.trunk = [
+            StreamingBlock(
+                dim_in, num_heads,
+                mlp_ratio=mlp_ratio,
+                init_values=init_values,
+                sliding_window=kv_cache_sliding_window,
+                scale_frames=kv_cache_scale_frames,
+                keep_special=kv_cache_keep_special,
+            )
+            for _ in range(trunk_depth)
+        ]
+
+        self.token_norm = LayerNorm(dim_in)
+        self.trunk_norm = LayerNorm(dim_in)
+
+        # Learnable empty pose token
+        self.empty_pose_tokens = mx.zeros((1, 1, self.target_dim))
+        self.embed_pose = nn.Linear(self.target_dim, dim_in, bias=True)
+
+        # AdaLN modulation: SiLU → Linear(dim_in, 3*dim_in)
+        self.poseLN_silu   = nn.SiLU()
+        self.poseLN_linear = nn.Linear(dim_in, 3 * dim_in, bias=True)
+
+        # AdaLN norm (no affine params — elementwise_affine=False)
+        self.adaln_norm = LayerNorm(dim_in, affine=False)
+
+        # Output branch: MLP(dim_in → dim_in//2 → target_dim)
+        self.pose_branch = MLP(dim_in, dim_in // 2, self.target_dim, bias=True)
+
+        # KV cache (list of dicts, one per trunk block, per iteration)
+        # Layout: kv_cache[iteration][block_key] = tensor
+        self.kv_cache: Optional[List[List[Dict[str, Any]]]] = None
+        self.frame_idx = 0
+
+    # ------------------------------------------------------------------
+
+    def clean_kv_cache(self):
+        self.kv_cache = None
+        self.frame_idx = 0
+
+    def set_skip_append(self, skip: bool):
+        if self.kv_cache is not None:
+            for iter_cache in self.kv_cache:
+                for d in iter_cache:
+                    d["_skip_append"] = skip
+
+    def _ensure_kv_cache(self):
+        if self.kv_cache is None:
+            self.kv_cache = [
+                [
+                    {"k": None, "v": None, "k_special": None, "v_special": None,
+                     "_skip_append": False}
+                    for _ in range(self.trunk_depth)
+                ]
+                for _ in range(self.num_iterations)
+            ]
+
+    # ------------------------------------------------------------------
+
+    def __call__(
+        self,
+        aggregated_tokens_list: List[mx.array],
+        causal_inference: bool = False,
+        num_iterations: Optional[int] = None,
+        num_frame_per_block: int = 1,
+        num_frame_for_scale: int = -1,
+    ) -> List[mx.array]:
+        """
+        aggregated_tokens_list: List of [B, S, P, 2C] tensors.
+        Returns: list of [B, S, 9] pose encodings, one per iteration.
+        """
+        if num_iterations is None:
+            num_iterations = self.num_iterations
+
+        if causal_inference:
+            self._ensure_kv_cache()
+
+        # Camera token is at index 0 of the token sequence
+        tokens = aggregated_tokens_list[-1]   # [B, S, P, 2C]
+        pose_tokens = tokens[:, :, 0, :]      # [B, S, 2C]
+        pose_tokens = self.token_norm(pose_tokens)
+
+        B, S, C = pose_tokens.shape
+        pred_pose_enc = None
+        pred_pose_enc_list: List[mx.array] = []
+
+        for i in range(num_iterations):
+            # Build module_input from current pose estimate
+            if pred_pose_enc is None:
+                module_input = self.embed_pose(
+                    mx.broadcast_to(self.empty_pose_tokens, (B, S, self.target_dim))
+                )
+            else:
+                module_input = self.embed_pose(mx.stop_gradient(pred_pose_enc))
+
+            # AdaLN modulation
+            mod = self.poseLN_linear(self.poseLN_silu(module_input))  # [B, S, 3C]
+            shift_msa = mod[..., :C]
+            scale_msa = mod[..., C:2*C]
+            gate_msa  = mod[..., 2*C:]
+
+            modulated = gate_msa * _modulate(self.adaln_norm(pose_tokens), shift_msa, scale_msa)
+            modulated = modulated + pose_tokens   # residual
+
+            # Apply trunk blocks
+            for j, block in enumerate(self.trunk):
+                kv = self.kv_cache[i][j] if causal_inference else None
+                modulated = block(
+                    modulated,
+                    kv_cache=kv,
+                    num_frame_per_block=num_frame_per_block,
+                )
+
+            # Pose delta
+            delta = self.pose_branch(self.trunk_norm(modulated))   # [B, S, 9]
+            if pred_pose_enc is None:
+                pred_pose_enc = delta
+            else:
+                pred_pose_enc = pred_pose_enc + delta
+
+            pred_pose_enc_list.append(
+                activate_pose(pred_pose_enc)
+            )
+
+        # Advance frame counter for streaming
+        if causal_inference:
+            self.frame_idx += S
+
+        return pred_pose_enc_list
+
+
+# ---------------------------------------------------------------------------
+# DPT head helpers
+# ---------------------------------------------------------------------------
+
+class _ResidualConvUnit(nn.Module):
+    """Two conv layers with residual connection (mirrors ResidualConvUnit).
+
+    PyTorch uses ReLU(inplace=True), which modifies the input in-place before
+    the residual add, so the effective formula is:
+        out = conv2(relu(conv1(relu(x)))) + relu(x)
+    not conv2(...) + x.  We replicate that here.
+    """
+
+    def __init__(self, features: int):
+        super().__init__()
+        self.conv1 = nn.Conv2d(features, features, kernel_size=3, padding=1, bias=True)
+        self.conv2 = nn.Conv2d(features, features, kernel_size=3, padding=1, bias=True)
+
+    def __call__(self, x: mx.array) -> mx.array:
+        residual = nn.relu(x)          # mirrors in-place relu that overwrites x
+        out = self.conv1(residual)
+        out = nn.relu(out)
+        out = self.conv2(out)
+        return out + residual          # residual is relu(x), not x
+
+
+class _FeatureFusionBlock(nn.Module):
+    """Fuses two feature maps: residual unit + bilinear upsample + 1×1 conv."""
+
+    def __init__(self, features: int, has_residual: bool = True):
+        super().__init__()
+        self.has_residual = has_residual
+        if has_residual:
+            self.resConfUnit1 = _ResidualConvUnit(features)
+        self.resConfUnit2 = _ResidualConvUnit(features)
+        self.out_conv = nn.Conv2d(features, features, kernel_size=1, bias=True)
+
+    def __call__(self, x: mx.array,
+                 skip: Optional[mx.array] = None,
+                 target_hw: Optional[Tuple[int, int]] = None) -> mx.array:
+        if self.has_residual and skip is not None:
+            x = x + self.resConfUnit1(skip)
+        x = self.resConfUnit2(x)
+        # Bilinear upsample
+        if target_hw is not None:
+            x = _bilinear(x, target_hw)
+        else:
+            x = _bilinear(x, (x.shape[1] * 2, x.shape[2] * 2))
+        x = self.out_conv(x)
+        return x
+
+
+def _bilinear(x: mx.array, hw: Tuple[int, int], align_corners: bool = True) -> mx.array:
+    """Bilinear resize [B, H, W, C] → [B, h, w, C]."""
+    B, H, W, C = x.shape
+    h, w = hw
+    if (h, w) == (H, W):
+        return x
+
+    if align_corners:
+        y_src = mx.arange(h, dtype=mx.float32) * ((H - 1) / max(h - 1, 1))
+        x_src = mx.arange(w, dtype=mx.float32) * ((W - 1) / max(w - 1, 1))
+    else:
+        y_src = (mx.arange(h, dtype=mx.float32) + 0.5) * (H / h) - 0.5
+        x_src = (mx.arange(w, dtype=mx.float32) + 0.5) * (W / w) - 0.5
+
+    y0 = mx.clip(mx.floor(y_src).astype(mx.int32), 0, H - 1)
+    y1 = mx.clip(y0 + 1, 0, H - 1)
+    x0 = mx.clip(mx.floor(x_src).astype(mx.int32), 0, W - 1)
+    x1 = mx.clip(x0 + 1, 0, W - 1)
+
+    wy1 = (y_src - mx.floor(y_src)).reshape(1, h, 1, 1)
+    wx1 = (x_src - mx.floor(x_src)).reshape(1, 1, w, 1)
+    wy0 = 1.0 - wy1
+    wx0 = 1.0 - wx1
+
+    q00 = x[:, y0, :, :][:, :, x0, :]   # [B, h, w, C]
+    q01 = x[:, y0, :, :][:, :, x1, :]
+    q10 = x[:, y1, :, :][:, :, x0, :]
+    q11 = x[:, y1, :, :][:, :, x1, :]
+
+    return q00 * wy0 * wx0 + q01 * wy0 * wx1 + q10 * wy1 * wx0 + q11 * wy1 * wx1
+
+
+# ---------------------------------------------------------------------------
+# DPT head
+# ---------------------------------------------------------------------------
+
+class DPTHeadMLX(nn.Module):
+    """Dense Prediction Transformer head (channel-last for MLX).
+
+    Mirrors DPTHead from lingbot_map/heads/dpt_head.py.
+
+    Key differences from the PyTorch version:
+    - All Conv2d operate on NHWC tensors (channel last).
+    - ConvTranspose2d in resize_layers[0] and [1] are replaced by bilinear
+      upsample + Conv2d (MLX ConvTranspose2d is available but bilinear is faster).
+    - Positional embeddings are computed as sinusoidal grids on-the-fly.
+
+    Parameters
+    ----------
+    dim_in : int
+        Input token dimension (2*embed_dim = 2048 for ViT-L).
+    patch_size : int
+        Patch size (14).
+    output_dim : int
+        Output channels: 2 for depth (value+conf), 4 for points (xyz+conf).
+    activation, conf_activation : str
+        Activation types passed to activate_head.
+    features : int
+        DPT fusion feature channels (256).
+    out_channels : list
+        Per-layer projection channels (default [256, 512, 1024, 1024]).
+    """
+
+    def __init__(
+        self,
+        dim_in: int = 2048,
+        patch_size: int = 14,
+        output_dim: int = 4,
+        activation: str = "inv_log",
+        conf_activation: str = "expp1",
+        features: int = 256,
+        out_channels: List[int] = None,
+    ):
+        super().__init__()
+        if out_channels is None:
+            out_channels = [256, 512, 1024, 1024]
+        self.patch_size = patch_size
+        self.activation = activation
+        self.conf_activation = conf_activation
+        self.out_channels = out_channels
+
+        self.norm = LayerNorm(dim_in)
+
+        # Token-to-spatial projection: one 1×1 Conv2d per DPT level
+        self.projects = [
+            nn.Conv2d(dim_in, oc, kernel_size=1)
+            for oc in out_channels
+        ]
+
+        # Resize layers matching PyTorch DPTHead.resize_layers exactly:
+        #   [0] ConvTranspose2d 4× upsample
+        #   [1] ConvTranspose2d 2× upsample
+        #   [2] identity (no parameters)
+        #   [3] Conv2d stride-2 downsample
+        # Weights remapped from checkpoint keys resize_layers.{0,1,3}.*
+        self.resize_conv0 = nn.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4)
+        self.resize_conv1 = nn.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2)
+        self.resize_conv3 = nn.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1)
+
+        # Scratch-level readout convolutions (checkpoint keys: scratch.layer{1-4}_rn.weight)
+        self.layer1_rn = nn.Conv2d(out_channels[0], features, kernel_size=3, padding=1, bias=False)
+        self.layer2_rn = nn.Conv2d(out_channels[1], features, kernel_size=3, padding=1, bias=False)
+        self.layer3_rn = nn.Conv2d(out_channels[2], features, kernel_size=3, padding=1, bias=False)
+        self.layer4_rn = nn.Conv2d(out_channels[3], features, kernel_size=3, padding=1, bias=False)
+
+        # Fusion blocks (checkpoint keys: scratch.refinenet{1-4}.*)
+        self.refinenet4 = _FeatureFusionBlock(features, has_residual=False)
+        self.refinenet3 = _FeatureFusionBlock(features, has_residual=True)
+        self.refinenet2 = _FeatureFusionBlock(features, has_residual=True)
+        self.refinenet1 = _FeatureFusionBlock(features, has_residual=True)
+
+        # Output convolutions (checkpoint keys: scratch.output_conv1/2.0/2.*)
+        self.output_conv1  = nn.Conv2d(features, features // 2, kernel_size=3, padding=1)
+        self.output_conv2a = nn.Conv2d(features // 2, 32, kernel_size=3, padding=1)
+        self.output_conv2b = nn.Conv2d(32, output_dim, kernel_size=1)
+
+        # Pos-embed cache
+        self._pos_embed_cache: Dict[tuple, mx.array] = {}
+
+    # ------------------------------------------------------------------
+
+    def __call__(
+        self,
+        aggregated_tokens_list: List[mx.array],
+        images: mx.array,
+        patch_start_idx: int,
+    ) -> Tuple[mx.array, mx.array]:
+        """
+        aggregated_tokens_list: List of [B, S, P, 2C].
+        images: [B, S, 3, H, W] (PyTorch channel-first convention).
+        Returns: (preds, conf) each [B, S, H, W, C-1] and [B, S, H, W].
+        """
+        B, _, _, H, W = images.shape
+        S = aggregated_tokens_list[0].shape[1]
+        patch_h = H // self.patch_size
+        patch_w = W // self.patch_size
+
+        out_feats = []
+        for level, layer_idx in enumerate([0, 1, 2, 3]):
+            x = aggregated_tokens_list[layer_idx][:, :, patch_start_idx:]  # [B, S, N_patch, 2C]
+            x = x.reshape(B * S, patch_h * patch_w, x.shape[-1])
+            x = self.norm(x)
+
+            # Reshape to spatial grid (channel-last)
+            x = x.reshape(B * S, patch_h, patch_w, x.shape[-1])   # [B*S, H', W', 2C]
+
+            # 1×1 projection
+            x = self.projects[level](x)                              # [B*S, H', W', oc]
+
+            # Optional positional embedding
+            x = self._apply_pos_embed(x, W, H)
+
+            # Resize
+            if level == 0:
+                x = self.resize_conv0(x)   # ConvTranspose2d: 4× upsample
+            elif level == 1:
+                x = self.resize_conv1(x)   # ConvTranspose2d: 2× upsample
+            elif level == 2:
+                pass  # identity
+            elif level == 3:
+                x = self.resize_conv3(x)   # Conv2d stride 2: 2× downsample
+
+            out_feats.append(x)
+
+        # Scratch fusion
+        l1 = self.layer1_rn(out_feats[0])
+        l2 = self.layer2_rn(out_feats[1])
+        l3 = self.layer3_rn(out_feats[2])
+        l4 = self.layer4_rn(out_feats[3])
+
+        out = self.refinenet4(l4, target_hw=(l3.shape[1], l3.shape[2]))
+        out = self.refinenet3(out, skip=l3, target_hw=(l2.shape[1], l2.shape[2]))
+        out = self.refinenet2(out, skip=l2, target_hw=(l1.shape[1], l1.shape[2]))
+        out = self.refinenet1(out, skip=l1)
+
+        # Output head
+        out = self.output_conv1(out)             # [B*S, H', W', features//2]
+        # Final upsample to full patch resolution
+        out = _bilinear(out, (patch_h * self.patch_size, patch_w * self.patch_size))
+        out = self._apply_pos_embed(out, W, H)
+
+        out = nn.relu(self.output_conv2a(out))   # [B*S, H, W, 32]
+        out = self.output_conv2b(out)             # [B*S, H, W, output_dim]
+
+        preds, conf = activate_head(out, self.activation, self.conf_activation)
+
+        # Reshape back to [B, S, H, W, ...]
+        preds = preds.reshape(B, S, *preds.shape[1:])
+        conf  = conf.reshape(B, S, *conf.shape[1:])
+        return preds, conf
+
+    def _apply_pos_embed(self, x: mx.array, W: int, H: int, ratio: float = 0.1) -> mx.array:
+        """Sinusoidal UV positional embedding matching PyTorch DPTHead._apply_pos_embed.
+
+        Uses create_uv_grid + position_grid_to_embed logic from lingbot_map/heads/utils.py.
+        x: [B, ph, pw, C] (channel-last).
+        """
+        ph, pw, C = x.shape[1], x.shape[2], x.shape[3]
+        key = (pw, ph, W / H, C)
+        if key not in self._pos_embed_cache:
+            aspect = W / H
+            diag = (aspect ** 2 + 1.0) ** 0.5
+            span_x = aspect / diag
+            span_y = 1.0 / diag
+
+            # Bounds matching create_uv_grid (align_corners-like: (N-1)/N scaling)
+            lx = -span_x * (pw - 1) / pw
+            rx =  span_x * (pw - 1) / pw
+            ty = -span_y * (ph - 1) / ph
+            by =  span_y * (ph - 1) / ph
+
+            # meshgrid xy-indexing → uu[i,j]=x[j], vv[i,j]=y[i], shape [ph, pw]
+            x_c = np.linspace(lx, rx, pw, dtype=np.float32)
+            y_c = np.linspace(ty, by, ph, dtype=np.float32)
+            uu, vv = np.meshgrid(x_c, y_c)            # [ph, pw]
+            uv = np.stack([uu, vv], axis=-1)          # [ph, pw, 2]
+            pos = uv.reshape(-1, 2)                   # [ph*pw, 2]
+
+            # position_grid_to_embed: half C for x, half for y; omega_0=100
+            def _sincos(coords: np.ndarray, dim: int) -> np.ndarray:
+                """[N] → [N, dim] sincos embedding."""
+                omega = np.arange(dim // 2, dtype=np.float32) / (dim / 2.0)
+                omega = 1.0 / (100.0 ** omega)           # [dim//2]
+                out = np.outer(coords, omega)             # [N, dim//2]
+                return np.concatenate([np.sin(out), np.cos(out)], axis=-1)
+
+            half = C // 2
+            emb_x = _sincos(pos[:, 0], half)             # [ph*pw, C//2]
+            emb_y = _sincos(pos[:, 1], half)             # [ph*pw, C//2]
+            emb = np.concatenate([emb_x, emb_y], axis=-1)  # [ph*pw, C]
+            emb = emb.reshape(ph, pw, C).astype(np.float32) * ratio
+            self._pos_embed_cache[key] = mx.array(emb[None])  # [1, ph, pw, C]
+
+        emb = self._pos_embed_cache[key]
+        return x + mx.broadcast_to(emb, x.shape)
diff --git a/mlx_infer/layers.py b/mlx_infer/layers.py
new file mode 100644
index 0000000..0c1f37d
--- /dev/null
+++ b/mlx_infer/layers.py
@@ -0,0 +1,325 @@
+"""
+Core MLX layers: Linear, LayerNorm, MLP, Attention, Transformer Block.
+
+MLX notes vs PyTorch:
+- mx.array is the tensor type; created lazily (no data until evaluated)
+- mlx.nn.Linear has weight shape [out, in] like PyTorch, but no transpose in matmul needed
+- mlx.core.matmul broadcasts differently; prefer nn.Linear for weight contractions
+- mx.fast.scaled_dot_product_attention is available and supports causal masking
+"""
+
+import math
+from typing import Optional, List, Tuple, Dict, Any
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, dims: int, eps: float = 1e-6, affine: bool = True):
+        super().__init__()
+        self.dims = dims
+        self.eps = eps
+        if affine:
+            self.weight = mx.ones((dims,))
+            self.bias = mx.zeros((dims,))
+        else:
+            self.weight = None
+            self.bias = None
+
+    def __call__(self, x: mx.array) -> mx.array:
+        return mx.fast.layer_norm(x, self.weight, self.bias, self.eps)
+
+
+class Linear(nn.Linear):
+    """Thin wrapper so we can load PyTorch weight dicts by name."""
+    pass
+
+
+class MLP(nn.Module):
+    def __init__(self, in_features: int, hidden_features: int, out_features: int,
+                 bias: bool = True, drop: float = 0.0):
+        super().__init__()
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+
+    def __call__(self, x: mx.array) -> mx.array:
+        return self.fc2(nn.gelu(self.fc1(x)))
+
+
+class LayerScale(nn.Module):
+    def __init__(self, dim: int, init_values: float = 1e-5):
+        super().__init__()
+        self.gamma = mx.full((dim,), init_values)
+
+    def __call__(self, x: mx.array) -> mx.array:
+        return x * self.gamma
+
+
+def rotate_half(x: mx.array) -> mx.array:
+    """Rotate the last dimension by splitting and negating halves."""
+    d = x.shape[-1] // 2
+    x1, x2 = x[..., :d], x[..., d:]
+    return mx.concatenate([-x2, x1], axis=-1)
+
+
+def _rope_1d(x: mx.array, cos: mx.array, sin: mx.array) -> mx.array:
+    """Standard 1D RoPE on a D-dim vector: x * cos + rotate_half(x) * sin."""
+    d = x.shape[-1] // 2
+    x1, x2 = x[..., :d], x[..., d:]
+    return mx.concatenate([x1 * cos[..., :d] - x2 * sin[..., :d],
+                            x2 * cos[..., d:] + x1 * sin[..., d:]], axis=-1)
+
+
+def apply_rope_2d(q: mx.array, k: mx.array, cos: mx.array, sin: mx.array) -> Tuple[mx.array, mx.array]:
+    """Apply 2D RoPE to q and k. cos/sin shape: [B, 1, T, D].
+
+    The first D//2 dims get 1D RoPE with y-positions; the last D//2 dims get
+    1D RoPE with x-positions, each applied independently (matching PyTorch's
+    RotaryPositionEmbedding2D which processes vertical/horizontal halves separately).
+    """
+    h = q.shape[-1] // 2
+    cos_y, cos_x = cos[..., :h], cos[..., h:]
+    sin_y, sin_x = sin[..., :h], sin[..., h:]
+    q_rot = mx.concatenate([_rope_1d(q[..., :h], cos_y, sin_y),
+                             _rope_1d(q[..., h:], cos_x, sin_x)], axis=-1)
+    k_rot = mx.concatenate([_rope_1d(k[..., :h], cos_y, sin_y),
+                             _rope_1d(k[..., h:], cos_x, sin_x)], axis=-1)
+    return q_rot, k_rot
+
+
+def apply_rotary_emb(x: mx.array, freqs: mx.array) -> mx.array:
+    """Apply real-valued rotary position embedding (matches rope.apply_rotary_emb).
+
+    x:     [B, H, T, D]
+    freqs: [1, 1, T, D//2]  (real angles)
+    """
+    # Split x into pairs and apply rotation
+    x_f32 = x.astype(mx.float32)
+    x_r = x_f32[..., 0::2]   # [B, H, T, D//2]
+    x_i = x_f32[..., 1::2]
+    cos = mx.cos(freqs)       # [1, 1, T, D//2]
+    sin = mx.sin(freqs)
+    out_r = x_r * cos - x_i * sin
+    out_i = x_r * sin + x_i * cos
+    # Interleave back
+    out = mx.stack([out_r, out_i], axis=-1)  # [B, H, T, D//2, 2]
+    out = out.reshape(*x.shape[:-1], x.shape[-1])
+    return out.astype(x.dtype)
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with optional 2D RoPE and QK-norm."""
+
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = True,
+                 proj_bias: bool = True, qk_norm: bool = False):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.q_norm = LayerNorm(self.head_dim) if qk_norm else nn.Identity()
+        self.k_norm = LayerNorm(self.head_dim) if qk_norm else nn.Identity()
+
+    def __call__(self, x: mx.array, rope_cos: Optional[mx.array] = None,
+                 rope_sin: Optional[mx.array] = None) -> mx.array:
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
+        qkv = qkv.transpose(0, 2, 3, 1, 4)   # [B, 3, H, N, D]  -- mlx transpose
+        q, k, v = qkv[:, 0], qkv[:, 1], qkv[:, 2]  # each [B, H, N, D]
+
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+
+        if rope_cos is not None:
+            q, k = apply_rope_2d(q, k, rope_cos, rope_sin)
+
+        # mlx SDPA: [B, H, N, D]
+        x = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale)
+        x = x.transpose(0, 2, 1, 3).reshape(B, N, C)
+        return self.proj(x)
+
+
+class TransformerBlock(nn.Module):
+    """Standard pre-norm transformer block."""
+
+    def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 4.0,
+                 qkv_bias: bool = True, proj_bias: bool = True, ffn_bias: bool = True,
+                 qk_norm: bool = False, init_values: Optional[float] = None):
+        super().__init__()
+        self.norm1 = LayerNorm(dim)
+        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias,
+                              proj_bias=proj_bias, qk_norm=qk_norm)
+        self.ls1 = LayerScale(dim, init_values) if init_values else nn.Identity()
+        self.norm2 = LayerNorm(dim)
+        self.mlp = MLP(dim, int(dim * mlp_ratio), dim, bias=ffn_bias)
+        self.ls2 = LayerScale(dim, init_values) if init_values else nn.Identity()
+
+    def __call__(self, x: mx.array,
+                 rope_cos: Optional[mx.array] = None,
+                 rope_sin: Optional[mx.array] = None) -> mx.array:
+        x = x + self.ls1(self.attn(self.norm1(x), rope_cos=rope_cos, rope_sin=rope_sin))
+        x = x + self.ls2(self.mlp(self.norm2(x)))
+        return x
+
+
+# ---------------------------------------------------------------------------
+# KV-cache attention for streaming (causal) global blocks
+# ---------------------------------------------------------------------------
+
+class CausalAttentionMLX(nn.Module):
+    """Self-attention with a simple list-based KV cache for streaming inference.
+
+    The KV cache stores (k, v) pairs with shape [B, H, T_cached, D].
+    Each new frame appends its K/V; eviction keeps scale_frames + sliding_window.
+    """
+
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = True,
+                 proj_bias: bool = True, qk_norm: bool = False,
+                 sliding_window: int = 64, scale_frames: int = 8,
+                 keep_special: bool = True):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.sliding_window = sliding_window
+        self.scale_frames = scale_frames
+        self.keep_special = keep_special
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.q_norm = LayerNorm(self.head_dim) if qk_norm else nn.Identity()
+        self.k_norm = LayerNorm(self.head_dim) if qk_norm else nn.Identity()
+
+    def __call__(self, x: mx.array,
+                 kv_cache: Optional[Dict[str, Any]] = None,
+                 num_frame_per_block: int = 1,
+                 rope_freqs: Optional[mx.array] = None,
+                 rope_cos: Optional[mx.array] = None,
+                 rope_sin: Optional[mx.array] = None,
+                 patch_start_idx: int = 6) -> mx.array:
+        B, N, C = x.shape
+        tokens_per_frame = N // num_frame_per_block
+
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
+        qkv = qkv.transpose(0, 2, 3, 1, 4)  # [B, 3, H, N, D]
+        q, k, v = qkv[:, 0], qkv[:, 1], qkv[:, 2]
+
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+
+        if rope_cos is not None:
+            q, k = apply_rope_2d(q, k, rope_cos, rope_sin)
+        elif rope_freqs is not None:
+            q = apply_rotary_emb(q, rope_freqs)
+            k = apply_rotary_emb(k, rope_freqs)
+
+        if kv_cache is None:
+            # Batch mode (Phase 1 scale frames) — simple causal-within-batch attention
+            x_out = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale)
+        else:
+            # Streaming mode: append new K/V then attend to full cache
+            k_new = k.reshape(B, self.num_heads, num_frame_per_block,
+                               tokens_per_frame, self.head_dim)
+            v_new = v.reshape(B, self.num_heads, num_frame_per_block,
+                               tokens_per_frame, self.head_dim)
+
+            skip = kv_cache.get("_skip_append", False)
+
+            if kv_cache.get("k") is None:
+                if not skip:
+                    kv_cache["k"] = k_new
+                    kv_cache["v"] = v_new
+                k_full = k_new.reshape(B, self.num_heads, -1, self.head_dim)
+                v_full = v_new.reshape(B, self.num_heads, -1, self.head_dim)
+            else:
+                if not skip:
+                    k_cat = mx.concatenate([kv_cache["k"], k_new], axis=2)
+                    v_cat = mx.concatenate([kv_cache["v"], v_new], axis=2)
+                    # Evict: keep scale_frames + last sliding_window frames
+                    n_frames = k_cat.shape[2]
+                    total_keep = self.scale_frames + self.sliding_window
+                    if n_frames > total_keep:
+                        # Preserve special tokens from evicted frames
+                        if self.keep_special:
+                            evict_k = k_cat[:, :, self.scale_frames:n_frames - self.sliding_window, :patch_start_idx, :]
+                            evict_v = v_cat[:, :, self.scale_frames:n_frames - self.sliding_window, :patch_start_idx, :]
+                            if kv_cache.get("k_special") is None:
+                                kv_cache["k_special"] = evict_k
+                                kv_cache["v_special"] = evict_v
+                            else:
+                                kv_cache["k_special"] = mx.concatenate([kv_cache["k_special"], evict_k], axis=2)
+                                kv_cache["v_special"] = mx.concatenate([kv_cache["v_special"], evict_v], axis=2)
+                        kv_cache["k"] = mx.concatenate([
+                            k_cat[:, :, :self.scale_frames],
+                            k_cat[:, :, -self.sliding_window:],
+                        ], axis=2)
+                        kv_cache["v"] = mx.concatenate([
+                            v_cat[:, :, :self.scale_frames],
+                            v_cat[:, :, -self.sliding_window:],
+                        ], axis=2)
+                    else:
+                        kv_cache["k"] = k_cat
+                        kv_cache["v"] = v_cat
+
+                k_cached = kv_cache["k"]
+                v_cached = kv_cache["v"]
+                if skip:
+                    # Non-keyframe: attend to cache + current but don't store
+                    k_cached = mx.concatenate([k_cached, k_new], axis=2)
+                    v_cached = mx.concatenate([v_cached, v_new], axis=2)
+
+                k_full = k_cached.reshape(B, self.num_heads, -1, self.head_dim)
+                v_full = v_cached.reshape(B, self.num_heads, -1, self.head_dim)
+
+                # Prepend preserved special tokens
+                if kv_cache.get("k_special") is not None:
+                    ks = kv_cache["k_special"]
+                    vs = kv_cache["v_special"]
+                    sa, sb, sc, sd, se = ks.shape
+                    ks = ks.reshape(sa, sb, sc * sd, se)
+                    vs = vs.reshape(sa, sb, sc * sd, se)
+                    k_full = mx.concatenate([ks, k_full], axis=2)
+                    v_full = mx.concatenate([vs, v_full], axis=2)
+
+            x_out = mx.fast.scaled_dot_product_attention(q, k_full, v_full, scale=self.scale)
+
+        x_out = x_out.transpose(0, 2, 1, 3).reshape(B, N, C)
+        return self.proj(x_out)
+
+
+class StreamingBlock(nn.Module):
+    """Global (cross-frame) transformer block for streaming with KV cache."""
+
+    def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 4.0,
+                 qkv_bias: bool = True, proj_bias: bool = True, ffn_bias: bool = True,
+                 qk_norm: bool = False, init_values: Optional[float] = None,
+                 sliding_window: int = 64, scale_frames: int = 8,
+                 keep_special: bool = True):
+        super().__init__()
+        self.norm1 = LayerNorm(dim)
+        self.attn = CausalAttentionMLX(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias,
+            qk_norm=qk_norm, sliding_window=sliding_window,
+            scale_frames=scale_frames, keep_special=keep_special,
+        )
+        self.ls1 = LayerScale(dim, init_values) if init_values else nn.Identity()
+        self.norm2 = LayerNorm(dim)
+        self.mlp = MLP(dim, int(dim * mlp_ratio), dim, bias=ffn_bias)
+        self.ls2 = LayerScale(dim, init_values) if init_values else nn.Identity()
+
+    def __call__(self, x: mx.array,
+                 kv_cache: Optional[Dict[str, Any]] = None,
+                 num_frame_per_block: int = 1,
+                 rope_freqs: Optional[mx.array] = None,
+                 rope_cos: Optional[mx.array] = None,
+                 rope_sin: Optional[mx.array] = None,
+                 patch_start_idx: int = 6) -> mx.array:
+        attn_out = self.attn(self.norm1(x), kv_cache=kv_cache,
+                             num_frame_per_block=num_frame_per_block,
+                             rope_freqs=rope_freqs, rope_cos=rope_cos, rope_sin=rope_sin,
+                             patch_start_idx=patch_start_idx)
+        x = x + self.ls1(attn_out)
+        x = x + self.ls2(self.mlp(self.norm2(x)))
+        return x
diff --git a/mlx_infer/model.py b/mlx_infer/model.py
new file mode 100644
index 0000000..97bafca
--- /dev/null
+++ b/mlx_infer/model.py
@@ -0,0 +1,245 @@
+"""
+GCTStreamMLX — full streaming model in MLX.
+
+Mirrors GCTStream from lingbot_map/models/gct_stream.py:
+  - AggregatorMLX backbone (DINOv2 + frame/global blocks)
+  - CameraHeadMLX (pose, 4-iteration refinement)
+  - DPTHeadMLX    (depth, world_points)
+  - inference_streaming: Phase 1 (scale frames) + Phase 2 (frame-by-frame KV cache)
+"""
+
+from typing import Optional, Dict, List, Any
+import numpy as np
+import mlx.core as mx
+import mlx.nn as nn
+from tqdm.auto import tqdm
+
+from .aggregator import AggregatorMLX
+from .heads import CameraHeadMLX, DPTHeadMLX
+
+
+class GCTStreamMLX(nn.Module):
+    """MLX streaming GCT model.
+
+    Parameters
+    ----------
+    img_size, patch_size, embed_dim : int
+        ViT-L defaults: 518, 14, 1024.
+    kv_cache_sliding_window : int
+        Sliding window for KV cache eviction (default 64 frames).
+    kv_cache_scale_frames : int
+        Number of scale frames kept in KV cache (default 8).
+    camera_num_iterations : int
+        Refinement iterations in camera head (default 4).
+    enable_depth, enable_point : bool
+        Which dense heads to build.
+    """
+
+    def __init__(
+        self,
+        img_size: int = 518,
+        patch_size: int = 14,
+        embed_dim: int = 1024,
+        depth: int = 24,
+        num_heads: int = 16,
+        num_register_tokens: int = 4,
+        kv_cache_sliding_window: int = 64,
+        kv_cache_scale_frames: int = 8,
+        kv_cache_keep_special: bool = True,
+        camera_num_iterations: int = 4,
+        enable_depth: bool = True,
+        enable_point: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.patch_size = patch_size
+        dim_2c = 2 * embed_dim   # concatenated frame+global output dim
+
+        self.aggregator = AggregatorMLX(
+            img_size=img_size,
+            patch_size=patch_size,
+            embed_dim=embed_dim,
+            depth=depth,
+            num_heads=num_heads,
+            num_register_tokens=num_register_tokens,
+            kv_cache_sliding_window=kv_cache_sliding_window,
+            kv_cache_scale_frames=kv_cache_scale_frames,
+            kv_cache_keep_special=kv_cache_keep_special,
+        )
+
+        self.camera_head = CameraHeadMLX(
+            dim_in=dim_2c,
+            trunk_depth=4,
+            num_heads=num_heads,
+            num_iterations=camera_num_iterations,
+            kv_cache_sliding_window=kv_cache_sliding_window,
+            kv_cache_scale_frames=kv_cache_scale_frames,
+            kv_cache_keep_special=kv_cache_keep_special,
+        )
+
+        self.depth_head = DPTHeadMLX(
+            dim_in=dim_2c, patch_size=patch_size, output_dim=2,
+            activation="exp", conf_activation="expp1",
+        ) if enable_depth else None
+
+        self.point_head = DPTHeadMLX(
+            dim_in=dim_2c, patch_size=patch_size, output_dim=4,
+            activation="inv_log", conf_activation="expp1",
+        ) if enable_point else None
+
+    # ------------------------------------------------------------------
+    # KV cache management
+    # ------------------------------------------------------------------
+
+    def clean_kv_cache(self):
+        self.aggregator.clean_kv_cache()
+        self.camera_head.clean_kv_cache()
+
+    def _set_skip_append(self, skip: bool):
+        self.aggregator.set_skip_append(skip)
+        self.camera_head.set_skip_append(skip)
+
+    # ------------------------------------------------------------------
+    # Single forward pass
+    # ------------------------------------------------------------------
+
+    def __call__(
+        self,
+        images: mx.array,
+        num_frame_for_scale: Optional[int] = None,
+        num_frame_per_block: int = 1,
+        causal_inference: bool = False,
+    ) -> Dict[str, mx.array]:
+        """
+        images: [B, S, 3, H, W] in [0, 1].
+        Returns dict with 'pose_enc', optionally 'depth', 'depth_conf',
+        'world_points', 'world_points_conf'.
+        """
+        if images.ndim == 4:
+            images = images[None]   # add batch dim
+
+        # Aggregate features
+        agg_list, patch_start_idx = self.aggregator(
+            images,
+            selected_idx=[4, 11, 17, 23],
+            num_frame_for_scale=num_frame_for_scale,
+            num_frame_per_block=num_frame_per_block,
+        )
+        mx.eval(agg_list)   # materialise before heads
+
+        predictions: Dict[str, mx.array] = {}
+
+        # Camera
+        pose_list = self.camera_head(
+            agg_list,
+            causal_inference=causal_inference,
+            num_iterations=None,
+            num_frame_per_block=num_frame_per_block,
+            num_frame_for_scale=num_frame_for_scale if num_frame_for_scale is not None else -1,
+        )
+        predictions["pose_enc"] = pose_list[-1]
+
+        # Depth
+        if self.depth_head is not None:
+            depth, depth_conf = self.depth_head(agg_list, images, patch_start_idx)
+            predictions["depth"] = depth
+            predictions["depth_conf"] = depth_conf
+
+        # World points
+        if self.point_head is not None:
+            pts3d, pts3d_conf = self.point_head(agg_list, images, patch_start_idx)
+            predictions["world_points"] = pts3d
+            predictions["world_points_conf"] = pts3d_conf
+
+        return predictions
+
+    # ------------------------------------------------------------------
+    # Streaming inference
+    # ------------------------------------------------------------------
+
+    def inference_streaming(
+        self,
+        images: mx.array,
+        num_scale_frames: Optional[int] = None,
+        keyframe_interval: int = 1,
+    ) -> Dict[str, mx.array]:
+        """
+        Streaming inference: process scale frames first, then frame-by-frame.
+
+        Parameters
+        ----------
+        images : mx.array
+            [S, 3, H, W] or [B, S, 3, H, W] in [0, 1].
+        num_scale_frames : int, optional
+            Initial bidirectional frames (default: aggregator patch_start_idx).
+        keyframe_interval : int
+            Every N-th frame after scale phase is a keyframe (KV stored).
+            1 = every frame (default, original behaviour).
+
+        Returns
+        -------
+        dict with keys: pose_enc, depth, depth_conf, world_points,
+                        world_points_conf, images.
+        """
+        if images.ndim == 4:
+            images = images[None]   # [1, S, 3, H, W]
+        B, S, _, H, W = images.shape
+
+        scale_frames = num_scale_frames if num_scale_frames is not None else 1
+        scale_frames = min(scale_frames, S)
+
+        # Clean caches before new sequence
+        self.clean_kv_cache()
+
+        # ------ Phase 1: scale frames (bidirectional via scale_token) ------
+        scale_out = self(
+            images[:, :scale_frames],
+            num_frame_for_scale=scale_frames,
+            num_frame_per_block=scale_frames,
+            causal_inference=True,
+        )
+        mx.eval(scale_out)
+
+        all_pose  = [scale_out["pose_enc"]]
+        all_depth = [scale_out["depth"]] if "depth" in scale_out else []
+        all_dconf = [scale_out["depth_conf"]] if "depth_conf" in scale_out else []
+        all_pts   = [scale_out["world_points"]] if "world_points" in scale_out else []
+        all_pconf = [scale_out["world_points_conf"]] if "world_points_conf" in scale_out else []
+        del scale_out
+
+        # ------ Phase 2: streaming frame-by-frame ------
+        for i in tqdm(range(scale_frames, S), desc="Streaming", initial=scale_frames, total=S):
+            is_keyframe = (keyframe_interval <= 1) or ((i - scale_frames) % keyframe_interval == 0)
+
+            if not is_keyframe:
+                self._set_skip_append(True)
+
+            frame_out = self(
+                images[:, i:i+1],
+                num_frame_for_scale=scale_frames,
+                num_frame_per_block=1,
+                causal_inference=True,
+            )
+            mx.eval(frame_out)
+
+            if not is_keyframe:
+                self._set_skip_append(False)
+
+            all_pose.append(frame_out["pose_enc"])
+            if "depth"           in frame_out: all_depth.append(frame_out["depth"])
+            if "depth_conf"      in frame_out: all_dconf.append(frame_out["depth_conf"])
+            if "world_points"    in frame_out: all_pts.append(frame_out["world_points"])
+            if "world_points_conf" in frame_out: all_pconf.append(frame_out["world_points_conf"])
+            del frame_out
+
+        self.clean_kv_cache()
+
+        result: Dict[str, mx.array] = {
+            "pose_enc": mx.concatenate(all_pose, axis=1),
+            "images":   images,
+        }
+        if all_depth: result["depth"]              = mx.concatenate(all_depth, axis=1)
+        if all_dconf: result["depth_conf"]         = mx.concatenate(all_dconf, axis=1)
+        if all_pts:   result["world_points"]       = mx.concatenate(all_pts,   axis=1)
+        if all_pconf: result["world_points_conf"]  = mx.concatenate(all_pconf, axis=1)
+        return result
diff --git a/mlx_infer/rope.py b/mlx_infer/rope.py
new file mode 100644
index 0000000..4552918
--- /dev/null
+++ b/mlx_infer/rope.py
@@ -0,0 +1,193 @@
+"""
+Rotary Position Embeddings for MLX.
+
+Two variants:
+  RotaryEmbedding2D   — per-frame spatial RoPE (replaces RotaryPositionEmbedding2D)
+  RotaryEmbedding3D   — temporal+spatial RoPE for camera head (replaces WanRotaryPosEmbed)
+"""
+
+from typing import Optional, Dict, Tuple
+import math
+import numpy as np
+import mlx.core as mx
+import mlx.nn as nn
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _freq_components(dim: int, base: float = 100.0, scaling: float = 1.0,
+                     max_seq: int = 512) -> Tuple[mx.array, mx.array]:
+    """Pre-compute (cos, sin) frequency tables of shape [max_seq, dim].
+
+    Kept as CPU computation to avoid any device-buffer-size issues.
+    """
+    exponents = np.arange(0, dim, 2, dtype=np.float32) / dim
+    inv_freq = 1.0 / (base ** exponents) / scaling          # [dim//2]
+    positions = np.arange(max_seq, dtype=np.float32)        # [max_seq]
+    angles = np.outer(positions, inv_freq)                   # [max_seq, dim//2]
+    angles = np.concatenate([angles, angles], axis=-1)       # [max_seq, dim]
+    return mx.array(np.cos(angles)), mx.array(np.sin(angles))
+
+
+def _lookup(positions: mx.array, table: mx.array) -> mx.array:
+    """Embed integer positions using a pre-computed frequency table.
+
+    positions: [B, T]  (integer grid indices)
+    table:     [max_seq, dim]
+    returns:   [B, 1, T, dim]  (ready to broadcast over heads)
+    """
+    # MLX take is equivalent to F.embedding
+    emb = table[positions]   # [B, T, dim]
+    return mx.expand_dims(emb, axis=1)  # [B, 1, T, dim]
+
+
+# ---------------------------------------------------------------------------
+# 2D spatial RoPE (used in frame-level blocks)
+# ---------------------------------------------------------------------------
+
+class RotaryEmbedding2D(nn.Module):
+    """2D Rotary Position Embedding for patch grids.
+
+    Mirrors RotaryPositionEmbedding2D from lingbot_map/layers/rope.py.
+    Splits the head dimension into vertical and horizontal halves.
+    """
+
+    def __init__(self, base: float = 100.0, scaling: float = 1.0):
+        super().__init__()
+        self.base = base
+        self.scaling = scaling
+        # Tables are built lazily (depend on head_dim + max_position)
+        self._cache: Dict[Tuple, Tuple[mx.array, mx.array]] = {}
+
+    def _get_tables(self, head_dim: int, max_pos: int) -> Tuple[mx.array, mx.array]:
+        key = (head_dim, max_pos)
+        if key not in self._cache:
+            cos_t, sin_t = _freq_components(
+                head_dim, base=self.base, scaling=self.scaling,
+                max_seq=max_pos + 1,
+            )
+            self._cache[key] = (cos_t, sin_t)
+        return self._cache[key]
+
+    def get_cos_sin(self, q: mx.array, positions: mx.array,
+                    head_dim: Optional[int] = None) -> Tuple[mx.array, mx.array]:
+        """Return (cos, sin) tensors shaped [B, 1, T, D] for the given spatial positions.
+
+        positions: [B, T, 2]  (y, x integer coords)
+        q:         [B, H, T, D]  — used to infer head_dim when head_dim is None
+        head_dim:  explicit head dim (avoids needing a dummy q tensor)
+        """
+        if head_dim is None:
+            head_dim = q.shape[-1] // 2   # each spatial axis gets head_dim // 2 dims
+        else:
+            head_dim = head_dim // 2      # convert full head_dim → per-axis dim
+        max_pos = int(mx.max(positions).item()) + 1
+
+        cos_t, sin_t = self._get_tables(head_dim, max_pos)
+
+        pos_y = positions[..., 0]  # [B, T]
+        pos_x = positions[..., 1]  # [B, T]
+
+        cos_y = _lookup(pos_y, cos_t)   # [B, 1, T, head_dim]
+        sin_y = _lookup(pos_y, sin_t)
+        cos_x = _lookup(pos_x, cos_t)
+        sin_x = _lookup(pos_x, sin_t)
+
+        # Concatenate vertical and horizontal components
+        cos = mx.concatenate([cos_y, cos_x], axis=-1)  # [B, 1, T, D]
+        sin = mx.concatenate([sin_y, sin_x], axis=-1)
+        return cos, sin
+
+
+class PositionGetter2D:
+    """Generates y,x grid positions for an H×W patch grid, with caching."""
+
+    def __init__(self):
+        self._cache: Dict[Tuple[int, int], mx.array] = {}
+
+    def __call__(self, B: int, H: int, W: int) -> mx.array:
+        """Returns positions of shape [B, H*W, 2]."""
+        key = (H, W)
+        if key not in self._cache:
+            y = mx.arange(H)
+            x = mx.arange(W)
+            # cartesian product → [H*W, 2]
+            yy = mx.repeat(mx.expand_dims(y, 1), W, axis=1).reshape(-1)
+            xx = mx.tile(mx.expand_dims(x, 0), (H, 1)).reshape(-1)
+            self._cache[key] = mx.stack([yy, xx], axis=1)   # [H*W, 2]
+        base = self._cache[key]                              # [H*W, 2]
+        return mx.repeat(mx.expand_dims(base, 0), B, axis=0)  # [B, H*W, 2]
+
+
+# ---------------------------------------------------------------------------
+# 3D temporal+spatial RoPE (used in camera head, disabled by default)
+# ---------------------------------------------------------------------------
+
+class RotaryEmbedding3D(nn.Module):
+    """3D Rotary Position Embedding for streaming video tokens.
+
+    Mirrors WanRotaryPosEmbed from lingbot_map/layers/rope.py.
+    Allocates head_dim across (temporal, height, width) dimensions.
+    """
+
+    def __init__(self, head_dim: int, max_seq_len: int = 1024,
+                 theta: float = 10000.0,
+                 fhw_dim: Optional[Tuple[int, int, int]] = None):
+        super().__init__()
+        if fhw_dim is not None:
+            t_dim, h_dim, w_dim = fhw_dim
+        else:
+            h_dim = w_dim = 2 * (head_dim // 6)
+            t_dim = head_dim - h_dim - w_dim
+        self.fhw_dim = (t_dim, h_dim, w_dim)
+
+        # Pre-compute frequency tables for each axis (on CPU, then convert)
+        def make_freqs(d: int) -> mx.array:
+            exps = np.arange(0, d, 2, dtype=np.float32) / d
+            inv_freq = 1.0 / (theta ** exps)  # [d//2]
+            pos = np.arange(max_seq_len, dtype=np.float32)
+            return mx.array(np.outer(pos, inv_freq))  # [max_seq, d//2]
+
+        self.freqs_t = make_freqs(t_dim)
+        self.freqs_h = make_freqs(h_dim)
+        self.freqs_w = make_freqs(w_dim)
+
+    def __call__(self, ppf: int, pph: int, ppw: int, patch_start_idx: int,
+                 f_start: int = 0, f_end: Optional[int] = None) -> mx.array:
+        """Build 3D RoPE frequency tensor.
+
+        Returns real-valued angles of shape [1, 1, T, head_dim//2]
+        where T = ppf * (patch_start_idx + pph * ppw).
+        """
+        if f_end is not None:
+            ppf = f_end - f_start
+            frame_slice = slice(f_start, f_end)
+        else:
+            frame_slice = slice(0, ppf)
+
+        ft, fh, fw = self.freqs_t, self.freqs_h, self.freqs_w
+
+        if patch_start_idx > 0:
+            # Special tokens: position (f, i, i) on the diagonal
+            ff_s = ft[frame_slice].reshape(ppf, 1, -1).broadcast_to((ppf, patch_start_idx, ft.shape[-1]))
+            fh_s = fh[:patch_start_idx].reshape(1, patch_start_idx, -1).broadcast_to((ppf, patch_start_idx, fh.shape[-1]))
+            fw_s = fw[:patch_start_idx].reshape(1, patch_start_idx, -1).broadcast_to((ppf, patch_start_idx, fw.shape[-1]))
+            freqs_special = mx.concatenate([ff_s, fh_s, fw_s], axis=-1)  # [ppf, N_sp, dim/2]
+
+            # Patch tokens
+            ff_p = ft[frame_slice].reshape(ppf, 1, 1, -1).broadcast_to((ppf, pph, ppw, ft.shape[-1]))
+            fh_p = fh[patch_start_idx:patch_start_idx + pph].reshape(1, pph, 1, -1).broadcast_to((ppf, pph, ppw, fh.shape[-1]))
+            fw_p = fw[patch_start_idx:patch_start_idx + ppw].reshape(1, 1, ppw, -1).broadcast_to((ppf, pph, ppw, fw.shape[-1]))
+            freqs_patches = mx.concatenate([ff_p, fh_p, fw_p], axis=-1).reshape(ppf, pph * ppw, -1)
+
+            freqs = mx.concatenate([freqs_special, freqs_patches], axis=1)  # [ppf, N_sp+N_p, dim/2]
+        else:
+            ff_p = ft[frame_slice].reshape(ppf, 1, 1, -1).broadcast_to((ppf, pph, ppw, ft.shape[-1]))
+            fh_p = fh[:pph].reshape(1, pph, 1, -1).broadcast_to((ppf, pph, ppw, fh.shape[-1]))
+            fw_p = fw[:ppw].reshape(1, 1, ppw, -1).broadcast_to((ppf, pph, ppw, fw.shape[-1]))
+            freqs = mx.concatenate([ff_p, fh_p, fw_p], axis=-1).reshape(ppf * pph * ppw, -1)
+
+        total_tokens = freqs.shape[0] if patch_start_idx == 0 else ppf * (patch_start_idx + pph * ppw)
+        return freqs.reshape(1, 1, total_tokens, -1)   # [1, 1, T, dim/2]
diff --git a/mlx_infer/weights.py b/mlx_infer/weights.py
new file mode 100644
index 0000000..4ab8f3a
--- /dev/null
+++ b/mlx_infer/weights.py
@@ -0,0 +1,221 @@
+"""
+PyTorch checkpoint → MLX weight conversion for GCTStreamMLX.
+
+Usage
+-----
+    from mlx_infer.weights import load_checkpoint
+    model = GCTStreamMLX(...)
+    load_checkpoint(model, "path/to/checkpoint.pt")
+
+Key remapping
+-------------
+The PyTorch checkpoint uses slightly different attribute paths than the MLX
+model for a few structural reasons:
+
+1. patch_embed.patch_embed.proj.* → patch_embed.patch_embed.*
+   PyTorch VisionTransformer wraps the patch Conv2d in a PatchEmbed class with
+   a .proj attribute.  Our ViTMLX stores the Conv2d directly.
+
+2. camera_head.poseLN_modulation.1.* → camera_head.poseLN_linear.*
+   PyTorch uses nn.Sequential([SiLU, Linear]); we store the Linear as a named
+   attribute `poseLN_linear`.
+
+3. {depth,point}_head.resize_layers.{0,1,3}.* → {depth,point}_head.resize_conv{0,1,3}.*
+   PyTorch DPTHead stores resize convs in a ModuleList; ours use named attrs.
+
+4. {depth,point}_head.scratch.* → {depth,point}_head.*
+   PyTorch DPTHead groups fusion layers under a nested `scratch` module; ours
+   are flat at the head level.
+
+5. {depth,point}_head.scratch.output_conv2.0.* → {depth,point}_head.output_conv2a.*
+   {depth,point}_head.scratch.output_conv2.2.* → {depth,point}_head.output_conv2b.*
+   PyTorch uses nn.Sequential for the two output convs; ours are named attrs.
+
+ConvTranspose2d weight layout
+------------------------------
+PyTorch ConvTranspose2d weight: [in_channels, out_channels, kH, kW]
+MLX   ConvTranspose2d weight: [out_channels, kH, kW, in_channels]
+→ transpose axes (1, 2, 3, 0)
+
+Conv2d weight layout
+--------------------
+PyTorch Conv2d weight: [out_channels, in_channels, kH, kW]
+MLX   Conv2d weight: [out_channels, kH, kW, in_channels]
+→ transpose axes (0, 2, 3, 1)
+"""
+
+import re
+from pathlib import Path
+from typing import Dict, Any
+import numpy as np
+import mlx.core as mx
+import mlx.nn as nn
+from mlx.utils import tree_flatten
+
+
+# ---------------------------------------------------------------------------
+# Key remapping rules
+# ---------------------------------------------------------------------------
+
+def _remap_key(k: str) -> str:
+    """Transform a PyTorch checkpoint key to its MLX model equivalent."""
+
+    # 1. patch_embed Conv2d: remove the extra .proj level
+    k = re.sub(
+        r"(aggregator\.patch_embed\.patch_embed)\.proj\.(weight|bias)$",
+        r"\1.\2", k
+    )
+
+    # 2. camera_head poseLN_modulation sequential → named attr
+    k = re.sub(
+        r"camera_head\.poseLN_modulation\.1\.(weight|bias)$",
+        r"camera_head.poseLN_linear.\1", k
+    )
+
+    # 3. DPT resize_layers ModuleList → named attributes
+    for head in ("depth_head", "point_head"):
+        for old, new in (("0", "0"), ("1", "1"), ("3", "3")):
+            k = k.replace(
+                f"{head}.resize_layers.{old}.",
+                f"{head}.resize_conv{new}."
+            )
+
+    # 4. Strip DPT scratch.* namespace (also handles scratch.refinenet*, scratch.layer*_rn)
+    for head in ("depth_head", "point_head"):
+        k = k.replace(f"{head}.scratch.", f"{head}.")
+
+    # 5. DPT output_conv2 Sequential indices → named attrs
+    for head in ("depth_head", "point_head"):
+        k = k.replace(f"{head}.output_conv2.0.", f"{head}.output_conv2a.")
+        k = k.replace(f"{head}.output_conv2.2.", f"{head}.output_conv2b.")
+
+    return k
+
+
+# Keys to silently drop (no corresponding parameter in the MLX model)
+_DROP_PATTERNS = [
+    re.compile(r"\.mask_token$"),            # not used at inference
+    re.compile(r"\.poseLN_modulation\.0\."),  # SiLU has no params
+]
+
+
+def _should_drop(k: str) -> bool:
+    return any(p.search(k) for p in _DROP_PATTERNS)
+
+
+# Keys whose weight layout needs ConvTranspose2d treatment.
+# Checked on the ORIGINAL (pre-remap) key.
+_CONV_TRANSPOSE_RE = re.compile(r"resize_layers\.[01]\.weight$")
+
+
+def _is_conv_transpose(original_key: str) -> bool:
+    return bool(_CONV_TRANSPOSE_RE.search(original_key))
+
+
+# ---------------------------------------------------------------------------
+# Tensor conversion
+# ---------------------------------------------------------------------------
+
+def _convert_tensor(original_key: str, t) -> mx.array:
+    """Convert a single PyTorch tensor to an MLX array with layout fixes."""
+    arr = np.asarray(t.float().cpu())   # always float32
+
+    if arr.ndim == 4 and original_key.endswith(".weight"):
+        if _is_conv_transpose(original_key):
+            # PyTorch ConvTranspose2d: [I, O, kH, kW] → MLX [O, kH, kW, I]
+            arr = arr.transpose(1, 2, 3, 0)
+        else:
+            # PyTorch Conv2d: [O, I, kH, kW] → MLX [O, kH, kW, I]
+            arr = arr.transpose(0, 2, 3, 1)
+
+    return mx.array(arr)
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def convert_state_dict(state_dict: Dict[str, Any]) -> Dict[str, mx.array]:
+    """Convert + remap a PyTorch state dict to MLX-compatible flat weight dict."""
+    out: Dict[str, mx.array] = {}
+    for k, v in state_dict.items():
+        if _should_drop(k):
+            continue
+        mlx_key = _remap_key(k)
+        try:
+            out[mlx_key] = _convert_tensor(k, v)
+        except Exception as e:
+            print(f"[weights] skipping {k}: {e}")
+    return out
+
+
+def load_checkpoint(
+    model: nn.Module,
+    path: str,
+    strict: bool = False,
+    verbose: bool = True,
+) -> nn.Module:
+    """Load a PyTorch .pt checkpoint into an MLX GCTStreamMLX model.
+
+    Parameters
+    ----------
+    model : nn.Module
+        The GCTStreamMLX instance to populate.
+    path : str
+        Path to the PyTorch checkpoint (.pt file).
+    strict : bool
+        If True, raise on missing or unexpected keys (after remapping).
+    verbose : bool
+        Print key statistics.
+
+    Returns
+    -------
+    model (in-place modified).
+    """
+    import torch
+
+    ckpt = torch.load(path, map_location="cpu", weights_only=False)
+
+    if isinstance(ckpt, dict):
+        if "model" in ckpt:
+            state_dict = ckpt["model"]
+        elif "state_dict" in ckpt:
+            state_dict = ckpt["state_dict"]
+        else:
+            state_dict = ckpt
+    else:
+        raise ValueError(f"Unexpected checkpoint type: {type(ckpt)}")
+
+    mlx_weights = convert_state_dict(state_dict)
+
+    # Collect the model's parameter keys from the flattened tree
+    model_keys = set(k for k, _ in tree_flatten(model.parameters()))
+    ckpt_keys  = set(mlx_weights.keys())
+
+    matched    = model_keys & ckpt_keys
+    missing    = model_keys - ckpt_keys     # in model, not in ckpt
+    unexpected = ckpt_keys - model_keys     # in ckpt, not in model
+
+    if verbose:
+        print(f"[weights] loaded {len(state_dict)} tensors from {Path(path).name}")
+        print(f"[weights] matched {len(matched)} / {len(model_keys)} model params")
+        if missing:
+            print(f"[weights] missing  ({len(missing)}): "
+                  + ", ".join(sorted(missing)[:6])
+                  + (" ..." if len(missing) > 6 else ""))
+        if unexpected:
+            print(f"[weights] unexpected ({len(unexpected)}): "
+                  + ", ".join(sorted(unexpected)[:6])
+                  + (" ..." if len(unexpected) > 6 else ""))
+
+    if strict and (missing or unexpected):
+        raise RuntimeError(
+            f"Strict load failed: {len(missing)} missing, {len(unexpected)} unexpected"
+        )
+
+    # Filter to only matched keys, then load — pass strict=False so MLX doesn't
+    # raise for parameters that exist in the model but weren't in the checkpoint.
+    filtered = {k: v for k, v in mlx_weights.items() if k in model_keys}
+    model.load_weights(list(filtered.items()), strict=False)
+    mx.eval(model.parameters())
+    return model

From 17bf6ec815c362c93bd2f7d315b8ed09fb49135f Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Wed, 29 Apr 2026 21:47:58 +0800
Subject: [PATCH 02/13] Fix float16 scipy error; cap k_special growth

- _interpolate_pos_embed: force float32 when extracting pos_embed to numpy
  for scipy.ndimage.zoom (rejects float16 arrays with RuntimeError)
- Add max_special_tokens cap to CausalAttentionMLX / StreamingBlock;
  thread kv_cache_max_special_frames through AggregatorMLX, CameraHeadMLX,
  GCTStreamMLX; expose as --max-special-frames CLI flag (default: None)
  k_special grows 6 tokens/frame indefinitely without this cap; set to
  e.g. 100 for sequences >300 frames with small --kv-sliding-window
---
 mlx_infer/aggregator.py | 30 +++++++++++++++++++++---------
 mlx_infer/demo.py       | 18 +++++++++++++++---
 mlx_infer/heads.py      |  8 +++++---
 mlx_infer/layers.py     | 33 +++++++++++++++++++++------------
 mlx_infer/model.py      |  7 +++++++
 5 files changed, 69 insertions(+), 27 deletions(-)

diff --git a/mlx_infer/aggregator.py b/mlx_infer/aggregator.py
index 2a8a811..2192585 100644
--- a/mlx_infer/aggregator.py
+++ b/mlx_infer/aggregator.py
@@ -119,16 +119,16 @@ def _interpolate_pos_embed(self, h: int, w: int) -> mx.array:
             self._pos_embed_cache = {}
         if key not in self._pos_embed_cache:
             from scipy.ndimage import zoom
-            patch_pe = np.array(self.pos_embed[0, 1:])      # [M*M, C]
+            patch_pe = np.array(self.pos_embed[0, 1:], dtype=np.float32)  # [M*M, C]
             patch_pe = patch_pe.reshape(M, M, -1)            # [M, M, C]
             scale_h = h / M
             scale_w = w / M
             patch_pe = zoom(patch_pe, (scale_h, scale_w, 1), order=3)  # bicubic
             patch_pe = patch_pe.reshape(1, h * w, -1)        # [1, h*w, C]
-            cls_pe = np.array(self.pos_embed[0:1, :1])       # [1, 1, C]
+            cls_pe = np.array(self.pos_embed[0:1, :1], dtype=np.float32)  # [1, 1, C]
             full_pe = np.concatenate([cls_pe, patch_pe], axis=1)
             self._pos_embed_cache[key] = mx.array(full_pe.astype(np.float32))
-        return self._pos_embed_cache[key]
+        return self._pos_embed_cache[key].astype(self.pos_embed.dtype)
 
     def __call__(self, x: mx.array) -> mx.array:
         """x: [B, H, W, 3] → patch tokens [B, N_patch, embed_dim]."""
@@ -207,6 +207,7 @@ def __init__(
         kv_cache_sliding_window: int = 64,
         kv_cache_scale_frames: int = 8,
         kv_cache_keep_special: bool = True,
+        kv_cache_max_special_frames: Optional[int] = None,
     ):
         super().__init__()
         assert depth % aa_block_size == 0
@@ -242,6 +243,12 @@ def __init__(
         # Frame blocks: standard TransformerBlock (per-frame, no KV cache)
         self.frame_blocks = [TransformerBlock(**block_kw) for _ in range(depth)]
 
+        # patch_start_idx: camera(1) + register(num_register_tokens) + scale(1)
+        # Computed here (before StreamingBlocks) so we can pass max_special_tokens.
+        self.patch_start_idx = 1 + num_register_tokens + 1
+        _max_sp_tok = (kv_cache_max_special_frames * self.patch_start_idx
+                       if kv_cache_max_special_frames is not None else None)
+
         # Global blocks: StreamingBlock with causal KV cache
         self.global_blocks = [
             StreamingBlock(
@@ -249,6 +256,7 @@ def __init__(
                 sliding_window=kv_cache_sliding_window,
                 scale_frames=kv_cache_scale_frames,
                 keep_special=kv_cache_keep_special,
+                max_special_tokens=_max_sp_tok,
             )
             for _ in range(depth)
         ]
@@ -258,9 +266,6 @@ def __init__(
         self.camera_token   = mx.zeros((1, 2, 1, embed_dim))
         self.register_token = mx.zeros((1, 2, num_register_tokens, embed_dim))
         self.scale_token    = mx.ones((1, 2, 1, embed_dim))
-
-        # patch_start_idx: camera(1) + register(num_register_tokens) + scale(1)
-        self.patch_start_idx = 1 + num_register_tokens + 1
         self.num_special_tokens = self.patch_start_idx
 
         # KV cache: one dict per global block, initialised lazily
@@ -372,12 +377,13 @@ def __call__(
         # transpose to NHWC: [B*S, H, W, 3]
         imgs = images.reshape(B * S, 3, H, W)
         imgs = imgs.transpose(0, 2, 3, 1)                   # [B*S, H, W, 3]
-        mean = mx.array(_RESNET_MEAN, dtype=imgs.dtype).reshape(1, 1, 1, 3)
-        std  = mx.array(_RESNET_STD,  dtype=imgs.dtype).reshape(1, 1, 1, 3)
-        imgs = (imgs - mean) / std
+        imgs = (imgs - self._mean.astype(imgs.dtype)) / self._std.astype(imgs.dtype)
 
         # ---- DINOv2 patch embedding ----
         patch_tokens = self.patch_embed(imgs)                # [B*S, N_patch, C]
+        # Materialise the backbone before frame/global blocks so MLX compiles
+        # two smaller subgraphs rather than one 72-block graph per frame.
+        mx.eval(patch_tokens)
         C = patch_tokens.shape[-1]
 
         # ---- Special tokens ----
@@ -393,6 +399,9 @@ def __call__(
         head_dim = self.embed_dim // self.num_heads          # 64 for ViT-L
         cos, sin = self.rope.get_cos_sin(
             None, pos, head_dim=head_dim)                    # [B*S, 1, P, 64]
+        # Cast to compute dtype so float16 Q/K aren't silently upcast by float32 tables.
+        cos = cos.astype(tokens.dtype)
+        sin = sin.astype(tokens.dtype)
 
         # ---- Alternating frame / global attention ----
         output_list: List[mx.array] = []
@@ -437,6 +446,9 @@ def __call__(
             if selected_idx is None or group in selected_idx:
                 for fi, gi in zip(frame_outs, global_outs):
                     output_list.append(mx.concatenate([fi, gi], axis=-1))  # [B, S, P, 2C]
+                # Break the lazy graph: forces this segment to execute and lets the
+                # next segment compile independently (~6 blocks per segment).
+                mx.eval(tokens_global)
 
         # Update frame counter (only on keyframe path, skip_append=False)
         if self._kv_cache is not None and not self._kv_cache[0].get("_skip_append", False):
diff --git a/mlx_infer/demo.py b/mlx_infer/demo.py
index 0fc2f3c..ba7c543 100644
--- a/mlx_infer/demo.py
+++ b/mlx_infer/demo.py
@@ -146,8 +146,16 @@ def main():
                         help="Number of initial scale frames (Phase 1)")
     parser.add_argument("--keyframe-interval", type=int, default=1,
                         help="Keyframe interval (1 = every frame)")
+    parser.add_argument("--kv-sliding-window", type=int, default=64,
+                        help="KV-cache sliding window in frames. "
+                             "Dominant cost is 24 attn blocks x window*1375 keys per frame. "
+                             "sw=64 (default/accurate), sw=16 (~2.5x faster), sw=8 (~3.3x faster)")
+    parser.add_argument("--max-special-frames", type=int, default=None,
+                        help="Cap k_special (evicted frames' tokens) at this many frames. "
+                             "k_special grows 6 tokens/frame indefinitely; for sequences "
+                             ">300 frames with small --kv-sliding-window, set to e.g. 100.")
     parser.add_argument("--dtype", choices=["float32", "float16"], default="float32",
-                        help="Compute dtype (float16 is faster but less stable)")
+                        help="Compute dtype (float16 is ~2x faster, recommended for speed)")
     parser.add_argument("--max-frames", type=int, default=None,
                         help="Only process first N frames (default: all)")
     # Visualization
@@ -165,6 +173,9 @@ def main():
 
     # ---- Build model ----
     print("Building GCTStreamMLX model...")
+    tokens_per_frame = (args.img_size // 14) ** 2 + 6
+    print(f"KV-cache: scale={args.scale_frames} + sliding={args.kv_sliding_window} frames "
+          f"= {(args.scale_frames + args.kv_sliding_window) * tokens_per_frame:,} keys/block")
     model = GCTStreamMLX(
         img_size=args.img_size,
         patch_size=14,
@@ -172,8 +183,9 @@ def main():
         depth=24,
         num_heads=16,
         num_register_tokens=4,
-        kv_cache_sliding_window=64,
+        kv_cache_sliding_window=args.kv_sliding_window,
         kv_cache_scale_frames=args.scale_frames,
+        kv_cache_max_special_frames=args.max_special_frames,
         camera_num_iterations=4,
         enable_depth=True,
         enable_point=False,
@@ -196,7 +208,7 @@ def main():
 
     if args.dtype == "float16":
         images = images.astype(mx.float16)
-        model.apply(lambda x: x.astype(mx.float16) if mx.is_array(x) else x)
+        model.apply(lambda x: x.astype(mx.float16) if isinstance(x, mx.array) else x)
 
     # ---- Streaming inference ----
     print("Running streaming inference...")
diff --git a/mlx_infer/heads.py b/mlx_infer/heads.py
index 9e7f849..5187ebf 100644
--- a/mlx_infer/heads.py
+++ b/mlx_infer/heads.py
@@ -104,6 +104,7 @@ def __init__(
         kv_cache_sliding_window: int = 64,
         kv_cache_scale_frames: int = 8,
         kv_cache_keep_special: bool = True,
+        kv_cache_max_special_tokens: Optional[int] = None,
     ):
         super().__init__()
         self.dim_in = dim_in
@@ -120,6 +121,7 @@ def __init__(
                 sliding_window=kv_cache_sliding_window,
                 scale_frames=kv_cache_scale_frames,
                 keep_special=kv_cache_keep_special,
+                max_special_tokens=kv_cache_max_special_tokens,
             )
             for _ in range(trunk_depth)
         ]
@@ -314,8 +316,8 @@ def _bilinear(x: mx.array, hw: Tuple[int, int], align_corners: bool = True) -> m
     x0 = mx.clip(mx.floor(x_src).astype(mx.int32), 0, W - 1)
     x1 = mx.clip(x0 + 1, 0, W - 1)
 
-    wy1 = (y_src - mx.floor(y_src)).reshape(1, h, 1, 1)
-    wx1 = (x_src - mx.floor(x_src)).reshape(1, 1, w, 1)
+    wy1 = (y_src - mx.floor(y_src)).reshape(1, h, 1, 1).astype(x.dtype)
+    wx1 = (x_src - mx.floor(x_src)).reshape(1, 1, w, 1).astype(x.dtype)
     wy0 = 1.0 - wy1
     wx0 = 1.0 - wx1
 
@@ -528,5 +530,5 @@ def _sincos(coords: np.ndarray, dim: int) -> np.ndarray:
             emb = emb.reshape(ph, pw, C).astype(np.float32) * ratio
             self._pos_embed_cache[key] = mx.array(emb[None])  # [1, ph, pw, C]
 
-        emb = self._pos_embed_cache[key]
+        emb = self._pos_embed_cache[key].astype(x.dtype)
         return x + mx.broadcast_to(emb, x.shape)
diff --git a/mlx_infer/layers.py b/mlx_infer/layers.py
index 0c1f37d..ccc0fb2 100644
--- a/mlx_infer/layers.py
+++ b/mlx_infer/layers.py
@@ -178,7 +178,7 @@ class CausalAttentionMLX(nn.Module):
     def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = True,
                  proj_bias: bool = True, qk_norm: bool = False,
                  sliding_window: int = 64, scale_frames: int = 8,
-                 keep_special: bool = True):
+                 keep_special: bool = True, max_special_tokens: Optional[int] = None):
         super().__init__()
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
@@ -186,6 +186,7 @@ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = True,
         self.sliding_window = sliding_window
         self.scale_frames = scale_frames
         self.keep_special = keep_special
+        self.max_special_tokens = max_special_tokens
 
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.proj = nn.Linear(dim, dim, bias=proj_bias)
@@ -243,14 +244,26 @@ def __call__(self, x: mx.array,
                     if n_frames > total_keep:
                         # Preserve special tokens from evicted frames
                         if self.keep_special:
-                            evict_k = k_cat[:, :, self.scale_frames:n_frames - self.sliding_window, :patch_start_idx, :]
-                            evict_v = v_cat[:, :, self.scale_frames:n_frames - self.sliding_window, :patch_start_idx, :]
+                            n_evict = n_frames - self.sliding_window - self.scale_frames
+                            evict_k = k_cat[:, :, self.scale_frames:self.scale_frames + n_evict,
+                                            :patch_start_idx, :]
+                            evict_v = v_cat[:, :, self.scale_frames:self.scale_frames + n_evict,
+                                            :patch_start_idx, :]
+                            # Store flat [B, H, n_evict*patch_start_idx, D] to avoid
+                            # reshape on every attention call.
+                            B_, H_, ne, ps, Dh = evict_k.shape
+                            evict_k = evict_k.reshape(B_, H_, ne * ps, Dh)
+                            evict_v = evict_v.reshape(B_, H_, ne * ps, Dh)
                             if kv_cache.get("k_special") is None:
                                 kv_cache["k_special"] = evict_k
                                 kv_cache["v_special"] = evict_v
                             else:
                                 kv_cache["k_special"] = mx.concatenate([kv_cache["k_special"], evict_k], axis=2)
                                 kv_cache["v_special"] = mx.concatenate([kv_cache["v_special"], evict_v], axis=2)
+                                if (self.max_special_tokens is not None and
+                                        kv_cache["k_special"].shape[2] > self.max_special_tokens):
+                                    kv_cache["k_special"] = kv_cache["k_special"][:, :, -self.max_special_tokens:]
+                                    kv_cache["v_special"] = kv_cache["v_special"][:, :, -self.max_special_tokens:]
                         kv_cache["k"] = mx.concatenate([
                             k_cat[:, :, :self.scale_frames],
                             k_cat[:, :, -self.sliding_window:],
@@ -273,15 +286,10 @@ def __call__(self, x: mx.array,
                 k_full = k_cached.reshape(B, self.num_heads, -1, self.head_dim)
                 v_full = v_cached.reshape(B, self.num_heads, -1, self.head_dim)
 
-                # Prepend preserved special tokens
+                # Prepend preserved special tokens (stored flat: [B, H, N_sp, D])
                 if kv_cache.get("k_special") is not None:
-                    ks = kv_cache["k_special"]
-                    vs = kv_cache["v_special"]
-                    sa, sb, sc, sd, se = ks.shape
-                    ks = ks.reshape(sa, sb, sc * sd, se)
-                    vs = vs.reshape(sa, sb, sc * sd, se)
-                    k_full = mx.concatenate([ks, k_full], axis=2)
-                    v_full = mx.concatenate([vs, v_full], axis=2)
+                    k_full = mx.concatenate([kv_cache["k_special"], k_full], axis=2)
+                    v_full = mx.concatenate([kv_cache["v_special"], v_full], axis=2)
 
             x_out = mx.fast.scaled_dot_product_attention(q, k_full, v_full, scale=self.scale)
 
@@ -296,13 +304,14 @@ def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 4.0,
                  qkv_bias: bool = True, proj_bias: bool = True, ffn_bias: bool = True,
                  qk_norm: bool = False, init_values: Optional[float] = None,
                  sliding_window: int = 64, scale_frames: int = 8,
-                 keep_special: bool = True):
+                 keep_special: bool = True, max_special_tokens: Optional[int] = None):
         super().__init__()
         self.norm1 = LayerNorm(dim)
         self.attn = CausalAttentionMLX(
             dim, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias,
             qk_norm=qk_norm, sliding_window=sliding_window,
             scale_frames=scale_frames, keep_special=keep_special,
+            max_special_tokens=max_special_tokens,
         )
         self.ls1 = LayerScale(dim, init_values) if init_values else nn.Identity()
         self.norm2 = LayerNorm(dim)
diff --git a/mlx_infer/model.py b/mlx_infer/model.py
index 97bafca..b9ec7a0 100644
--- a/mlx_infer/model.py
+++ b/mlx_infer/model.py
@@ -46,6 +46,7 @@ def __init__(
         kv_cache_sliding_window: int = 64,
         kv_cache_scale_frames: int = 8,
         kv_cache_keep_special: bool = True,
+        kv_cache_max_special_frames: Optional[int] = None,
         camera_num_iterations: int = 4,
         enable_depth: bool = True,
         enable_point: bool = True,
@@ -55,6 +56,10 @@ def __init__(
         self.patch_size = patch_size
         dim_2c = 2 * embed_dim   # concatenated frame+global output dim
 
+        # patch_start_idx = camera(1) + register(num_register_tokens) + scale(1)
+        _max_sp_tok = (kv_cache_max_special_frames * (1 + num_register_tokens + 1)
+                       if kv_cache_max_special_frames is not None else None)
+
         self.aggregator = AggregatorMLX(
             img_size=img_size,
             patch_size=patch_size,
@@ -65,6 +70,7 @@ def __init__(
             kv_cache_sliding_window=kv_cache_sliding_window,
             kv_cache_scale_frames=kv_cache_scale_frames,
             kv_cache_keep_special=kv_cache_keep_special,
+            kv_cache_max_special_frames=kv_cache_max_special_frames,
         )
 
         self.camera_head = CameraHeadMLX(
@@ -75,6 +81,7 @@ def __init__(
             kv_cache_sliding_window=kv_cache_sliding_window,
             kv_cache_scale_frames=kv_cache_scale_frames,
             kv_cache_keep_special=kv_cache_keep_special,
+            kv_cache_max_special_tokens=_max_sp_tok,
         )
 
         self.depth_head = DPTHeadMLX(

From 1e74aa3668268609083960646dc1716bb9d17326 Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Wed, 29 Apr 2026 22:25:06 +0800
Subject: [PATCH 03/13] Speed up _bilinear: precompute flat gather indices,
 cache as MLX arrays

Replace double sequential fancy indexing (x[:,y0,:,:][:,:,x0,:]) with
precomputed flat 1D gather indices into a reshaped [B, H*W, C] view.
Indices computed once per (H,W,h,w) with numpy and cached as concrete
MLX int32 arrays; removes 8 lazy-graph arange/clip/floor ops per call.
Also factors the bilinear blend: 6 muls vs 8, and uses 2.1-2.4x speedup.
End-to-end: 1.5 fps -> 1.9 fps on 294x518, kv-sw=8, float16.
---
 mlx_infer/heads.py | 77 +++++++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 24 deletions(-)

diff --git a/mlx_infer/heads.py b/mlx_infer/heads.py
index 5187ebf..2f4b15f 100644
--- a/mlx_infer/heads.py
+++ b/mlx_infer/heads.py
@@ -297,36 +297,65 @@ def __call__(self, x: mx.array,
         return x
 
 
+_bilinear_grid_cache: Dict[tuple, tuple] = {}
+
 def _bilinear(x: mx.array, hw: Tuple[int, int], align_corners: bool = True) -> mx.array:
-    """Bilinear resize [B, H, W, C] → [B, h, w, C]."""
+    """Bilinear resize [B, H, W, C] → [B, h, w, C].
+
+    Indices and weights are precomputed with numpy on first call for each
+    (H, W, h, w) pair and cached as concrete MLX arrays, avoiding redundant
+    lazy-graph nodes on every frame and halving the number of gather ops via
+    2D broadcast indexing.
+    """
     B, H, W, C = x.shape
     h, w = hw
     if (h, w) == (H, W):
         return x
 
-    if align_corners:
-        y_src = mx.arange(h, dtype=mx.float32) * ((H - 1) / max(h - 1, 1))
-        x_src = mx.arange(w, dtype=mx.float32) * ((W - 1) / max(w - 1, 1))
-    else:
-        y_src = (mx.arange(h, dtype=mx.float32) + 0.5) * (H / h) - 0.5
-        x_src = (mx.arange(w, dtype=mx.float32) + 0.5) * (W / w) - 0.5
-
-    y0 = mx.clip(mx.floor(y_src).astype(mx.int32), 0, H - 1)
-    y1 = mx.clip(y0 + 1, 0, H - 1)
-    x0 = mx.clip(mx.floor(x_src).astype(mx.int32), 0, W - 1)
-    x1 = mx.clip(x0 + 1, 0, W - 1)
-
-    wy1 = (y_src - mx.floor(y_src)).reshape(1, h, 1, 1).astype(x.dtype)
-    wx1 = (x_src - mx.floor(x_src)).reshape(1, 1, w, 1).astype(x.dtype)
-    wy0 = 1.0 - wy1
-    wx0 = 1.0 - wx1
-
-    q00 = x[:, y0, :, :][:, :, x0, :]   # [B, h, w, C]
-    q01 = x[:, y0, :, :][:, :, x1, :]
-    q10 = x[:, y1, :, :][:, :, x0, :]
-    q11 = x[:, y1, :, :][:, :, x1, :]
-
-    return q00 * wy0 * wx0 + q01 * wy0 * wx1 + q10 * wy1 * wx0 + q11 * wy1 * wx1
+    key = (H, W, h, w, align_corners)
+    if key not in _bilinear_grid_cache:
+        if align_corners:
+            y_src = np.arange(h, dtype=np.float32) * ((H - 1) / max(h - 1, 1))
+            x_src = np.arange(w, dtype=np.float32) * ((W - 1) / max(w - 1, 1))
+        else:
+            y_src = (np.arange(h, dtype=np.float32) + 0.5) * (H / h) - 0.5
+            x_src = (np.arange(w, dtype=np.float32) + 0.5) * (W / w) - 0.5
+
+        y0 = np.clip(np.floor(y_src).astype(np.int32), 0, H - 1)
+        y1 = np.clip(y0 + 1, 0, H - 1)
+        x0 = np.clip(np.floor(x_src).astype(np.int32), 0, W - 1)
+        x1 = np.clip(x0 + 1, 0, W - 1)
+
+        wy1 = (y_src - np.floor(y_src)).reshape(1, h, 1, 1).astype(np.float32)
+        wx1 = (x_src - np.floor(x_src)).reshape(1, 1, w, 1).astype(np.float32)
+
+        # Flat 1D indices into H*W — MLX supports 1D fancy indexing reliably.
+        # Precomputed as numpy; shape [h*w] so x.reshape(B, H*W, C)[:, flat, :]
+        # produces [B, h*w, C] with no intermediate [B, h, W, C] tensor.
+        flat_00 = (y0[:, None] * W + x0[None, :]).reshape(-1)
+        flat_01 = (y0[:, None] * W + x1[None, :]).reshape(-1)
+        flat_10 = (y1[:, None] * W + x0[None, :]).reshape(-1)
+        flat_11 = (y1[:, None] * W + x1[None, :]).reshape(-1)
+
+        _bilinear_grid_cache[key] = (
+            mx.array(flat_00), mx.array(flat_01),           # MLX int32 [h*w]
+            mx.array(flat_10), mx.array(flat_11),
+            mx.array(1.0 - wy1), mx.array(1.0 - wx1),      # wy0, wx0 as MLX float32
+            mx.array(wy1),        mx.array(wx1),             # wy1, wx1 as MLX float32
+        )
+
+    flat_00, flat_01, flat_10, flat_11, wy0, wx0, wy1, wx1 = _bilinear_grid_cache[key]
+
+    wy0 = wy0.astype(x.dtype); wx0 = wx0.astype(x.dtype)
+    wy1 = wy1.astype(x.dtype); wx1 = wx1.astype(x.dtype)
+
+    x_flat = x.reshape(B, H * W, C)
+    q00 = x_flat[:, flat_00, :].reshape(B, h, w, C)
+    q01 = x_flat[:, flat_01, :].reshape(B, h, w, C)
+    q10 = x_flat[:, flat_10, :].reshape(B, h, w, C)
+    q11 = x_flat[:, flat_11, :].reshape(B, h, w, C)
+
+    return (q00 * wy0 + q10 * wy1) * wx0 + (q01 * wy0 + q11 * wy1) * wx1
 
 
 # ---------------------------------------------------------------------------

From d88afdc00ed44acc898467a665557797b7d8b4d1 Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Wed, 29 Apr 2026 23:21:28 +0800
Subject: [PATCH 04/13] Add mx.compile steady-state fast path to
 CausalAttentionMLX

In steady state (cache full at scale_frames+sliding_window frames,
k_special capped at max_special_tokens), all tensor shapes are constant
per frame. mx.compile(_step) lets Metal reuse the same compiled program
every frame instead of recompiling the 24-block attention graph.

Conditions to enter compiled path:
- num_frame_per_block == 1 (single frame streaming)
- kv_cache not skip_append
- rope_cos provided (2D RoPE, the normal streaming path)
- keep_special=True and max_special_tokens set (requires --max-special-frames)
- kv_cache['k'].shape[2] == scale_frames + sliding_window (cache full)
- kv_cache['k_special'].shape[2] == max_special_tokens (special cap reached)

_make_steady_fn builds the compiled pure function lazily on first use.
It captures layer weights in the closure; all ops are pure MLX.

k_special rotation (constant shape): drop oldest patch_start_idx tokens,
append newly evicted frame's special tokens.

Verified: compiled path output matches manual reference with 0.0 abs diff.
---
 mlx_infer/layers.py | 70 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/mlx_infer/layers.py b/mlx_infer/layers.py
index ccc0fb2..b7dd311 100644
--- a/mlx_infer/layers.py
+++ b/mlx_infer/layers.py
@@ -187,12 +187,61 @@ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = True,
         self.scale_frames = scale_frames
         self.keep_special = keep_special
         self.max_special_tokens = max_special_tokens
+        self._steady_fn: Optional[Any] = None  # compiled steady-state step (built lazily)
 
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.proj = nn.Linear(dim, dim, bias=proj_bias)
         self.q_norm = LayerNorm(self.head_dim) if qk_norm else nn.Identity()
         self.k_norm = LayerNorm(self.head_dim) if qk_norm else nn.Identity()
 
+    def _make_steady_fn(self, tokens_per_frame: int, patch_start_idx: int):
+        """Compiled pure function for the steady-state forward pass.
+
+        Steady state: cache is exactly scale_frames+sliding_window frames,
+        k_special is exactly max_special_tokens tokens, single new frame,
+        no skip. All shapes are constant so Metal reuses the same program.
+        """
+        qkv_fn = self.qkv; proj_fn = self.proj
+        q_norm_fn = self.q_norm; k_norm_fn = self.k_norm
+        H = self.num_heads; D = self.head_dim; scale = self.scale
+        sf = self.scale_frames; sw = self.sliding_window; ps = patch_start_idx
+
+        def _step(x, k_cache, v_cache, k_special, v_special, rope_cos, rope_sin):
+            B, N, C = x.shape
+            qkv = qkv_fn(x).reshape(B, N, 3, H, D)
+            qkv = qkv.transpose(0, 2, 3, 1, 4)
+            q, k, v = qkv[:, 0], qkv[:, 1], qkv[:, 2]
+            q = q_norm_fn(q); k = k_norm_fn(k)
+            q, k = apply_rope_2d(q, k, rope_cos, rope_sin)
+
+            k_new = k.reshape(B, H, 1, tokens_per_frame, D)
+            v_new = v.reshape(B, H, 1, tokens_per_frame, D)
+            k_cat = mx.concatenate([k_cache, k_new], axis=2)  # [B, H, sf+sw+1, T, D]
+            v_cat = mx.concatenate([v_cache, v_new], axis=2)
+
+            # Evict exactly 1 frame at index sf; extract its special tokens
+            evict_k = k_cat[:, :, sf:sf + 1, :ps, :].reshape(B, H, ps, D)
+            evict_v = v_cat[:, :, sf:sf + 1, :ps, :].reshape(B, H, ps, D)
+
+            # Rotate k_special: drop oldest ps slots, append newest ps tokens
+            new_k_special = mx.concatenate([k_special[:, :, ps:], evict_k], axis=2)
+            new_v_special = mx.concatenate([v_special[:, :, ps:], evict_v], axis=2)
+
+            # Trim cache back to sf + sw frames
+            new_k_cache = mx.concatenate([k_cat[:, :, :sf], k_cat[:, :, -sw:]], axis=2)
+            new_v_cache = mx.concatenate([v_cat[:, :, :sf], v_cat[:, :, -sw:]], axis=2)
+
+            k_full = mx.concatenate(
+                [new_k_special, new_k_cache.reshape(B, H, -1, D)], axis=2)
+            v_full = mx.concatenate(
+                [new_v_special, new_v_cache.reshape(B, H, -1, D)], axis=2)
+
+            x_out = mx.fast.scaled_dot_product_attention(q, k_full, v_full, scale=scale)
+            x_out = x_out.transpose(0, 2, 1, 3).reshape(B, N, C)
+            return proj_fn(x_out), new_k_cache, new_v_cache, new_k_special, new_v_special
+
+        return mx.compile(_step)
+
     def __call__(self, x: mx.array,
                  kv_cache: Optional[Dict[str, Any]] = None,
                  num_frame_per_block: int = 1,
@@ -203,6 +252,27 @@ def __call__(self, x: mx.array,
         B, N, C = x.shape
         tokens_per_frame = N // num_frame_per_block
 
+        # --- Steady-state compiled fast path ---
+        # Conditions: single new frame, cache full, k_special at cap, no skip.
+        # All tensor shapes are constant → Metal reuses the compiled program.
+        if (kv_cache is not None and
+                not kv_cache.get("_skip_append", False) and
+                num_frame_per_block == 1 and
+                rope_cos is not None and
+                self.keep_special and
+                self.max_special_tokens is not None and
+                kv_cache.get("k") is not None and
+                kv_cache["k"].shape[2] == self.scale_frames + self.sliding_window and
+                kv_cache.get("k_special") is not None and
+                kv_cache["k_special"].shape[2] == self.max_special_tokens):
+            if self._steady_fn is None:
+                self._steady_fn = self._make_steady_fn(tokens_per_frame, patch_start_idx)
+            x_out, kv_cache["k"], kv_cache["v"], kv_cache["k_special"], kv_cache["v_special"] = \
+                self._steady_fn(x, kv_cache["k"], kv_cache["v"],
+                                kv_cache["k_special"], kv_cache["v_special"],
+                                rope_cos, rope_sin)
+            return x_out
+
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
         qkv = qkv.transpose(0, 2, 3, 1, 4)  # [B, 3, H, N, D]
         q, k, v = qkv[:, 0], qkv[:, 1], qkv[:, 2]

From 94961107ed9c6d78d9c0b73513d7e4f5bee83645 Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Thu, 30 Apr 2026 21:00:51 +0800
Subject: [PATCH 05/13] Reduce per-frame overhead: eliminate k_cat temp, cache
 RoPE, drop eval sync points
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three incremental optimizations that together give ~7% end-to-end speedup
(1.88fps → 2.01fps measured in component benchmark, sw=16 float16):

1. layers.py _make_steady_fn: eliminate 32 MB k_cat intermediate tensor
   k_cat = concat([k_cache, k_new]) was the largest intermediate allocation
   per steady-state attention call (32.5 MB). Replace with:
   - evict_k drawn directly from old k_cache at index sf (zero alloc)
   - new_k_cache = 3-way concat([k_cache[:sf], k_cache[sf+1:], k_new])
   Reduces concatenates 8→6 per block; 24 blocks × 32 MB = 768 MB less
   memory traffic per frame.

2. aggregator.py _get_rope: cache RoPE cos/sin tables
   Patch positions are purely a function of image resolution; cos/sin are
   identical for every streaming frame of the same size. Cache per
   (B, S, H, W, head_dim, dtype) key; subsequent frames get pre-evaluated,
   dtype-cast arrays with zero recompute.

3. aggregator.py __call__: remove 4 intermediate mx.eval sync points
   mx.eval(tokens_global) was called after output groups 4/11/17/23 to break
   the lazy graph into Metal-compilable segments. With compiled steady-state
   global blocks, the full 24-pair graph is small enough to evaluate lazily
   in one pass. Measured 7 ms/frame savings from fewer Metal→CPU roundtrips.

Correctness: confirmed by running full inference_streaming on 20 frames and
verifying early-frame outputs are bit-for-bit identical with and without the
compiled path.
---
 mlx_infer/aggregator.py | 36 ++++++++++++++++++++++++++----------
 mlx_infer/layers.py     | 16 ++++++++--------
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/mlx_infer/aggregator.py b/mlx_infer/aggregator.py
index 2192585..3af34b9 100644
--- a/mlx_infer/aggregator.py
+++ b/mlx_infer/aggregator.py
@@ -307,6 +307,26 @@ def _get_positions(self, B: int, S: int, H: int, W: int) -> mx.array:
         pos_special = mx.zeros((B * S, self.num_special_tokens, 2), dtype=mx.int32)
         return mx.concatenate([pos_special, pos], axis=1)  # [B*S, P, 2]
 
+    def _get_rope(self, B: int, S: int, H: int, W: int,
+                  head_dim: int, dtype) -> tuple:
+        """Return (cos, sin) RoPE tables, cached per (B, S, H, W, head_dim, dtype).
+
+        The patch positions are purely a function of the image grid, so they are
+        identical for every frame of the same resolution. Computing them once and
+        caching avoids repeated calls to get_cos_sin and two .astype() casts.
+        """
+        key = (B, S, H, W, head_dim, dtype)
+        if not hasattr(self, '_rope_cache'):
+            self._rope_cache: dict = {}
+        if key not in self._rope_cache:
+            pos = self._get_positions(B, S, H, W)
+            cos, sin = self.rope.get_cos_sin(None, pos, head_dim=head_dim)
+            cos = cos.astype(dtype)
+            sin = sin.astype(dtype)
+            mx.eval(cos, sin)
+            self._rope_cache[key] = (cos, sin)
+        return self._rope_cache[key]
+
     # ------------------------------------------------------------------
     # Special token preparation
     # ------------------------------------------------------------------
@@ -394,14 +414,9 @@ def __call__(
         tokens = mx.concatenate([special, patch_tokens], axis=1)  # [B*S, P, C]
         P = tokens.shape[1]
 
-        # ---- 2D RoPE positions ----
-        pos = self._get_positions(B, S, H, W)                # [B*S, P, 2]
+        # ---- 2D RoPE positions (cached — constant per image resolution) ----
         head_dim = self.embed_dim // self.num_heads          # 64 for ViT-L
-        cos, sin = self.rope.get_cos_sin(
-            None, pos, head_dim=head_dim)                    # [B*S, 1, P, 64]
-        # Cast to compute dtype so float16 Q/K aren't silently upcast by float32 tables.
-        cos = cos.astype(tokens.dtype)
-        sin = sin.astype(tokens.dtype)
+        cos, sin = self._get_rope(B, S, H, W, head_dim, tokens.dtype)
 
         # ---- Alternating frame / global attention ----
         output_list: List[mx.array] = []
@@ -446,9 +461,10 @@ def __call__(
             if selected_idx is None or group in selected_idx:
                 for fi, gi in zip(frame_outs, global_outs):
                     output_list.append(mx.concatenate([fi, gi], axis=-1))  # [B, S, P, 2C]
-                # Break the lazy graph: forces this segment to execute and lets the
-                # next segment compile independently (~6 blocks per segment).
-                mx.eval(tokens_global)
+                # Note: we no longer call mx.eval(tokens_global) here. With compiled
+                # steady-state global blocks the full 24-pair lazy graph is smaller
+                # than before, and a single eval via mx.eval(agg_list) in __call__
+                # is ~7ms faster than four intermediate Metal sync points.
 
         # Update frame counter (only on keyframe path, skip_append=False)
         if self._kv_cache is not None and not self._kv_cache[0].get("_skip_append", False):
diff --git a/mlx_infer/layers.py b/mlx_infer/layers.py
index b7dd311..b818ee9 100644
--- a/mlx_infer/layers.py
+++ b/mlx_infer/layers.py
@@ -216,20 +216,20 @@ def _step(x, k_cache, v_cache, k_special, v_special, rope_cos, rope_sin):
 
             k_new = k.reshape(B, H, 1, tokens_per_frame, D)
             v_new = v.reshape(B, H, 1, tokens_per_frame, D)
-            k_cat = mx.concatenate([k_cache, k_new], axis=2)  # [B, H, sf+sw+1, T, D]
-            v_cat = mx.concatenate([v_cache, v_new], axis=2)
 
-            # Evict exactly 1 frame at index sf; extract its special tokens
-            evict_k = k_cat[:, :, sf:sf + 1, :ps, :].reshape(B, H, ps, D)
-            evict_v = v_cat[:, :, sf:sf + 1, :ps, :].reshape(B, H, ps, D)
+            # Extract evicted frame's special tokens directly from the OLD cache at index sf.
+            # Avoids building the 32 MB k_cat = concat([k_cache, k_new]) intermediate tensor.
+            evict_k = k_cache[:, :, sf:sf + 1, :ps, :].reshape(B, H, ps, D)
+            evict_v = v_cache[:, :, sf:sf + 1, :ps, :].reshape(B, H, ps, D)
 
             # Rotate k_special: drop oldest ps slots, append newest ps tokens
             new_k_special = mx.concatenate([k_special[:, :, ps:], evict_k], axis=2)
             new_v_special = mx.concatenate([v_special[:, :, ps:], evict_v], axis=2)
 
-            # Trim cache back to sf + sw frames
-            new_k_cache = mx.concatenate([k_cat[:, :, :sf], k_cat[:, :, -sw:]], axis=2)
-            new_v_cache = mx.concatenate([v_cat[:, :, :sf], v_cat[:, :, -sw:]], axis=2)
+            # New cache: scale_frames kept + sliding[1:] + new frame (3-way concat).
+            # k_cache[:, :, sf+1:] = sw-1 old sliding frames; k_new = 1 new frame.
+            new_k_cache = mx.concatenate([k_cache[:, :, :sf], k_cache[:, :, sf + 1:], k_new], axis=2)
+            new_v_cache = mx.concatenate([v_cache[:, :, :sf], v_cache[:, :, sf + 1:], v_new], axis=2)
 
             k_full = mx.concatenate(
                 [new_k_special, new_k_cache.reshape(B, H, -1, D)], axis=2)

From 434526a9fdfdad89b1816d59f883b6526ef6c2b1 Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Thu, 30 Apr 2026 22:06:24 +0800
Subject: [PATCH 06/13] Remove redundant mx.eval(patch_tokens) sync; add
 steady-state bench tools
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With compiled steady-state global blocks, letting the backbone fuse
lazily into the full aggregator graph gives ~3ms improvement and
reduces Metal round-trips. Previous reasoning (break 72-block graph
into two segments) is no longer compelling.

bench_compile.py / bench_depth.py: per-component timing utilities
that require N_WARMUP>=27 (16 to fill sliding window + 10 to cap
k_special) to measure true steady-state performance.

Verified steady-state baseline: sw=16 float16 max-special-frames=10
  agg≈452ms  cam≈18ms  depth≈63ms  total≈534ms  → 1.87fps
Depth breakdown: refinenets=30ms, output_conv=28ms, resize=4ms
(bilinear+conv dominated, ConvTranspose2d is not a bottleneck)
---
 mlx_infer/aggregator.py    |   3 -
 mlx_infer/bench_compile.py | 120 +++++++++++++++++++++++++++++++++++++
 mlx_infer/bench_depth.py   | 107 +++++++++++++++++++++++++++++++++
 3 files changed, 227 insertions(+), 3 deletions(-)
 create mode 100644 mlx_infer/bench_compile.py
 create mode 100644 mlx_infer/bench_depth.py

diff --git a/mlx_infer/aggregator.py b/mlx_infer/aggregator.py
index 3af34b9..5919901 100644
--- a/mlx_infer/aggregator.py
+++ b/mlx_infer/aggregator.py
@@ -401,9 +401,6 @@ def __call__(
 
         # ---- DINOv2 patch embedding ----
         patch_tokens = self.patch_embed(imgs)                # [B*S, N_patch, C]
-        # Materialise the backbone before frame/global blocks so MLX compiles
-        # two smaller subgraphs rather than one 72-block graph per frame.
-        mx.eval(patch_tokens)
         C = patch_tokens.shape[-1]
 
         # ---- Special tokens ----
diff --git a/mlx_infer/bench_compile.py b/mlx_infer/bench_compile.py
new file mode 100644
index 0000000..5188c26
--- /dev/null
+++ b/mlx_infer/bench_compile.py
@@ -0,0 +1,120 @@
+"""
+Benchmark: measure per-component timing before/after backbone mx.compile.
+Run with:
+    conda run -n lingbot python mlx_infer/bench_compile.py \
+        --checkpoint /Users/dan/.cache/modelscope/hub/models/Robbyant/lingbot-map/lingbot-map-long.pt \
+        --images example/courthouse
+"""
+import argparse, sys, time
+from pathlib import Path
+import numpy as np
+import mlx.core as mx
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from mlx_infer.model import GCTStreamMLX
+from mlx_infer.weights import load_checkpoint
+from mlx_infer.demo import _load_images_from_dir
+
+CKPT  = "/Users/dan/.cache/modelscope/hub/models/Robbyant/lingbot-map/lingbot-map-long.pt"
+IMGS  = "example/courthouse"
+SW    = 16
+SF    = 8
+MSF   = 10
+DTYPE = mx.float16
+N_WARMUP = 30   # need 16 (fill cache) + max_special_frames=10 (fill k_special cap) + buffer
+N_BENCH  = 40
+
+def tsync(arr):
+    mx.eval(arr)
+    return time.perf_counter()
+
+def build_model():
+    m = GCTStreamMLX(
+        img_size=518, patch_size=14, embed_dim=1024, depth=24, num_heads=16,
+        num_register_tokens=4,
+        kv_cache_sliding_window=SW,
+        kv_cache_scale_frames=SF,
+        kv_cache_max_special_frames=MSF,
+        camera_num_iterations=4,
+        enable_depth=True,
+        enable_point=False,
+    )
+    load_checkpoint(m, CKPT, verbose=False)
+    m.apply(lambda x: x.astype(mx.float16) if isinstance(x, mx.array) else x)
+    return m
+
+def run_bench(args):
+    model = build_model()
+
+    images_np = _load_images_from_dir(IMGS, img_size=518)[:SF + N_WARMUP + N_BENCH]
+    images = mx.array(images_np).astype(DTYPE)
+    print(f"Image shape: {images.shape}  dtype={images.dtype}")
+    print(f"KV cache: scale={SF} + sliding={SW}  max_special_frames={MSF}")
+
+    model.clean_kv_cache()
+
+    # Phase 1
+    scale_out = model(
+        images[None, :SF],
+        num_frame_for_scale=SF,
+        num_frame_per_block=SF,
+        causal_inference=True,
+    )
+    mx.eval(scale_out)
+    print("Phase 1 done")
+
+    # Warmup (fill cache to steady state)
+    for i in range(SF, SF + N_WARMUP):
+        out = model(images[None, i:i+1], num_frame_for_scale=SF,
+                    num_frame_per_block=1, causal_inference=True)
+        mx.eval(out)
+    print(f"Warmup ({N_WARMUP} frames) done, measuring {N_BENCH} frames...")
+
+    # Benchmark loop
+    agg_ms = []; cam_ms = []; depth_ms = []; total_ms = []
+
+    for i in range(SF + N_WARMUP, SF + N_WARMUP + N_BENCH):
+        frame = images[None, i:i+1]
+
+        t0 = time.perf_counter()
+        agg_list, psi = model.aggregator(
+            frame, selected_idx=[4, 11, 17, 23],
+            num_frame_for_scale=SF, num_frame_per_block=1,
+        )
+        t1 = tsync(agg_list)
+
+        pose_list = model.camera_head(
+            agg_list, causal_inference=True,
+            num_frame_per_block=1, num_frame_for_scale=SF,
+        )
+        t2 = tsync(pose_list[-1])
+
+        depth, dconf = model.depth_head(agg_list, frame, psi)
+        t3 = tsync(depth)
+
+        agg_ms.append((t1-t0)*1e3)
+        cam_ms.append((t2-t1)*1e3)
+        depth_ms.append((t3-t2)*1e3)
+        total_ms.append((t3-t0)*1e3)
+
+    def stats(name, arr):
+        a = np.array(arr)
+        print(f"  {name:8s}  mean={a.mean():.1f}ms  min={a.min():.1f}ms  std={a.std():.1f}ms")
+
+    print(f"\n--- Steady-state component breakdown ({N_BENCH} frames) ---")
+    stats("agg",   agg_ms)
+    stats("cam",   cam_ms)
+    stats("depth", depth_ms)
+    stats("total", total_ms)
+    fps = 1000 / np.mean(total_ms)
+    print(f"  => {fps:.2f} fps")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint", default=CKPT)
+    parser.add_argument("--images", default=IMGS)
+    args = parser.parse_args()
+    CKPT = args.checkpoint
+    IMGS = args.images
+    run_bench(args)
diff --git a/mlx_infer/bench_depth.py b/mlx_infer/bench_depth.py
new file mode 100644
index 0000000..5170509
--- /dev/null
+++ b/mlx_infer/bench_depth.py
@@ -0,0 +1,107 @@
+"""Profile depth head component timing in isolation."""
+import sys, time
+from pathlib import Path
+import numpy as np
+import mlx.core as mx
+import mlx.nn as nn
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from mlx_infer.model import GCTStreamMLX
+from mlx_infer.weights import load_checkpoint
+from mlx_infer.demo import _load_images_from_dir
+from mlx_infer.heads import _bilinear, activate_head
+
+CKPT = "/Users/dan/.cache/modelscope/hub/models/Robbyant/lingbot-map/lingbot-map-long.pt"
+SW, SF, MSF = 16, 8, 10
+N_WU, N_B = 30, 20
+
+model = GCTStreamMLX(
+    img_size=518, patch_size=14, embed_dim=1024, depth=24, num_heads=16,
+    num_register_tokens=4, kv_cache_sliding_window=SW, kv_cache_scale_frames=SF,
+    kv_cache_max_special_frames=MSF, camera_num_iterations=4,
+    enable_depth=True, enable_point=False,
+)
+load_checkpoint(model, CKPT, verbose=False)
+model.apply(lambda x: x.astype(mx.float16) if isinstance(x, mx.array) else x)
+print("Model loaded")
+
+imgs_np = _load_images_from_dir("example/courthouse", img_size=518)[:SF + N_WU + N_B]
+imgs = mx.array(imgs_np).astype(mx.float16)
+
+model.clean_kv_cache()
+scale_out = model(imgs[None,:SF], num_frame_for_scale=SF, num_frame_per_block=SF, causal_inference=True)
+mx.eval(scale_out); del scale_out
+
+for i in range(SF, SF+N_WU):
+    out = model(imgs[None,i:i+1], num_frame_for_scale=SF, num_frame_per_block=1, causal_inference=True)
+    mx.eval(out)
+print(f"Warmup done ({N_WU} frames)")
+
+dh = model.depth_head
+B, S, H, W = 1, 1, 294, 518
+patch_h, patch_w, psi = H//14, W//14, model.aggregator.patch_start_idx
+
+buckets = {k: [] for k in ['norm+proj+posemb', 'resize', 'scratch_rn', 'refinenets', 'output_conv', 'total']}
+
+for i in range(SF+N_WU, SF+N_WU+N_B):
+    frame = imgs[None, i:i+1]
+    agg_list, _ = model.aggregator(frame, selected_idx=[4,11,17,23],
+                                   num_frame_for_scale=SF, num_frame_per_block=1)
+    mx.eval(agg_list)
+
+    def ts(a):
+        mx.eval(a); return time.perf_counter()
+
+    # ---- norm + project + pos_embed (4 levels) ----
+    t0 = time.perf_counter()
+    out_feats = []
+    for level, li in enumerate([0,1,2,3]):
+        x = agg_list[li][:,:,psi:].reshape(B*S, patch_h*patch_w, -1)
+        x = dh.norm(x).reshape(B*S, patch_h, patch_w, -1)
+        x = dh.projects[level](x)
+        x = dh._apply_pos_embed(x, W, H)
+        out_feats.append(x)
+    t1 = ts(out_feats)
+
+    # ---- resize (ConvTranspose2d × 2 + Conv2d × 1) ----
+    r0 = dh.resize_conv0(out_feats[0])
+    r1 = dh.resize_conv1(out_feats[1])
+    r2 = out_feats[2]
+    r3 = dh.resize_conv3(out_feats[3])
+    t2 = ts([r0, r1, r3])
+
+    # ---- scratch layer_rn (4 × 3×3 Conv2d) ----
+    l1 = dh.layer1_rn(r0); l2 = dh.layer2_rn(r1)
+    l3 = dh.layer3_rn(r2); l4 = dh.layer4_rn(r3)
+    t3 = ts([l1, l2, l3, l4])
+
+    # ---- refinenets (4 FeatureFusionBlocks) ----
+    o = dh.refinenet4(l4, target_hw=(l3.shape[1], l3.shape[2]))
+    o = dh.refinenet3(o, skip=l3, target_hw=(l2.shape[1], l2.shape[2]))
+    o = dh.refinenet2(o, skip=l2, target_hw=(l1.shape[1], l1.shape[2]))
+    o = dh.refinenet1(o, skip=l1)
+    t4 = ts(o)
+
+    # ---- output convolutions + final bilinear ----
+    o = dh.output_conv1(o)
+    o = _bilinear(o, (patch_h*14, patch_w*14))
+    o = dh._apply_pos_embed(o, W, H)
+    o = nn.relu(dh.output_conv2a(o))
+    o = dh.output_conv2b(o)
+    o, _ = activate_head(o, dh.activation, dh.conf_activation)
+    t5 = ts(o)
+
+    model.camera_head(agg_list, causal_inference=True, num_frame_per_block=1, num_frame_for_scale=SF)
+
+    buckets['norm+proj+posemb'].append((t1-t0)*1e3)
+    buckets['resize'].append((t2-t1)*1e3)
+    buckets['scratch_rn'].append((t3-t2)*1e3)
+    buckets['refinenets'].append((t4-t3)*1e3)
+    buckets['output_conv'].append((t5-t4)*1e3)
+    buckets['total'].append((t5-t0)*1e3)
+
+print("\n--- Depth head breakdown ---")
+for k, v in buckets.items():
+    a = np.array(v)
+    print(f"  {k:20s}  mean={a.mean():.1f}ms  min={a.min():.1f}ms  std={a.std():.1f}ms")

From b22fd9574b56ad5e58b4a00de2bde911da69ffac Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Thu, 30 Apr 2026 22:15:33 +0800
Subject: [PATCH 07/13] Remove mx.eval(agg_list) mid-forward sync; add
 bench_e2e.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aggregator, camera, and depth head lazy graphs now fuse into one
graph evaluated at mx.eval(frame_out). Removes a Metal CPU round-trip
between aggregator and heads; MLX lazy evaluation correctly tracks
all data dependencies transitively.

bench_e2e.py: end-to-end per-frame timing through model() call.

Measured improvement (sw=16 float16 steady-state, N=40):
  Before: 534ms → 1.87 fps  (agg eval forced mid-forward)
  After:  527ms → 1.90 fps  (fully fused, lower std too)
---
 mlx_infer/bench_e2e.py | 48 ++++++++++++++++++++++++++++++++++++++++++
 mlx_infer/model.py     |  1 -
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 mlx_infer/bench_e2e.py

diff --git a/mlx_infer/bench_e2e.py b/mlx_infer/bench_e2e.py
new file mode 100644
index 0000000..f31e8e6
--- /dev/null
+++ b/mlx_infer/bench_e2e.py
@@ -0,0 +1,48 @@
+"""End-to-end per-frame timing through model() — measures the fused aggregator+camera+depth graph."""
+import sys, time
+from pathlib import Path
+import numpy as np
+import mlx.core as mx
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from mlx_infer.model import GCTStreamMLX
+from mlx_infer.weights import load_checkpoint
+from mlx_infer.demo import _load_images_from_dir
+
+CKPT = "/Users/dan/.cache/modelscope/hub/models/Robbyant/lingbot-map/lingbot-map-long.pt"
+SW, SF, MSF = 16, 8, 10
+N_WU, N_B = 30, 40
+
+model = GCTStreamMLX(
+    img_size=518, patch_size=14, embed_dim=1024, depth=24, num_heads=16,
+    num_register_tokens=4, kv_cache_sliding_window=SW, kv_cache_scale_frames=SF,
+    kv_cache_max_special_frames=MSF, camera_num_iterations=4,
+    enable_depth=True, enable_point=False,
+)
+load_checkpoint(model, CKPT, verbose=False)
+model.apply(lambda x: x.astype(mx.float16) if isinstance(x, mx.array) else x)
+print("Model loaded")
+
+imgs_np = _load_images_from_dir("example/courthouse", img_size=518)[:SF + N_WU + N_B]
+imgs = mx.array(imgs_np).astype(mx.float16)
+
+model.clean_kv_cache()
+s = model(imgs[None,:SF], num_frame_for_scale=SF, num_frame_per_block=SF, causal_inference=True)
+mx.eval(s); del s
+
+for i in range(SF, SF+N_WU):
+    o = model(imgs[None,i:i+1], num_frame_for_scale=SF, num_frame_per_block=1, causal_inference=True)
+    mx.eval(o)
+print(f"Warmup ({N_WU} frames) done")
+
+times = []
+for i in range(SF+N_WU, SF+N_WU+N_B):
+    t0 = time.perf_counter()
+    o = model(imgs[None,i:i+1], num_frame_for_scale=SF, num_frame_per_block=1, causal_inference=True)
+    mx.eval(o)
+    times.append((time.perf_counter() - t0) * 1e3)
+
+a = np.array(times)
+print(f"\n--- End-to-end (N={N_B}, sw={SW}) ---")
+print(f"  mean={a.mean():.1f}ms  min={a.min():.1f}ms  std={a.std():.1f}ms  => {1000/a.mean():.2f} fps")
diff --git a/mlx_infer/model.py b/mlx_infer/model.py
index b9ec7a0..59ac702 100644
--- a/mlx_infer/model.py
+++ b/mlx_infer/model.py
@@ -132,7 +132,6 @@ def __call__(
             num_frame_for_scale=num_frame_for_scale,
             num_frame_per_block=num_frame_per_block,
         )
-        mx.eval(agg_list)   # materialise before heads
 
         predictions: Dict[str, mx.array] = {}
 

From 245c570e22d1e24759823be1a39eb6adb3366ad5 Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Thu, 30 Apr 2026 22:46:40 +0800
Subject: [PATCH 08/13] Update demo.py help text: document scale-frames as
 speed lever, fix kv-sliding-window token count

---
 mlx_infer/demo.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mlx_infer/demo.py b/mlx_infer/demo.py
index ba7c543..6216844 100644
--- a/mlx_infer/demo.py
+++ b/mlx_infer/demo.py
@@ -143,13 +143,16 @@ def main():
     parser.add_argument("--img-size", type=int, default=518,
                         help="Resize images to this square size")
     parser.add_argument("--scale-frames", type=int, default=8,
-                        help="Number of initial scale frames (Phase 1)")
+                        help="Number of initial scale frames (Phase 1). "
+                             "Also a speed lever: reduces permanent KV cache from "
+                             "(sf+sw)*tokens to fewer keys. sf=4 saves ~16%%, sf=1 saves ~29%% "
+                             "vs sf=8 (with sw=16, 294x518px).")
     parser.add_argument("--keyframe-interval", type=int, default=1,
                         help="Keyframe interval (1 = every frame)")
     parser.add_argument("--kv-sliding-window", type=int, default=64,
                         help="KV-cache sliding window in frames. "
-                             "Dominant cost is 24 attn blocks x window*1375 keys per frame. "
-                             "sw=64 (default/accurate), sw=16 (~2.5x faster), sw=8 (~3.3x faster)")
+                             "Dominant cost is 24 attn blocks × (sf+sw)*783 keys (294×518px). "
+                             "sw=64 (default/accurate), sw=16 (~1.9 fps), sw=8 (~2.1 fps) with float16.")
     parser.add_argument("--max-special-frames", type=int, default=None,
                         help="Cap k_special (evicted frames' tokens) at this many frames. "
                              "k_special grows 6 tokens/frame indefinitely; for sequences "

From bf4221d9c4619eebe1699de34a3d3f5b98db92ee Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Thu, 30 Apr 2026 22:49:06 +0800
Subject: [PATCH 09/13] Fix tokens_per_frame to use actual image dimensions
 (was assuming square)

---
 mlx_infer/demo.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mlx_infer/demo.py b/mlx_infer/demo.py
index 6216844..173b116 100644
--- a/mlx_infer/demo.py
+++ b/mlx_infer/demo.py
@@ -176,9 +176,6 @@ def main():
 
     # ---- Build model ----
     print("Building GCTStreamMLX model...")
-    tokens_per_frame = (args.img_size // 14) ** 2 + 6
-    print(f"KV-cache: scale={args.scale_frames} + sliding={args.kv_sliding_window} frames "
-          f"= {(args.scale_frames + args.kv_sliding_window) * tokens_per_frame:,} keys/block")
     model = GCTStreamMLX(
         img_size=args.img_size,
         patch_size=14,
@@ -201,7 +198,11 @@ def main():
     # ---- Load images ----
     print(f"Loading images from: {args.images}")
     images_np = _load_images_from_dir(args.images, img_size=args.img_size)
-    print(f"Loaded {images_np.shape[0]} frames at {images_np.shape[2]}×{images_np.shape[3]}")
+    H, W = images_np.shape[2], images_np.shape[3]
+    tokens_per_frame = (H // 14) * (W // 14) + 6
+    print(f"Loaded {images_np.shape[0]} frames at {H}×{W}")
+    print(f"KV-cache: scale={args.scale_frames} + sliding={args.kv_sliding_window} frames "
+          f"= {(args.scale_frames + args.kv_sliding_window) * tokens_per_frame:,} keys/block")
 
     if args.max_frames is not None:
         images_np = images_np[:args.max_frames]

From e9733ce6670da4bc90650d28300b7c2bb870159a Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Thu, 30 Apr 2026 22:55:22 +0800
Subject: [PATCH 10/13] Correct scale-frames help text; add CLI args to
 bench_e2e.py; fix tokens_per_frame for non-square images

---
 mlx_infer/bench_e2e.py | 83 ++++++++++++++++++++++++------------------
 mlx_infer/demo.py      |  6 +--
 2 files changed, 50 insertions(+), 39 deletions(-)

diff --git a/mlx_infer/bench_e2e.py b/mlx_infer/bench_e2e.py
index f31e8e6..c5e7b9a 100644
--- a/mlx_infer/bench_e2e.py
+++ b/mlx_infer/bench_e2e.py
@@ -1,5 +1,5 @@
 """End-to-end per-frame timing through model() — measures the fused aggregator+camera+depth graph."""
-import sys, time
+import argparse, sys, time
 from pathlib import Path
 import numpy as np
 import mlx.core as mx
@@ -11,38 +11,49 @@
 from mlx_infer.demo import _load_images_from_dir
 
 CKPT = "/Users/dan/.cache/modelscope/hub/models/Robbyant/lingbot-map/lingbot-map-long.pt"
-SW, SF, MSF = 16, 8, 10
-N_WU, N_B = 30, 40
-
-model = GCTStreamMLX(
-    img_size=518, patch_size=14, embed_dim=1024, depth=24, num_heads=16,
-    num_register_tokens=4, kv_cache_sliding_window=SW, kv_cache_scale_frames=SF,
-    kv_cache_max_special_frames=MSF, camera_num_iterations=4,
-    enable_depth=True, enable_point=False,
-)
-load_checkpoint(model, CKPT, verbose=False)
-model.apply(lambda x: x.astype(mx.float16) if isinstance(x, mx.array) else x)
-print("Model loaded")
-
-imgs_np = _load_images_from_dir("example/courthouse", img_size=518)[:SF + N_WU + N_B]
-imgs = mx.array(imgs_np).astype(mx.float16)
-
-model.clean_kv_cache()
-s = model(imgs[None,:SF], num_frame_for_scale=SF, num_frame_per_block=SF, causal_inference=True)
-mx.eval(s); del s
-
-for i in range(SF, SF+N_WU):
-    o = model(imgs[None,i:i+1], num_frame_for_scale=SF, num_frame_per_block=1, causal_inference=True)
-    mx.eval(o)
-print(f"Warmup ({N_WU} frames) done")
-
-times = []
-for i in range(SF+N_WU, SF+N_WU+N_B):
-    t0 = time.perf_counter()
-    o = model(imgs[None,i:i+1], num_frame_for_scale=SF, num_frame_per_block=1, causal_inference=True)
-    mx.eval(o)
-    times.append((time.perf_counter() - t0) * 1e3)
-
-a = np.array(times)
-print(f"\n--- End-to-end (N={N_B}, sw={SW}) ---")
-print(f"  mean={a.mean():.1f}ms  min={a.min():.1f}ms  std={a.std():.1f}ms  => {1000/a.mean():.2f} fps")
+
+def run(sw, sf, msf, n_wu, n_b, ckpt, imgs_dir):
+    model = GCTStreamMLX(
+        img_size=518, patch_size=14, embed_dim=1024, depth=24, num_heads=16,
+        num_register_tokens=4, kv_cache_sliding_window=sw, kv_cache_scale_frames=sf,
+        kv_cache_max_special_frames=msf, camera_num_iterations=4,
+        enable_depth=True, enable_point=False,
+    )
+    load_checkpoint(model, ckpt, verbose=False)
+    model.apply(lambda x: x.astype(mx.float16) if isinstance(x, mx.array) else x)
+    print(f"Model loaded  sw={sw} sf={sf} msf={msf}")
+
+    imgs_np = _load_images_from_dir(imgs_dir, img_size=518)[:sf + n_wu + n_b]
+    imgs = mx.array(imgs_np).astype(mx.float16)
+
+    model.clean_kv_cache()
+    s = model(imgs[None,:sf], num_frame_for_scale=sf, num_frame_per_block=sf, causal_inference=True)
+    mx.eval(s); del s
+
+    for i in range(sf, sf + n_wu):
+        o = model(imgs[None,i:i+1], num_frame_for_scale=sf, num_frame_per_block=1, causal_inference=True)
+        mx.eval(o)
+    print(f"Warmup ({n_wu} frames) done")
+
+    times = []
+    for i in range(sf + n_wu, sf + n_wu + n_b):
+        t0 = time.perf_counter()
+        o = model(imgs[None,i:i+1], num_frame_for_scale=sf, num_frame_per_block=1, causal_inference=True)
+        mx.eval(o)
+        times.append((time.perf_counter() - t0) * 1e3)
+
+    a = np.array(times)
+    print(f"\n--- End-to-end (N={n_b}, sw={sw}, sf={sf}) ---")
+    print(f"  mean={a.mean():.1f}ms  min={a.min():.1f}ms  std={a.std():.1f}ms  => {1000/a.mean():.2f} fps")
+
+if __name__ == "__main__":
+    p = argparse.ArgumentParser()
+    p.add_argument("--sw",   type=int, default=16)
+    p.add_argument("--sf",   type=int, default=8)
+    p.add_argument("--msf",  type=int, default=10)
+    p.add_argument("--n-wu", type=int, default=30)
+    p.add_argument("--n-b",  type=int, default=40)
+    p.add_argument("--checkpoint", default=CKPT)
+    p.add_argument("--images", default="example/courthouse")
+    args = p.parse_args()
+    run(args.sw, args.sf, args.msf, args.n_wu, args.n_b, args.checkpoint, args.images)
diff --git a/mlx_infer/demo.py b/mlx_infer/demo.py
index 173b116..0d97428 100644
--- a/mlx_infer/demo.py
+++ b/mlx_infer/demo.py
@@ -144,9 +144,9 @@ def main():
                         help="Resize images to this square size")
     parser.add_argument("--scale-frames", type=int, default=8,
                         help="Number of initial scale frames (Phase 1). "
-                             "Also a speed lever: reduces permanent KV cache from "
-                             "(sf+sw)*tokens to fewer keys. sf=4 saves ~16%%, sf=1 saves ~29%% "
-                             "vs sf=8 (with sw=16, 294x518px).")
+                             "Minor speed lever: sf=1 gives ~10%% speedup vs sf=8 "
+                             "(2.09→1.90 fps, sw=16 float16); sf=4 is negligible. "
+                             "Lower values reduce initialization quality.")
     parser.add_argument("--keyframe-interval", type=int, default=1,
                         help="Keyframe interval (1 = every frame)")
     parser.add_argument("--kv-sliding-window", type=int, default=64,

From 8a28c3628561f5e43166cc69b4716bb9a4100a5a Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Thu, 30 Apr 2026 23:04:43 +0800
Subject: [PATCH 11/13] Update help text: document sw+sf interaction, add
 combined speedup examples

---
 mlx_infer/demo.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mlx_infer/demo.py b/mlx_infer/demo.py
index 0d97428..12385a0 100644
--- a/mlx_infer/demo.py
+++ b/mlx_infer/demo.py
@@ -144,15 +144,17 @@ def main():
                         help="Resize images to this square size")
     parser.add_argument("--scale-frames", type=int, default=8,
                         help="Number of initial scale frames (Phase 1). "
-                             "Minor speed lever: sf=1 gives ~10%% speedup vs sf=8 "
-                             "(2.09→1.90 fps, sw=16 float16); sf=4 is negligible. "
-                             "Lower values reduce initialization quality.")
+                             "Speed lever: KV cache holds (sf+sw)*783 keys; reducing sf "
+                             "compounds with --kv-sliding-window. sf=1+sw=4 → 2.82fps, "
+                             "sf=1+sw=1 → 3.0fps (vs sf=8+sw=16 → 1.9fps, float16). "
+                             "Lower sf reduces initialization quality.")
     parser.add_argument("--keyframe-interval", type=int, default=1,
                         help="Keyframe interval (1 = every frame)")
     parser.add_argument("--kv-sliding-window", type=int, default=64,
-                        help="KV-cache sliding window in frames. "
-                             "Dominant cost is 24 attn blocks × (sf+sw)*783 keys (294×518px). "
-                             "sw=64 (default/accurate), sw=16 (~1.9 fps), sw=8 (~2.1 fps) with float16.")
+                        help="KV-cache sliding window in frames. Speed/quality tradeoff "
+                             "(float16, 294×518px, sf=8): sw=64 accurate, sw=16 ~1.9fps, "
+                             "sw=8 ~2.1fps, sw=4 ~2.4fps. Compounds with --scale-frames: "
+                             "sf=1+sw=4 → 2.82fps, sf=1+sw=1 → 3.0fps.")
     parser.add_argument("--max-special-frames", type=int, default=None,
                         help="Cap k_special (evicted frames' tokens) at this many frames. "
                              "k_special grows 6 tokens/frame indefinitely; for sequences "

From 47155db56dfcfccdb72f9f432005b8a47e8b3052 Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Fri, 1 May 2026 09:41:13 +0800
Subject: [PATCH 12/13] Add mlx optional dependency extra and guard import with
 clear error

---
 mlx_infer/__init__.py | 7 +++++++
 pyproject.toml        | 1 +
 2 files changed, 8 insertions(+)

diff --git a/mlx_infer/__init__.py b/mlx_infer/__init__.py
index 6732827..f93a097 100644
--- a/mlx_infer/__init__.py
+++ b/mlx_infer/__init__.py
@@ -1,5 +1,12 @@
 """MLX inference package for GCTStream on Apple Silicon."""
 
+try:
+    import mlx.core  # noqa: F401
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError(
+        "mlx_infer requires MLX. Install it with: pip install lingbot-map[mlx]"
+    ) from e
+
 from .model import GCTStreamMLX
 from .weights import load_checkpoint
 
diff --git a/pyproject.toml b/pyproject.toml
index 29c6fb8..6dca29f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
 [project.optional-dependencies]
 vis = ["viser>=0.2.23", "trimesh", "matplotlib", "onnxruntime", "requests"]
 demo = ["lingbot-map[vis]"]
+mlx = ["mlx==0.31.2", "mlx-metal==0.31.2"]
 
 [build-system]
 requires = ["setuptools>=61.0", "wheel"]

From 52edc606704263935b481284150c1905b2e62978 Mon Sep 17 00:00:00 2001
From: Dan <dan@DandeMacBook-Air.local>
Date: Fri, 1 May 2026 09:44:53 +0800
Subject: [PATCH 13/13] Add mlx_infer README

---
 mlx_infer/README.md | 112 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 mlx_infer/README.md

diff --git a/mlx_infer/README.md b/mlx_infer/README.md
new file mode 100644
index 0000000..4c3631e
--- /dev/null
+++ b/mlx_infer/README.md
@@ -0,0 +1,112 @@
+# mlx_infer
+
+MLX inference backend for GCTStream on Apple Silicon. Runs the full streaming 3D reconstruction pipeline (pose + depth) natively on Metal via the unified memory architecture, eliminating the Metal OOM accumulation that occurs with the PyTorch backend on long sequences.
+
+## Requirements
+
+- Apple Silicon Mac (M1 or later)
+- Python 3.10+
+- A PyTorch `.pt` checkpoint (same file used by the main `demo.py`)
+
+## Installation
+
+```bash
+pip install lingbot-map[mlx]
+```
+
+This installs `mlx==0.31.2` and `mlx-metal==0.31.2`. The rest of the dependencies (`torch`, `numpy`, etc.) are part of the base install.
+
+## Demo
+
+```bash
+python mlx_infer/demo.py \
+    --checkpoint /path/to/lingbot-map-long.pt \
+    --images     /path/to/image_dir \
+    --output     out.npz
+```
+
+Images are loaded from the directory in sorted order (`.jpg` and `.png`). The script resizes each frame to `--img-size` width (default 518), rounds height to the nearest 14-pixel multiple, and center-crops if the result is taller than `--img-size`.
+
+### Key arguments
+
+| Argument | Default | Description |
+|---|---|---|
+| `--scale-frames` | 8 | Phase 1 bidirectional frames. Lower = faster, lower initialization quality. |
+| `--kv-sliding-window` | 64 | Sliding window size for KV cache eviction. Lower = faster, shorter temporal context. |
+| `--dtype` | `float32` | `float16` is ~2× faster and recommended for speed benchmarks. |
+| `--max-frames` | all | Truncate the sequence to N frames. |
+| `--keyframe-interval` | 1 | Only append every Nth frame to KV cache (1 = every frame). |
+| `--max-special-frames` | none | Cap the number of evicted-frame tokens retained. Set to ~100 for sequences >300 frames with a small sliding window. |
+| `--no-vis` | off | Skip the viser 3D viewer; just save the `.npz`. |
+
+### Speed vs quality tradeoffs (float16, 294×518 px)
+
+| `--scale-frames` | `--kv-sliding-window` | fps |
+|---|---|---|
+| 8 | 64 | ~1.5 |
+| 8 | 16 | ~1.9 |
+| 8 | 8 | ~2.1 |
+| 8 | 4 | ~2.4 |
+| 1 | 4 | ~2.8 |
+| 1 | 1 | ~3.0 |
+
+## Streaming inference API
+
+```python
+from mlx_infer import GCTStreamMLX, load_checkpoint
+import mlx.core as mx
+import numpy as np
+
+model = GCTStreamMLX(
+    kv_cache_sliding_window=16,
+    kv_cache_scale_frames=8,
+    enable_depth=True,
+    enable_point=False,
+)
+load_checkpoint(model, "lingbot-map-long.pt")
+
+images = mx.array(np.load("frames.npy"))   # [S, 3, H, W] in [0, 1]
+predictions = model.inference_streaming(images, num_scale_frames=8)
+mx.eval(predictions)
+# keys: pose_enc, depth, depth_conf, images
+```
+
+`inference_streaming` runs two phases:
+
+1. **Scale phase** — the first `num_scale_frames` frames are processed together bidirectionally via a scale token, establishing the scene geometry baseline.
+2. **Streaming phase** — remaining frames are processed one at a time with a sliding KV cache, enabling unbounded sequences at bounded memory.
+
+## Benchmarking
+
+```bash
+# End-to-end per-frame timing
+python mlx_infer/bench_e2e.py \
+    --checkpoint /path/to/checkpoint.pt \
+    --images     /path/to/images \
+    --sw 16 --sf 8 --msf 10
+
+# Depth head component breakdown
+python mlx_infer/bench_depth.py
+
+# MLX graph compile timing
+python mlx_infer/bench_compile.py
+```
+
+## Checkpoint loading
+
+`load_checkpoint` converts a PyTorch `.pt` checkpoint to MLX in-memory — no separate conversion step is needed. Weight remapping handles the structural differences between the PyTorch and MLX model definitions (Conv2d/ConvTranspose2d axis permutations, renamed submodules).
+
+## Output format
+
+`inference_streaming` returns an `mx.array` dict:
+
+| Key | Shape | Description |
+|---|---|---|
+| `pose_enc` | `[B, S, 9]` | Encoded camera pose (FoV + rotation + translation) |
+| `depth` | `[B, S, H, W, 1]` | Metric depth |
+| `depth_conf` | `[B, S, H, W]` | Depth confidence |
+| `world_points` | `[B, S, H, W, 3]` | 3D world points (requires `enable_point=True`) |
+| `world_points_conf` | `[B, S, H, W]` | World point confidence |
+| `images` | `[B, S, 3, H, W]` | Input images (pass-through) |
+
+Use `demo.postprocess()` to convert `pose_enc` to `(extrinsic, intrinsic)` numpy arrays compatible with `PointCloudViewer`.