diff --git a/.gitignore b/.gitignore
index 228607990..501b6c21d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,11 +6,13 @@
 /ds4_native
 /ds4_server_test
 /ds4_test
+/tests/test_jaccl_shim
 /ds4flash.gguf
 /TODO.md
 /gguf/
 *.o
 *.dSYM/
+/build/
 __pycache__/
 *.pyc
 /misc/
diff --git a/Makefile b/Makefile
index 27283ba0f..b7622d069 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
 CC ?= cc
+CXX ?= c++
 UNAME_S := $(shell uname -s)
 
 ifeq ($(UNAME_S),Darwin)
@@ -9,14 +10,31 @@ endif
 
 DEBUG_FLAGS ?= -g
 CFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c99
+CXXFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c++20
 OBJCFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc
 
 LDLIBS ?= -lm -pthread
 METAL_SRCS := $(wildcard metal/*.metal)
 
+# --- JACCL (opt-in: make JACCL=1) ---
+JACCL_SRC ?= $(HOME)/opensource/mlx/mlx/distributed/jaccl/lib
+JACCL_BUILD_DIR := build/jaccl
+JACCL_INCLUDE := $(JACCL_SRC)
+JACCL_LIB := $(JACCL_BUILD_DIR)/libjaccl.a
+
+ifeq ($(JACCL),1)
+JACCL_CFLAGS := -DDS4_JACCL
+JACCL_OBJS := jaccl_shim.o
+JACCL_LDLIBS := $(JACCL_LIB) -lc++
+else
+JACCL_CFLAGS :=
+JACCL_OBJS :=
+JACCL_LDLIBS :=
+endif
+
 ifeq ($(UNAME_S),Darwin)
-METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
-CORE_OBJS = ds4.o ds4_metal.o
+METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal $(JACCL_LDLIBS)
+CORE_OBJS = ds4.o ds4_metal.o $(JACCL_OBJS)
 CPU_CORE_OBJS = ds4_cpu.o
 else
 CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
@@ -41,6 +59,7 @@ all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
 help:
 	@echo "DS4 build targets:"
 	@echo "  make              Build Metal ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
+	@echo "  make JACCL=1      Build with JACCL distributed support (requires macOS SDK >= 26.2)"
 	@echo "  make cpu          Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
 	@echo "  make test         Build and run tests"
 	@echo "  make clean        Remove build outputs"
@@ -122,7 +141,7 @@ cuda-regression: tests/cuda_long_context_smoke
 endif
 
 ds4.o: ds4.c ds4.h ds4_gpu.h
-	$(CC) $(CFLAGS) -c -o $@ ds4.c
+	$(CC) $(CFLAGS) $(JACCL_CFLAGS) -c -o $@ ds4.c
 
 ds4_cli.o: ds4_cli.c ds4.h linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_cli.c
@@ -157,6 +176,23 @@ rax.o: rax.c rax.h rax_malloc.h
 linenoise.o: linenoise.c linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ linenoise.c
 
+# --- JACCL shim + static lib ---
+jaccl_shim.o: jaccl_shim.cpp jaccl_shim.h $(JACCL_LIB)
+	$(CXX) $(CXXFLAGS) -I$(JACCL_INCLUDE) -c -o $@ jaccl_shim.cpp
+
+tests/test_jaccl_shim.o: tests/test_jaccl_shim.c jaccl_shim.h
+	$(CC) $(CFLAGS) -c -o $@ tests/test_jaccl_shim.c
+
+tests/test_jaccl_shim: tests/test_jaccl_shim.o jaccl_shim.o $(JACCL_LIB)
+	$(CC) $(CFLAGS) -o $@ tests/test_jaccl_shim.o jaccl_shim.o $(JACCL_LIB) -lc++
+
+$(JACCL_LIB):
+	@mkdir -p $(JACCL_BUILD_DIR)
+	cmake -S $(JACCL_SRC) -B $(JACCL_BUILD_DIR) -DCMAKE_BUILD_TYPE=Release > /dev/null 2>&1
+	cmake --build $(JACCL_BUILD_DIR) --config Release -j$$(sysctl -n hw.ncpu) > /dev/null 2>&1
+	@test -f $(JACCL_LIB) || (echo "ERROR: libjaccl.a not found after build" && exit 1)
+	@echo "Built JACCL static library: $(JACCL_LIB)"
+
 ds4_cpu.o: ds4.c ds4.h ds4_gpu.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c
 
@@ -196,4 +232,5 @@ test: ds4_test ds4-eval
 	./ds4_test
 
 clean:
-	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/test_jaccl_shim
+	rm -rf build/
diff --git a/distributed_launch.sh b/distributed_launch.sh
new file mode 100755
index 000000000..4a6721716
--- /dev/null
+++ b/distributed_launch.sh
@@ -0,0 +1,208 @@
+#!/usr/bin/env bash
+# distributed_launch.sh — Launch ds4-server across multiple nodes via JACCL
+#
+# Discovers RDMA interfaces via asmi, generates JACCL env vars, and SSHs
+# to each node to start ds4-server with --distributed.
+#
+# Usage:
+#   ./distributed_launch.sh --nodes hub,m3u4 --model gguf/ds4flash.gguf [--ctx 32768] [--port 8080]
+
+set -euo pipefail
+
+# --- Defaults ---
+NODES=""
+MODEL=""
+CTX=32768
+PORT=8080
+DS4_BIN="./ds4-server"
+EXTRA_ARGS=""
+
+usage() {
+    cat <<USAGE
+Usage: $(basename "$0") --nodes node1,node2 --model <path> [OPTIONS]
+
+Required:
+  --nodes node1,node2   Comma-separated list of nodes (hostnames or Tailscale names)
+  --model <path>        Path to GGUF model file (must exist on all nodes)
+
+Options:
+  --ctx N               Context length (default: 32768)
+  --port P              Server port (default: 8080)
+  --bin <path>          Path to ds4-server binary (default: ./ds4-server)
+  --extra "<args>"      Extra args to pass to ds4-server
+  -h, --help            Show this help
+
+Environment:
+  The script uses asmi (port 9090) on each node to discover RDMA interfaces.
+  The coordinator IP is rank 0's LAN IP (from Tailscale, NOT TB5 /30 IPs).
+
+Examples:
+  # 2-node launch
+  $(basename "$0") --nodes hub,m3u4 --model gguf/ds4flash.gguf --ctx 32768
+
+  # 4-node launch with custom port
+  $(basename "$0") --nodes hub,m3u1,m3u3,m3u4 --model gguf/ds4flash.gguf --port 9090
+USAGE
+    exit 0
+}
+
+# --- Parse args ---
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --nodes)    NODES="$2"; shift 2 ;;
+        --model)    MODEL="$2"; shift 2 ;;
+        --ctx)      CTX="$2"; shift 2 ;;
+        --port)     PORT="$2"; shift 2 ;;
+        --bin)      DS4_BIN="$2"; shift 2 ;;
+        --extra)    EXTRA_ARGS="$2"; shift 2 ;;
+        -h|--help)  usage ;;
+        *)          echo "Unknown option: $1"; usage ;;
+    esac
+done
+
+if [[ -z "$NODES" || -z "$MODEL" ]]; then
+    echo "Error: --nodes and --model are required"
+    echo ""
+    usage
+fi
+
+# --- Split nodes into array ---
+IFS=',' read -ra NODE_LIST <<< "$NODES"
+WORLD_SIZE=${#NODE_LIST[@]}
+
+if [[ $WORLD_SIZE -lt 2 ]]; then
+    echo "Error: distributed mode requires at least 2 nodes"
+    exit 1
+fi
+
+echo "=== ds4 distributed launch ==="
+echo "Nodes:      ${NODE_LIST[*]}"
+echo "World size: $WORLD_SIZE"
+echo "Model:      $MODEL"
+echo "Context:    $CTX"
+echo "Port:       $PORT"
+echo ""
+
+# --- Resolve coordinator IP (rank 0's LAN IP via Tailscale) ---
+# Use the LAN IP (10.x.x.x), NOT TB5 /30 IPs which cause JACCL error 60.
+COORD_NODE="${NODE_LIST[0]}"
+echo "Resolving coordinator LAN IP for $COORD_NODE..."
+
+# Try tailscale status to get the 100.x IP, then fall back to 10.x from /etc/hosts or DNS
+COORD_IP=$(ssh -o ConnectTimeout=5 "$COORD_NODE" \
+    "tailscale ip -4 2>/dev/null || hostname -I 2>/dev/null | awk '{print \$1}'" 2>/dev/null)
+
+if [[ -z "$COORD_IP" ]]; then
+    echo "Error: could not resolve LAN IP for coordinator $COORD_NODE"
+    exit 1
+fi
+echo "Coordinator: $COORD_IP ($COORD_NODE)"
+echo ""
+
+# --- Discover RDMA interfaces via asmi ---
+# Build the JACCL_IBV_DEVICES JSON matrix.
+# For N nodes, this is an NxN matrix where [i][j] is the RDMA interface on node i
+# that connects to node j (null for self).
+#
+# Example for 2 nodes: [[null, "rdma_en7"], ["rdma_en5", null]]
+
+echo "Discovering RDMA interfaces via asmi..."
+
+declare -A RDMA_IFACE  # RDMA_IFACE[i,j] = interface name on node i for link to node j
+
+for (( i=0; i<WORLD_SIZE; i++ )); do
+    node="${NODE_LIST[$i]}"
+    echo "  Querying $node:9090/links..."
+
+    # asmi /links returns JSON array of RDMA link objects with peer_hostname and rdma_device
+    links_json=$(curl -sf "http://${node}:9090/links" 2>/dev/null || echo "[]")
+
+    if [[ "$links_json" == "[]" ]]; then
+        echo "  Warning: no RDMA links found on $node (asmi may not be running)"
+    fi
+
+    for (( j=0; j<WORLD_SIZE; j++ )); do
+        if [[ $i -eq $j ]]; then
+            RDMA_IFACE[$i,$j]="null"
+            continue
+        fi
+
+        peer="${NODE_LIST[$j]}"
+        # Extract the RDMA device name for the link to this peer
+        iface=$(echo "$links_json" | python3 -c "
+import json, sys
+links = json.load(sys.stdin)
+for link in links:
+    peer = link.get('peer_hostname', link.get('peer', ''))
+    if '$peer' in peer:
+        print(link.get('rdma_device', link.get('interface', '')))
+        break
+" 2>/dev/null || echo "")
+
+        if [[ -n "$iface" ]]; then
+            RDMA_IFACE[$i,$j]="\"$iface\""
+        else
+            echo "  Warning: no RDMA interface found on $node for peer $peer"
+            RDMA_IFACE[$i,$j]="null"
+        fi
+    done
+done
+
+# --- Build JACCL_IBV_DEVICES JSON ---
+IBV_JSON="["
+for (( i=0; i<WORLD_SIZE; i++ )); do
+    [[ $i -gt 0 ]] && IBV_JSON+=", "
+    IBV_JSON+="["
+    for (( j=0; j<WORLD_SIZE; j++ )); do
+        [[ $j -gt 0 ]] && IBV_JSON+=", "
+        IBV_JSON+="${RDMA_IFACE[$i,$j]}"
+    done
+    IBV_JSON+="]"
+done
+IBV_JSON+="]"
+
+echo ""
+echo "JACCL_IBV_DEVICES=$IBV_JSON"
+echo ""
+
+# --- Launch on each node ---
+PIDS=()
+for (( rank=0; rank<WORLD_SIZE; rank++ )); do
+    node="${NODE_LIST[$rank]}"
+    echo "Launching rank $rank on $node..."
+
+    ssh -o ConnectTimeout=10 "$node" bash -c "'
+        export JACCL_RANK=$rank
+        export JACCL_WORLD_SIZE=$WORLD_SIZE
+        export JACCL_COORDINATOR=$COORD_IP
+        export JACCL_IBV_DEVICES='"'"'$IBV_JSON'"'"'
+        echo \"ds4: starting rank $rank/$WORLD_SIZE on \$(hostname)\"
+        echo \"  JACCL_COORDINATOR=$COORD_IP\"
+        echo \"  JACCL_IBV_DEVICES=\$JACCL_IBV_DEVICES\"
+        cd \$(dirname $DS4_BIN) 2>/dev/null || true
+        exec $DS4_BIN --distributed --metal --host 0.0.0.0 --port $PORT --ctx $CTX --model $MODEL $EXTRA_ARGS
+    '" &
+    PIDS+=($!)
+done
+
+echo ""
+echo "=== All $WORLD_SIZE ranks launched ==="
+echo "PIDs: ${PIDS[*]}"
+echo "Press Ctrl-C to stop all nodes"
+echo ""
+
+# --- Wait for any child to exit ---
+cleanup() {
+    echo ""
+    echo "Shutting down all ranks..."
+    for pid in "${PIDS[@]}"; do
+        kill "$pid" 2>/dev/null || true
+    done
+    wait
+    echo "All ranks stopped."
+}
+trap cleanup INT TERM
+
+wait -n "${PIDS[@]}" 2>/dev/null || true
+echo "A rank exited. Shutting down remaining..."
+cleanup
diff --git a/docs/distributed-benchmark.md b/docs/distributed-benchmark.md
new file mode 100644
index 000000000..97ac80c19
--- /dev/null
+++ b/docs/distributed-benchmark.md
@@ -0,0 +1,109 @@
+# Distributed Benchmark Methodology
+
+## Overview
+
+Measure the overhead of JACCL expert parallelism by comparing single-node and multi-node ds4-bench runs. The expected overhead per token is minimal: 16KB all_sum per layer x 43 layers = 688KB/token. At 11.7 GB/s measured RDMA bandwidth, this adds ~0.06ms per token.
+
+## Prerequisites
+
+- ds4 built with JACCL: `make clean && make JACCL=1`
+- GGUF model accessible on all nodes (same path, lazy mmap handles partial loading)
+- RDMA verified: `asmi links` shows active links between nodes
+- PD health checked: no prior PD exhaustion (reboot nodes if uncertain)
+
+## Single-Node Baseline
+
+Run on the coordinator node (hub):
+
+```bash
+# Generation benchmark (decode)
+./ds4-bench --metal --model gguf/ds4flash.gguf --ctx 32768 --batch 1 --repeat 3
+
+# Prefill benchmark
+./ds4-bench --metal --model gguf/ds4flash.gguf --ctx 32768 --batch 512 --repeat 3
+```
+
+Record: tok/s generation, tok/s prefill, peak memory.
+
+## 2-Node Distributed
+
+Use the launch script with ds4-bench instead of ds4-server:
+
+```bash
+# On rank 0 (hub):
+export JACCL_RANK=0
+export JACCL_WORLD_SIZE=2
+export JACCL_COORDINATOR=$(tailscale ip -4)
+export JACCL_IBV_DEVICES='[[null, "rdma_enX"], ["rdma_enY", null]]'
+./ds4-bench --distributed --metal --model gguf/ds4flash.gguf --ctx 32768 --batch 1 --repeat 3
+
+# On rank 1 (m3u4) — same env vars but JACCL_RANK=1:
+export JACCL_RANK=1
+export JACCL_WORLD_SIZE=2
+export JACCL_COORDINATOR=<rank0_tailscale_ip>
+export JACCL_IBV_DEVICES='[[null, "rdma_enX"], ["rdma_enY", null]]'
+./ds4-bench --distributed --metal --model gguf/ds4flash.gguf --ctx 32768 --batch 1 --repeat 3
+```
+
+Use `asmi links` on each node to fill in the actual RDMA interface names.
+
+Record: tok/s generation, tok/s prefill, overhead %.
+
+## Expected Overhead Calculation
+
+### Per-Token Communication Cost
+
+```
+all_sum payload per layer:  DS4_N_EMBD * sizeof(float) = 4096 * 4 = 16,384 bytes (16 KB)
+Number of MoE layers:       43
+Total per-token RDMA:       16 KB * 43 = 688 KB
+
+Measured RDMA bandwidth:    11.7 GB/s (from JACCL 4-node baseline benchmark)
+Transfer time per token:    688 KB / 11.7 GB/s = 0.057 ms
+
+Single-node generation:     ~15 tok/s = 66.7 ms/tok
+Expected overhead:          0.057 / 66.7 = 0.09%
+```
+
+### Prefill (Batch) Communication Cost
+
+```
+Batch all_sum payload:      n_tok * 16 KB * 43 layers
+For n_tok=512:              512 * 688 KB = 344 MB per prefill pass
+Transfer time:              344 MB / 11.7 GB/s = 29 ms
+Single-node 512-tok prefill: ~200 ms (estimated)
+Expected overhead:          29 / 200 = 14.5%
+```
+
+Prefill overhead is higher because the batch all_sum is proportional to token count. For short prompts (<64 tokens), overhead is negligible.
+
+## 4-Node Distributed
+
+Same procedure with JACCL_WORLD_SIZE=4. Expected compute savings: each node runs 64/256 experts instead of 256/256. The GPU still dispatches all 6 expert projections per token (weight masking, not early exit), so GPU time savings are limited. CPU path sees full 4x savings on expert compute.
+
+## Recording Results
+
+After each run, record in this table:
+
+| Config | Nodes | Generation tok/s | Prefill tok/s | Overhead % |
+|--------|-------|-------------------|---------------|------------|
+| Baseline | 1 (hub) | | | - |
+| 2-node | hub + m3u4 | | | |
+| 4-node | hub + m3u1 + m3u3 + m3u4 | | | |
+
+## Profiling
+
+For per-layer timing breakdown:
+
+```bash
+DS4_DECODE_PROFILE_DETAIL=1 ./ds4 --distributed --metal -c 4096 -n 10 -p "test"
+DS4_PREFILL_PROFILE_DETAIL=1 ./ds4 --distributed --metal -c 4096 -p "long prompt here..."
+```
+
+This prints per-layer timing for HC, norm, routed MoE, shared FFN, and post-processing. Compare the routed MoE time between single-node and distributed to isolate all_sum overhead from compute savings.
+
+## Known Limitations
+
+- **GPU waste on masked experts:** The fused Metal kernel still dispatches all 6 expert projections even when weights are zeroed. Shader-level early-exit on weight=0 is future work.
+- **PD exhaustion risk:** One JACCL Group per process lifetime. Never restart the benchmark without a clean process exit. If PD exhaustion occurs, power-cycle all nodes (full shutdown, not reboot).
+- **Prefill batch size:** Large batch all_sum (>100MB) may saturate RDMA bandwidth and show higher variance. Use --repeat 5 for large batch benchmarks.
diff --git a/docs/plans/2026-05-26-jaccl-distributed.md b/docs/plans/2026-05-26-jaccl-distributed.md
new file mode 100644
index 000000000..bbf13efd1
--- /dev/null
+++ b/docs/plans/2026-05-26-jaccl-distributed.md
@@ -0,0 +1,463 @@
+# Implementation plan — ds4 + JACCL Distributed Expert Parallelism
+
+**Date:** 2026-05-26
+**Companion research:** `~/.claude/projects/-Users-ma-Projects-r1o/memory/research_ds4_jaccl_integration_2026_05_26.md`
+**Working branch:** `feat/jaccl-distributed`
+**Repo:** `~/opensource/ds4` (fork of antirez/ds4)
+
+---
+
+## Working protocol
+
+Apply dependency-scanner framework before every multi-file edit. See Iron Laws for the non-negotiables this work obeys.
+
+## Architecture
+
+Expert parallelism across N nodes. Each node owns `256/N` experts per layer. All nodes run the full forward pass (attention is replicated, shared expert is replicated). After the routed expert down-projection, a single `all_sum()` over RDMA synchronizes the partial sums. 16KB per layer × 43 layers = ~700KB per token. At 11.7 GB/s RDMA, this adds ~0.06ms per token — negligible.
+
+```
+Node 0 (rank 0)                 Node 1 (rank 1)
+┌─────────────────┐             ┌─────────────────┐
+│ embed            │             │ embed            │
+│ attention (full) │             │ attention (full) │
+│ router (full)    │             │ router (full)    │
+│ experts 0-127    │             │ experts 128-255  │
+│ partial_sum      │             │ partial_sum      │
+│       └──── JACCL all_sum() ────┘                │
+│ shared_expert    │             │ shared_expert    │
+│ moe + shared     │             │ moe + shared     │
+│ output logits    │             │ output logits    │
+└─────────────────┘             └─────────────────┘
+```
+
+Each node mmaps the full GGUF — the OS only faults in pages for accessed experts (lazy mmap). No model splitting needed.
+
+## Tech stack
+
+- **Language:** C99 (ds4) + C++20 (JACCL) + ~70 LOC C shim bridging them
+- **JACCL:** Standalone lib from `ml-explore/mlx` @ main, pinned to commit `1322065f` (race fix, 2026-05-11)
+- **Build:** CMake 3.24+ for JACCL, then link `libjaccl.a` into ds4's Makefile
+- **SDK:** macOS 26.2+ (for `<infiniband/verbs.h>`)
+- **Runtime:** `librdma.dylib` (dlopen'd), Thunderbolt 5 RDMA enabled
+
+## Tasks
+
+### Phase A — JACCL C Shim + Build Integration
+
+#### Task A1 — Write the C shim header — 10 min
+
+**Edit:** `~/opensource/ds4/jaccl_shim.h` (new file)
+
+```c
+#ifndef JACCL_SHIM_H
+#define JACCL_SHIM_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void *jaccl_group_t;
+
+// Dtype enum matching jaccl::Dtype
+enum jaccl_dtype {
+    JACCL_FLOAT32 = 11, // matches jaccl::Dtype::Float32
+    JACCL_FLOAT16 = 9,  // matches jaccl::Dtype::Float16
+};
+
+bool     jaccl_is_available(void);
+jaccl_group_t jaccl_init_from_env(bool strict);
+void     jaccl_group_free(jaccl_group_t g);
+int      jaccl_group_rank(jaccl_group_t g);
+int      jaccl_group_size(jaccl_group_t g);
+void     jaccl_group_all_sum(jaccl_group_t g, const void *in, void *out, size_t n_bytes, int dtype);
+void     jaccl_group_barrier(jaccl_group_t g);
+void     jaccl_group_send(jaccl_group_t g, const void *buf, size_t n_bytes, int dst);
+void     jaccl_group_recv(jaccl_group_t g, void *buf, size_t n_bytes, int src);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+```
+
+**Verify:** File exists, compiles as C header: `cc -fsyntax-only -x c jaccl_shim.h`
+
+**Commit:** `[jaccl] add C shim header for JACCL integration`
+
+#### Task A2 — Write the C++ shim implementation — 15 min
+
+**Edit:** `~/opensource/ds4/jaccl_shim.cpp` (new file)
+
+```cpp
+#include "jaccl_shim.h"
+#include <jaccl/jaccl.h>
+#include <memory>
+
+static std::shared_ptr<jaccl::Group> unwrap(jaccl_group_t g) {
+    return *reinterpret_cast<std::shared_ptr<jaccl::Group>*>(g);
+}
+
+extern "C" {
+
+bool jaccl_is_available(void) {
+    return jaccl::is_available();
+}
+
+jaccl_group_t jaccl_init_from_env(bool strict) {
+    auto group = jaccl::init(strict);
+    if (!group) return nullptr;
+    auto *p = new std::shared_ptr<jaccl::Group>(std::move(group));
+    return reinterpret_cast<jaccl_group_t>(p);
+}
+
+void jaccl_group_free(jaccl_group_t g) {
+    if (!g) return;
+    delete reinterpret_cast<std::shared_ptr<jaccl::Group>*>(g);
+}
+
+int jaccl_group_rank(jaccl_group_t g) { return unwrap(g)->rank(); }
+int jaccl_group_size(jaccl_group_t g) { return unwrap(g)->size(); }
+
+void jaccl_group_all_sum(jaccl_group_t g, const void *in, void *out, size_t n_bytes, int dtype) {
+    unwrap(g)->all_sum(in, out, n_bytes, dtype);
+}
+
+void jaccl_group_barrier(jaccl_group_t g) { unwrap(g)->barrier(); }
+
+void jaccl_group_send(jaccl_group_t g, const void *buf, size_t n_bytes, int dst) {
+    unwrap(g)->send(buf, n_bytes, dst);
+}
+
+void jaccl_group_recv(jaccl_group_t g, void *buf, size_t n_bytes, int src) {
+    unwrap(g)->recv(buf, n_bytes, src);
+}
+
+} // extern "C"
+```
+
+**Verify:** Compiles with JACCL headers: `c++ -std=c++20 -c jaccl_shim.cpp -I<jaccl-include-path> -fsyntax-only`
+
+**Commit:** `[jaccl] add C++ shim implementation wrapping JACCL Group API`
+
+#### Task A3 — Build JACCL as static lib + integrate Makefile — 15 min
+
+**Pre-flight:** Verify macOS SDK >= 26.2: `xcrun --sdk macosx --show-sdk-version`
+
+**Edit:** `~/opensource/ds4/Makefile` — add JACCL build targets
+
+Add to Makefile (Darwin section):
+- CMake configure + build of JACCL into `build/jaccl/`
+- Compile `jaccl_shim.cpp` against JACCL headers
+- Link `libjaccl.a` + `jaccl_shim.o` into all ds4 binaries
+- Conditional: only when `JACCL=1` make flag is set (opt-in, doesn't break default build)
+
+**Verify:** `make clean && make JACCL=1` — all 5 binaries link without error
+
+**Commit:** `[jaccl] integrate JACCL static lib build into Makefile (opt-in JACCL=1)`
+
+#### Task A4 — Standalone shim test — 10 min
+
+**Edit:** `~/opensource/ds4/tests/test_jaccl_shim.c` (new file)
+
+Simple single-process test: `jaccl_is_available()` returns true/false, if available init+rank+size+free cycle. Does not require multi-node — validates linkage.
+
+**Verify:** `make JACCL=1 test_jaccl_shim && ./test_jaccl_shim` — prints availability and exits 0
+
+**Commit:** `[jaccl] add shim linkage test`
+
+### Phase B — Distributed State in ds4 Engine
+
+#### Task B1 — Add distributed state to ds4_engine — 10 min
+
+**Pre-flight:** Read ds4.c:15056-15073 (ds4_engine struct), ds4.h:62-76 (ds4_engine_options)
+
+**Edit:** `ds4.h` — add to `ds4_engine_options`:
+
+```c
+bool distributed;      // enable JACCL distributed mode
+```
+
+**Edit:** `ds4.c` — add at file scope (conditionally compiled):
+
+```c
+#ifdef DS4_JACCL
+#include "jaccl_shim.h"
+#endif
+```
+
+**Edit:** `ds4.c` — add to `struct ds4_engine`:
+
+```c
+// Distributed (JACCL)
+void *jaccl_group;     // jaccl_group_t, NULL when not distributed
+int   world_size;      // total ranks (1 when single-node)
+int   rank;            // this process's rank
+int   expert_start;    // first expert owned by this rank
+int   expert_end;      // one-past-last expert owned
+```
+
+**Verify:** `make JACCL=1` compiles
+
+**Commit:** `[jaccl] add distributed state fields to ds4_engine`
+
+#### Task B2 — Initialize JACCL in engine_open — 10 min
+
+**Pre-flight:** Read ds4.c:17985 (ds4_engine_open)
+
+**Edit:** `ds4.c` in `ds4_engine_open()` — after model load, before Metal init:
+
+```c
+if (opt->distributed) {
+    e->jaccl_group = jaccl_init_from_env(/*strict=*/true);
+    if (!e->jaccl_group) { fprintf(stderr, "ds4: JACCL init failed\n"); return -1; }
+    e->world_size = jaccl_group_size(e->jaccl_group);
+    e->rank = jaccl_group_rank(e->jaccl_group);
+    int experts_per_rank = DS4_N_EXPERT / e->world_size;
+    e->expert_start = e->rank * experts_per_rank;
+    e->expert_end = (e->rank == e->world_size - 1) ? DS4_N_EXPERT : e->expert_start + experts_per_rank;
+    fprintf(stderr, "ds4: distributed mode rank %d/%d experts [%d, %d)\n",
+            e->rank, e->world_size, e->expert_start, e->expert_end);
+} else {
+    e->jaccl_group = NULL;
+    e->world_size = 1;
+    e->rank = 0;
+    e->expert_start = 0;
+    e->expert_end = DS4_N_EXPERT;
+}
+```
+
+**Edit:** `ds4_engine_close()` — add `if (e->jaccl_group) jaccl_group_free(e->jaccl_group);`
+
+**Verify:** Single-node: `./ds4 -p "test" --metal` works unchanged. Multi-node: `JACCL_RANK=0 ... ./ds4 --distributed -p "test" --cpu` initializes JACCL and prints rank info.
+
+**Commit:** `[jaccl] init/teardown JACCL in engine lifecycle`
+
+#### Task B3 — Wire --distributed flag through CLI/server — 5 min
+
+**Edit:** `ds4_cli.c` — add `--distributed` flag parsing → `engine.distributed = true`
+**Edit:** `ds4_server.c` — same
+**Edit:** `ds4.h` — add `bool distributed` to `ds4_engine_options`
+
+**Verify:** `./ds4 --help` shows `--distributed`
+
+**Commit:** `[jaccl] add --distributed CLI flag`
+
+### Phase C — Expert Dispatch Modification (CPU Path)
+
+#### Task C1 — Modify expert accumulation to respect rank ownership — 20 min
+
+**Pre-flight:** Read ds4.c:4375-4432 (`matvec_q2_k_experts_accum_prequant` and its worker)
+
+This is the core change. The inner loop at ds4.c:4378-4386 currently iterates over all 6 selected experts. In distributed mode, each rank only computes experts it owns:
+
+**Edit:** `ds4.c` — modify `matvec_q2_k_accum_worker()`:
+
+```c
+// Before: iterate over all n_expert selected
+for (int i = 0; i < ctx->n_expert; i++) {
+    int expert_id = ctx->selected[i].expert;
+    // Skip experts not owned by this rank
+    if (ctx->distributed && (expert_id < ctx->expert_start || expert_id >= ctx->expert_end))
+        continue;
+    float v = 0.0f;
+    ds4_vec_dot_q2_K_q8_K(..., &v, ...);
+    acc += v;
+}
+```
+
+Then after the accumulation loop returns, if distributed:
+
+```c
+if (engine->jaccl_group) {
+    float *tmp = alloca(DS4_N_EMBD * sizeof(float));
+    memcpy(tmp, out, DS4_N_EMBD * sizeof(float));
+    jaccl_group_all_sum(engine->jaccl_group, tmp, out, DS4_N_EMBD * sizeof(float), JACCL_FLOAT32);
+}
+```
+
+**Verify:** 
+1. Single-node: output unchanged (all experts owned, no all_sum called)
+2. Two-node CPU: `JACCL_RANK=0 ... ./ds4 --distributed --cpu -p "hello"` on hub, `JACCL_RANK=1 ... ./ds4 --distributed --cpu -p "hello"` on m3u4 — both produce identical output
+
+**Commit:** `[jaccl] distribute expert accumulation with all_sum (CPU path)`
+
+#### Task C2 — Modify batch expert accumulation — 15 min
+
+**Pre-flight:** Read ds4.c:5979-6003 (batch accumulation path)
+
+Same pattern as C1 but for the batch (prefill) path. The `matvec_q2_k_batch_accum_rows_worker` needs the same rank-ownership filter, and the batch output needs `all_sum()` after accumulation.
+
+**Verify:** Prefill of a multi-token prompt produces identical output across 2 nodes
+
+**Commit:** `[jaccl] distribute batch expert accumulation (CPU prefill path)`
+
+#### Task C3 — Modify gate/up projection to skip non-owned experts — 15 min
+
+**Pre-flight:** Read ds4.c:5740-5768 (gate+up projection in `layer_routed_moe_one`)
+
+The gate+up projection (`matvec_iq2_xxs_experts_mid_prequant`) currently computes all 6 selected experts. In distributed mode, skip experts not owned by this rank (saves compute, not just the reduction).
+
+**Verify:** Same as C1 — identical output across ranks
+
+**Commit:** `[jaccl] skip non-owned expert gate/up projections`
+
+### Phase D — Metal Path (Fused Kernel + all_sum)
+
+The fused Metal kernel `ds4_gpu_routed_moe_one_tensor` writes partial expert sums to `g->routed_out`. No kernel splitting needed — we mask non-owned experts before dispatch, then `all_sum()` the output buffer after.
+
+#### Task D1 — Mask non-owned experts in router_selected buffer — 15 min
+
+**Pre-flight:** Read ds4.c:10474-10492 (fused MoE kernel call) and ds4_gpu.h:639-664 (kernel signature). The kernel reads `router_selected` (int32 array of 6 expert IDs) and `router_weights` (float array of 6 weights).
+
+**Edit:** `ds4.c` — after `ds4_gpu_router_select_tensor()` (line ~10466) and before `ds4_gpu_routed_moe_one_tensor()` (line 10474), add:
+
+```c
+// In distributed mode, zero weights for experts not owned by this rank.
+// The fused kernel multiplies each expert's contribution by its weight,
+// so weight=0 effectively skips it. No Metal shader changes needed.
+if (engine->jaccl_group) {
+    metal_graph_mask_non_owned_experts(g, engine->expert_start, engine->expert_end);
+}
+```
+
+`metal_graph_mask_non_owned_experts` reads `g->router_selected` + `g->router_weights` from GPU→CPU (they're StorageModeShared, so zero-copy), zeros weights for expert IDs outside `[expert_start, expert_end)`.
+
+**Note on GPU compute:** Weight masking zeroes the result but does NOT skip GPU dispatch — all 6 expert projections still execute for zeroed experts. This wastes ~50% GPU compute on 2-node. Correctness-first: accept this for now. Compute savings requires Metal shader changes to early-exit on weight=0 (future optimization, not in this plan).
+
+**Verify:** Single-node: no masking (all experts owned). 2-node: output correctness matches single-node within float epsilon.
+
+**Commit:** `[jaccl] mask non-owned experts before fused Metal MoE kernel`
+
+#### Task D2 — Insert all_sum after fused MoE kernel output — 15 min
+
+**Pre-flight:** Read ds4.c:10492-10568 (between MoE kernel and shared+routed addition)
+
+**Edit:** `ds4.c` — after `ds4_gpu_routed_moe_one_tensor` returns (line ~10493), before the `ds4_gpu_add_tensor` or fused HC variant:
+
+```c
+// Distributed: synchronize partial expert sums across ranks via RDMA.
+// g->routed_out is StorageModeShared (mmap-backed), RDMA-registerable.
+// 16KB all_sum at 11.7 GB/s = ~1.4 microseconds — negligible.
+if (engine->jaccl_group) {
+    // No explicit GPU sync needed — ds4_gpu_routed_moe_one_tensor is synchronous
+    // (calls ds4_gpu_finish_command_buffer at ds4_metal.m:14184).
+    void *buf = ds4_gpu_tensor_contents(g->routed_out);
+    jaccl_group_all_sum(engine->jaccl_group,
+                        buf, buf,  // in-place (supported: mesh_impl.h:34 copies if in!=out)
+                        DS4_N_EMBD * sizeof(float),
+                        JACCL_FLOAT32);
+}
+```
+
+**Key detail:** `ds4_gpu_routed_moe_one_tensor` already calls `ds4_gpu_finish_command_buffer()` at ds4_metal.m:14184 — it is synchronous. By the time it returns to ds4.c:10493, `g->routed_out` (StorageModeShared) is CPU-readable. No explicit sync needed before the all_sum.
+
+**Verify:** 2-node Metal: output tokens match single-node Metal within float epsilon
+
+**Commit:** `[jaccl] insert all_sum on routed_out after fused Metal MoE kernel`
+
+#### Task D3 — Same pattern for batch (prefill) Metal path — 15 min
+
+**Pre-flight:** Read ds4.c:13316 (`ds4_gpu_routed_moe_batch_tensor` call)
+
+Same mask + all_sum pattern for the batch/prefill Metal kernel. The batch variant processes N tokens, so the all_sum is `N * DS4_N_EMBD * sizeof(float)`.
+
+**Verify:** Multi-token prefill produces identical output across 2 nodes
+
+**Commit:** `[jaccl] distribute batch Metal MoE path`
+
+### Phase E — Launch Script + Integration Test
+
+#### Task E1 — Write a JACCL launch script — 10 min
+
+**Edit:** `~/opensource/ds4/distributed_launch.sh` (new file)
+
+Reads the asmi hostfile format, sets `JACCL_RANK`, `JACCL_COORDINATOR`, `JACCL_IBV_DEVICES` per node, and SSHs to each node to start ds4-server with `--distributed`.
+
+**Verify:** `./distributed_launch.sh --nodes hub,m3u4 --model gguf/ds4flash.gguf --ctx 32768` starts ds4-server on both nodes
+
+**Commit:** `[jaccl] add distributed launch script`
+
+#### Task E2 — End-to-end correctness test — 15 min
+
+**Edit:** `~/opensource/ds4/tests/test_distributed_correctness.sh` (new file)
+
+1. Run a prompt on single-node, capture output tokens + logits
+2. Run same prompt on 2-node distributed, capture output tokens + logits
+3. Compare: tokens must be identical, logits must match within float epsilon
+
+**Verify:** Script passes on hub + m3u4
+
+**Commit:** `[jaccl] add distributed correctness test`
+
+#### Task E3 — Benchmark: single-node vs distributed — 10 min
+
+Use `ds4-bench` to measure:
+- Single-node q4-imatrix on hub (512GB): tok/s prefill + generation
+- 2-node distributed q4-imatrix on hub + m3u4: tok/s prefill + generation
+
+Record results, compute overhead percentage.
+
+**Verify:** Distributed overhead < 5% for generation (16KB all_sum should be negligible)
+
+**Commit:** `[jaccl] document distributed benchmark results`
+
+## File touch matrix
+
+| File | Lines added | Lines removed | Notes |
+|---|---|---|---|
+| `jaccl_shim.h` | ~35 | 0 | New — C shim header |
+| `jaccl_shim.cpp` | ~50 | 0 | New — C++ shim impl |
+| `Makefile` | ~25 | 0 | JACCL build integration (opt-in) |
+| `ds4.h` | ~3 | 0 | distributed flag + engine accessors |
+| `ds4.c` | ~80 | ~10 | Engine state, expert dispatch, all_sum calls |
+| `ds4_cli.c` | ~5 | 0 | --distributed flag |
+| `ds4_server.c` | ~5 | 0 | --distributed flag |
+| `distributed_launch.sh` | ~60 | 0 | New — multi-node launcher |
+| `tests/test_jaccl_shim.c` | ~30 | 0 | New — linkage test |
+| `tests/test_distributed_correctness.sh` | ~40 | 0 | New — e2e correctness |
+
+**Total:** ~420 LOC added, ~10 removed. 10 files touched (6 new, 4 modified). Phase D adds ~90 LOC for Metal path (mask + all_sum + batch).
+
+**Time estimate:** 8-12 hours of focused work. Major time sinks: JACCL CMake extraction (~2h), RDMA debugging on live cluster (~2-3h), batch path complexity (expert-grouped histogram differs from single-token pattern, ~1.5h).
+
+## Risk register
+
+| Risk | Mitigation |
+|---|---|
+| Metal GPU compute waste | Weight masking zeroes output but doesn't skip dispatch. Accept 50% waste for correctness-first. Shader early-exit is future work. |
+| PD exhaustion on repeated launch | One JACCL Group per process lifetime. Never teardown. Script enforces single launch. |
+| Expert count not evenly divisible by world_size | expert_end for last rank = DS4_N_EXPERT (takes remainder). 256/2=128, 256/4=64 — clean. |
+| KV cache divergence across ranks | Attention is replicated, same input → same KV. Disk KV cache is per-node — independent, no conflict. |
+| antirez won't merge | We fork. Our changes are additive (behind --distributed flag). Default build is unchanged. |
+| Q4 expert tensors are interleaved differently than Q2 | Both use `tensor_expert_bytes()` for slicing (ds4.c:4179-4180). Pattern is identical. |
+
+## Rollback strategy
+
+| Failure point | Action |
+|---|---|
+| JACCL build fails | `make clean && make` (no JACCL=1) — default build unaffected |
+| Distributed produces wrong output | Remove `--distributed` flag — single-node path untouched |
+| PD exhaustion | `shutdown -h` + 60s poweroff on all nodes (known recovery, [[pd-exhaustion-deep-dive-2026-05-14]]) |
+
+## Acceptance criteria
+
+1. `make JACCL=1` builds all 5 binaries with zero warnings
+2. `make` (without JACCL=1) still builds — no regressions to default path
+3. Single-node `./ds4 --metal -p "test"` output is bit-identical with and without JACCL compiled in
+4. 2-node distributed CPU: output tokens match single-node CPU within float epsilon
+5. 2-node distributed: `ds4-bench` generation overhead < 5% vs single-node
+6. No PD leaks: `jaccl_group_free()` called in engine_close, verified via RDMA metric after run
+
+## Out of scope
+
+- **Metal shader early-exit optimization** — the fused kernel still dispatches all experts (weight=0 zeroes output but doesn't skip compute). Shader-level early-exit on weight=0 is future work.
+- **4-node distributed** — 2-node proves the architecture. 4-node is config change only.
+- **DeepSeek V4 Pro distributed** — Pro has 384 experts (different count). Test with Flash first.
+- **Upstream PR to antirez/ds4** — build as fork first, demonstrate value, then propose.
+- **TurboQuant KV cache** — PR #243 not merged yet. Orthogonal to distributed.
+- **Ensemble approach** — antirez's preferred method. Complementary, not competing.
diff --git a/ds4.c b/ds4.c
index ecbcec3f5..bd753dda4 100644
--- a/ds4.c
+++ b/ds4.c
@@ -40,6 +40,9 @@
 #ifndef DS4_NO_GPU
 #include "ds4_gpu.h"
 #endif
+#ifdef DS4_JACCL
+#include "jaccl_shim.h"
+#endif
 #if defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif
@@ -70,6 +73,14 @@ static const char DS4_REASONING_EFFORT_MAX_PREFIX[] =
  * asks for a reasoning budget the allocated context is not meant to hold. */
 #define DS4_THINK_MAX_MIN_CONTEXT 393216u
 
+/* File-scope distributed state -- safe because ds4 enforces a single engine
+ * instance per process via the instance lock.  Set once in ds4_engine_open(),
+ * read by inner kernels without threading the engine pointer through every
+ * layer function. */
+static void *g_jaccl_group  = NULL; /* jaccl_group_t */
+static int   g_expert_start = 0;
+static int   g_expert_end   = 0;
+
 static bool ds4_backend_uses_graph(ds4_backend backend) {
     return backend == DS4_BACKEND_METAL || backend == DS4_BACKEND_CUDA;
 }
@@ -4246,6 +4257,7 @@ typedef struct {
     uint64_t gate_row_bytes[DS4_MAX_EXPERT_USED];
     uint64_t up_row_bytes[DS4_MAX_EXPERT_USED];
     int n_expert;
+    bool skip_slot[DS4_MAX_EXPERT_USED]; /* distributed: true = non-owned expert */
 } matvec_iq2_xxs_mid_ctx;
 
 static void matvec_iq2_xxs_mid_worker(void *vctx, uint64_t row0, uint64_t row1) {
@@ -4254,6 +4266,13 @@ static void matvec_iq2_xxs_mid_worker(void *vctx, uint64_t row0, uint64_t row1)
     for (uint64_t idx = row0; idx < row1; idx++) {
         const int slot = (int)(idx / ctx->out_dim);
         const uint64_t row = idx - (uint64_t)slot * ctx->out_dim;
+
+        /* In distributed mode, skip gate/up compute for non-owned experts. */
+        if (ctx->skip_slot[slot]) {
+            ctx->mid[idx] = 0.0f;
+            continue;
+        }
+
         float gate = 0.0f;
         float up = 0.0f;
 
@@ -4311,6 +4330,12 @@ static void matvec_iq2_xxs_experts_mid_prequant(
             ds4_die("IQ2_XXS expert tensors do not share a layout");
         }
         ctx.expert_weight[i] = expert_weight[i];
+#ifdef DS4_JACCL
+        ctx.skip_slot[i] = g_jaccl_group &&
+                           (selected[i] < g_expert_start || selected[i] >= g_expert_end);
+#else
+        ctx.skip_slot[i] = false;
+#endif
     }
     if (in_dim0 % QK_K != 0) ds4_die("IQ2_XXS expert row is not QK_K aligned");
 
@@ -4370,6 +4395,9 @@ typedef struct {
     uint64_t in_dim;
     uint64_t row_bytes[DS4_MAX_EXPERT_USED];
     int n_expert;
+    int selected[DS4_MAX_EXPERT_USED]; /* expert IDs, for distributed ownership check */
+    int expert_start;                  /* first expert owned by this rank */
+    int expert_end;                    /* one-past-last expert owned */
 } matvec_q2_k_accum_ctx;
 
 static void matvec_q2_k_accum_worker(void *vctx, uint64_t row0, uint64_t row1) {
@@ -4378,6 +4406,10 @@ static void matvec_q2_k_accum_worker(void *vctx, uint64_t row0, uint64_t row1) {
     for (uint64_t row = row0; row < row1; row++) {
         float acc = 0.0f;
         for (int i = 0; i < ctx->n_expert; i++) {
+            /* In distributed mode, skip experts not owned by this rank. */
+            int eid = ctx->selected[i];
+            if (eid < ctx->expert_start || eid >= ctx->expert_end)
+                continue;
             float v = 0.0f;
             const block_q2_K *br = (const block_q2_K *)(ctx->base[i] + row * ctx->row_bytes[i]);
             ds4_vec_dot_q2_K_q8_K((int)ctx->in_dim, &v, br, ctx->xq[i]);
@@ -4421,11 +4453,14 @@ static void matvec_q2_k_experts_accum_prequant(
         .out = out,
         .in_dim = in_dim0,
         .n_expert = n_expert,
+        .expert_start = g_expert_start,
+        .expert_end   = g_expert_end,
     };
     for (int i = 0; i < n_expert; i++) {
         ctx.base[i] = base[i];
         ctx.row_bytes[i] = row_bytes[i];
         ctx.xq[i] = xq + (uint64_t)i * n_blocks;
+        ctx.selected[i] = selected[i];
     }
 
     ds4_parallel_for(out_dim0, matvec_q2_k_accum_worker, &ctx);
@@ -4452,6 +4487,8 @@ typedef struct {
     uint64_t gate_row_bytes[DS4_MAX_EXPERT];
     uint64_t up_row_bytes[DS4_MAX_EXPERT];
     uint64_t xq_blocks;
+    int expert_start;   /* distributed: first owned expert */
+    int expert_end;     /* distributed: one-past-last owned expert */
 } matvec_iq2_xxs_batch_mid_ctx;
 
 static void matvec_iq2_xxs_batch_mid_worker(void *vctx, uint64_t task0, uint64_t task1) {
@@ -4461,6 +4498,18 @@ static void matvec_iq2_xxs_batch_mid_worker(void *vctx, uint64_t task0, uint64_t
         const uint32_t active_idx = (uint32_t)(task / ctx->out_dim);
         const uint64_t row = task - (uint64_t)active_idx * ctx->out_dim;
         const uint32_t expert = ctx->active_expert[active_idx];
+
+        /* In distributed mode, skip gate/up compute for non-owned experts. */
+        if ((int)expert < ctx->expert_start || (int)expert >= ctx->expert_end) {
+            const uint32_t begin = ctx->expert_offset[expert];
+            const uint32_t end = ctx->expert_offset[expert + 1];
+            for (uint32_t i = begin; i < end; i++) {
+                const uint32_t pair_id = ctx->pair_ids[i];
+                ctx->mid[(uint64_t)pair_id * ctx->out_dim + row] = 0.0f;
+            }
+            continue;
+        }
+
         const uint32_t begin = ctx->expert_offset[expert];
         const uint32_t end = ctx->expert_offset[expert + 1];
 
@@ -4551,6 +4600,8 @@ typedef struct {
     uint64_t out_dim;
     uint64_t row_bytes[DS4_MAX_EXPERT];
     uint64_t midq_blocks;
+    int expert_start;   /* distributed: first owned expert */
+    int expert_end;     /* distributed: one-past-last owned expert */
 } matvec_q2_k_batch_accum_rows_ctx;
 
 static void matvec_q2_k_batch_accum_rows_worker(void *vctx, uint64_t row0, uint64_t row1) {
@@ -4563,6 +4614,9 @@ static void matvec_q2_k_batch_accum_rows_worker(void *vctx, uint64_t row0, uint6
 
         for (uint32_t ai = 0; ai < ctx->n_active; ai++) {
             const uint32_t expert = ctx->active_expert[ai];
+            /* In distributed mode, skip experts not owned by this rank. */
+            if ((int)expert < ctx->expert_start || (int)expert >= ctx->expert_end)
+                continue;
             const uint32_t begin = ctx->expert_offset[expert];
             const uint32_t end = ctx->expert_offset[expert + 1];
             const block_q2_K *br = (const block_q2_K *)(ctx->base[expert] + row * ctx->row_bytes[expert]);
@@ -5766,6 +5820,14 @@ static void layer_routed_moe_one(
                                   (int64_t)down_in_dim);
         }
         matvec_q2_k_experts_accum_prequant(out, model, layer->ffn_down_exps, midq, selected, DS4_N_EXPERT_USED);
+#ifdef DS4_JACCL
+        if (g_jaccl_group) {
+            float *tmp = alloca(DS4_N_EMBD * sizeof(float));
+            memcpy(tmp, out, DS4_N_EMBD * sizeof(float));
+            jaccl_group_all_sum(g_jaccl_group, tmp, out,
+                                DS4_N_EMBD * sizeof(float), JACCL_FLOAT32);
+        }
+#endif
     } else {
         for (uint32_t i = 0; i < DS4_N_EXPERT_USED; i++) {
             const uint32_t expert = (uint32_t)selected[i];
@@ -5860,6 +5922,10 @@ static void layer_routed_moe_one_prealloc(
                               (int64_t)down_in_dim);
     }
     matvec_q2_k_experts_accum_prequant(out, model, layer->ffn_down_exps, midq, selected, DS4_N_EXPERT_USED);
+    /* NOTE: all_sum is NOT done here — callers are responsible for
+     * synchronizing after this function returns.  This function is called
+     * from threaded contexts (routed_moe_tokens_worker via ds4_parallel_for)
+     * where concurrent all_sum would be undefined behavior. */
 
     (void)il;
 }
@@ -5947,6 +6013,8 @@ static void layer_routed_moe_batch(
         .in_dim = expert_in_dim,
         .out_dim = expert_out_dim,
         .xq_blocks = xq_blocks,
+        .expert_start = g_expert_start,
+        .expert_end   = g_expert_end,
     };
 
     for (uint32_t ai = 0; ai < n_active; ai++) {
@@ -5988,6 +6056,8 @@ static void layer_routed_moe_batch(
         .in_dim = down_in_dim,
         .out_dim = down_out_dim,
         .midq_blocks = midq_blocks,
+        .expert_start = g_expert_start,
+        .expert_end   = g_expert_end,
     };
 
     for (uint32_t ai = 0; ai < n_active; ai++) {
@@ -6002,6 +6072,14 @@ static void layer_routed_moe_batch(
 
     ds4_parallel_for(down_out_dim, matvec_q2_k_batch_accum_rows_worker, &down_ctx);
 
+#ifdef DS4_JACCL
+    if (g_jaccl_group) {
+        jaccl_group_all_sum(g_jaccl_group, moe, moe,
+                            (size_t)n_tok * DS4_N_EMBD * sizeof(float),
+                            JACCL_FLOAT32);
+    }
+#endif
+
     free(midq);
     free(pair_ids);
     free(xq);
@@ -6168,6 +6246,14 @@ static void layer_ffn_one_decode_scratch(
                                   scratch->routed_mid_all,
                                   scratch->routed_xq,
                                   scratch->routed_midq);
+#ifdef DS4_JACCL
+    if (g_jaccl_group) {
+        float *tmp = alloca(DS4_N_EMBD * sizeof(float));
+        memcpy(tmp, scratch->ffn_moe, DS4_N_EMBD * sizeof(float));
+        jaccl_group_all_sum(g_jaccl_group, tmp, scratch->ffn_moe,
+                            DS4_N_EMBD * sizeof(float), JACCL_FLOAT32);
+    }
+#endif
     if (profile) t_routed = now_sec() - t0;
 
     t0 = profile ? now_sec() : 0.0;
@@ -6392,6 +6478,16 @@ static void layer_ffn_shared_batch(
                                           routed_midq);
         }
     }
+#ifdef DS4_JACCL
+    /* Single all_sum after all tokens are done — covers both the parallel
+     * path (where per-token all_sum inside threads would be UB) and the
+     * serial fallback. */
+    if (g_jaccl_group) {
+        jaccl_group_all_sum(g_jaccl_group, moe, moe,
+                            (size_t)n_tok * DS4_N_EMBD * sizeof(float),
+                            JACCL_FLOAT32);
+    }
+#endif
     if (profile) t_routed = now_sec() - t0;
 
     t0 = profile ? now_sec() : 0.0;
@@ -9137,6 +9233,12 @@ static bool metal_graph_ensure_batch_ffn_out(ds4_gpu_graph *g) {
     return g->batch_ffn_out != NULL;
 }
 
+#ifdef DS4_JACCL
+/* CPU masking functions removed — replaced by GPU kernels
+ * ds4_gpu_expert_mask() and ds4_gpu_expert_mask_batch() which run
+ * in the same command buffer without breaking the batch. */
+#endif
+
 /* =========================================================================
  * Metal Release Graph Allocation.
  * ========================================================================= */
@@ -10471,6 +10573,19 @@ static bool metal_graph_encode_decode_layer(
         metal_graph_debug_dump_i32_tensor("ffn_moe_topk", g->router_selected, DS4_N_EXPERT_USED, il, pos);
         metal_graph_debug_dump_tensor("ffn_moe_weights_scaled", g->router_weights, DS4_N_EXPERT_USED, il, pos);
     }
+#ifdef DS4_JACCL
+    if (ok && g_jaccl_group) {
+        if (getenv("DS4_EXPERT_COMPACT")) {
+            uint32_t compacted = 0;
+            ok = ds4_gpu_expert_compact(g->router_selected, g->router_weights,
+                                        g_expert_start, g_expert_end,
+                                        DS4_N_EXPERT_USED, &compacted) != 0;
+        } else {
+            ok = ds4_gpu_expert_mask(g->router_selected, g->router_weights,
+                                     g_expert_start, g_expert_end, DS4_N_EXPERT_USED) != 0;
+        }
+    }
+#endif
     if (ok) ok = ds4_gpu_routed_moe_one_tensor(g->routed_out,
                                                  g->routed_gate,
                                                  g->routed_up,
@@ -10508,6 +10623,32 @@ static bool metal_graph_encode_decode_layer(
     if (ok) {
         metal_graph_debug_dump_tensor("ffn_moe_out", g->routed_out, DS4_N_EMBD, il, pos);
     }
+#ifdef DS4_JACCL
+    if (ok && g_jaccl_group) {
+        static bool first_allsum = true;
+        if (first_allsum) {
+            fprintf(stderr, "ds4: first all_sum layer=%u n_bytes=%zu\n",
+                    il, (size_t)(DS4_N_EMBD * sizeof(float)));
+            first_allsum = false;
+        }
+        if (!ds4_gpu_end_commands() || !ds4_gpu_begin_commands()) {
+            ok = 0;
+        }
+        if (ok) {
+        float *buf = (float *)ds4_gpu_tensor_contents(g->routed_out);
+        /* fp16 all_sum: halves RDMA payload (28KB → 14KB) with no quality
+         * loss — downstream HC post accumulates in fp32. */
+        static _Float16 *allsum_f16 = NULL;
+        if (!allsum_f16) allsum_f16 = malloc(DS4_N_EMBD * sizeof(_Float16));
+        for (uint32_t i = 0; i < DS4_N_EMBD; i++) allsum_f16[i] = (_Float16)buf[i];
+        jaccl_group_all_sum(g_jaccl_group, allsum_f16, allsum_f16,
+                            DS4_N_EMBD * sizeof(_Float16), JACCL_FLOAT16);
+        for (uint32_t i = 0; i < DS4_N_EMBD; i++) buf[i] = (float)allsum_f16[i];
+        }
+        metal_graph_debug_dump_tensor("ffn_moe_out_after_allsum",
+                                      g->routed_out, DS4_N_EMBD, il, pos);
+    }
+#endif
     const bool fuse_shared_gate_up =
         !g->quality &&
         getenv("DS4_METAL_DISABLE_SHARED_GATE_UP_SWIGLU_FUSION") == NULL;
@@ -10935,6 +11076,14 @@ static void metal_graph_trace_layer_stages(
                                   routed_mid_all,
                                   routed_xq,
                                   routed_midq);
+#ifdef DS4_JACCL
+    if (g_jaccl_group) {
+        float *tmp = alloca(DS4_N_EMBD * sizeof(float));
+        memcpy(tmp, cpu_routed, DS4_N_EMBD * sizeof(float));
+        jaccl_group_all_sum(g_jaccl_group, tmp, cpu_routed,
+                            DS4_N_EMBD * sizeof(float), JACCL_FLOAT32);
+    }
+#endif
     if (layer->ffn_gate_tid2eid) {
         layer_hash_selected_experts(selected, model, layer, token);
         layer_hash_router_weights_one(expert_weight, model, layer, cpu_ffn_norm, selected);
@@ -11159,6 +11308,14 @@ static int metal_graph_decode_test(
                                   routed_mid_all,
                                   routed_xq,
                                   routed_midq);
+#ifdef DS4_JACCL
+    if (g_jaccl_group) {
+        float *tmp = alloca(DS4_N_EMBD * sizeof(float));
+        memcpy(tmp, cpu_routed, DS4_N_EMBD * sizeof(float));
+        jaccl_group_all_sum(g_jaccl_group, tmp, cpu_routed,
+                            DS4_N_EMBD * sizeof(float), JACCL_FLOAT32);
+    }
+#endif
     if (layer->ffn_gate_tid2eid) {
         layer_hash_selected_experts(selected, model, layer, token);
         layer_hash_router_weights_one(expert_weight, model, layer, cpu_ffn_norm, selected);
@@ -13311,6 +13468,12 @@ static bool metal_graph_encode_layer_ffn_batch(
                                       (uint64_t)n_tokens * DS4_N_EXPERT_USED, il, pos0);
     }
     DS4_METAL_PROFILE_FFN_STAGE("router");
+#ifdef DS4_JACCL
+    if (ok && g_jaccl_group)
+        ok = ds4_gpu_expert_mask_batch(g->batch_router_selected, g->batch_router_weights,
+                                       g_expert_start, g_expert_end,
+                                       DS4_N_EXPERT_USED, n_tokens) != 0;
+#endif
 
     if (ok) {
         ok = ds4_gpu_routed_moe_batch_tensor(g->batch_routed_out,
@@ -13367,6 +13530,16 @@ static bool metal_graph_encode_layer_ffn_batch(
                                       (uint64_t)n_tokens * DS4_N_EMBD, il, pos0);
     }
     DS4_METAL_PROFILE_FFN_STAGE("routed_moe");
+#ifdef DS4_JACCL
+    if (ok && g_jaccl_group) {
+        /* Batch fused MoE kernel is synchronous. batch_routed_out is
+         * StorageModeShared — CPU-readable, RDMA-registerable. */
+        void *buf = ds4_gpu_tensor_contents(g->batch_routed_out);
+        jaccl_group_all_sum(g_jaccl_group, buf, buf,
+                            (size_t)n_tokens * DS4_N_EMBD * sizeof(float),
+                            JACCL_FLOAT32);
+    }
+#endif
     if (ok) ok = metal_graph_matmul_q8_0_named_tensor("shared_gate",
                                                       il,
                                                       pos0,
@@ -15070,6 +15243,12 @@ struct ds4_engine {
     bool quality;
     bool metal_ready;
     bool mtp_ready;
+    /* Distributed (JACCL) */
+    void *jaccl_group;     /* jaccl_group_t, NULL when not distributed */
+    int   world_size;      /* total ranks (1 when single-node) */
+    int   rank;            /* this process's rank */
+    int   expert_start;    /* first expert owned by this rank */
+    int   expert_end;      /* one-past-last expert owned */
 };
 
 static bool cpu_directional_steering_enabled(
@@ -18033,6 +18212,38 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
                 e->mtp_draft_tokens);
     }
 
+#ifdef DS4_JACCL
+    if (opt->distributed) {
+        e->jaccl_group = jaccl_init_from_env(/*strict=*/true);
+        if (!e->jaccl_group) {
+            fprintf(stderr, "ds4: JACCL init failed\n");
+            ds4_engine_close(e);
+            *out = NULL;
+            return 1;
+        }
+        e->world_size = jaccl_group_size(e->jaccl_group);
+        e->rank = jaccl_group_rank(e->jaccl_group);
+        int experts_per_rank = (int)DS4_N_EXPERT / e->world_size;
+        e->expert_start = e->rank * experts_per_rank;
+        e->expert_end = (e->rank == e->world_size - 1)
+                            ? (int)DS4_N_EXPERT
+                            : e->expert_start + experts_per_rank;
+        fprintf(stderr, "ds4: JACCL group=%p rank=%d/%d experts=[%d,%d) of %d\n",
+                (void*)e->jaccl_group, e->rank, e->world_size,
+                e->expert_start, e->expert_end, (int)DS4_N_EXPERT);
+    } else
+#endif
+    {
+        e->jaccl_group = NULL;
+        e->world_size = 1;
+        e->rank = 0;
+        e->expert_start = 0;
+        e->expert_end = (int)DS4_N_EXPERT;
+    }
+    g_jaccl_group  = e->jaccl_group;
+    g_expert_start = e->expert_start;
+    g_expert_end   = e->expert_end;
+
 #ifndef DS4_NO_GPU
     if (e->backend == DS4_BACKEND_CUDA) {
 #ifdef __APPLE__
@@ -18144,6 +18355,12 @@ int ds4_engine_model_id(ds4_engine *e) {
 
 void ds4_engine_close(ds4_engine *e) {
     if (!e) return;
+    g_jaccl_group  = NULL;
+    g_expert_start = 0;
+    g_expert_end   = 0;
+#ifdef DS4_JACCL
+    if (e->jaccl_group) jaccl_group_free(e->jaccl_group);
+#endif
     weights_free(&e->weights);
     vocab_free(&e->vocab);
     ds4_threads_shutdown();
diff --git a/ds4.h b/ds4.h
index f1a8e9e4b..2f2bfe785 100644
--- a/ds4.h
+++ b/ds4.h
@@ -73,6 +73,7 @@ typedef struct {
     bool warm_weights;
     bool quality;
     bool inspect_only;
+    bool distributed;
 } ds4_engine_options;
 
 typedef void (*ds4_token_emit_fn)(void *ud, int token);
diff --git a/ds4_cli.c b/ds4_cli.c
index dfac149b3..8118d71c7 100644
--- a/ds4_cli.c
+++ b/ds4_cli.c
@@ -116,6 +116,9 @@ static void usage(FILE *fp) {
         "      Touch mapped tensor pages before generation. Slower startup, fewer first-use stalls.\n"
         "  --power N\n"
         "      Target GPU duty cycle percentage, 1..100. Default: 100\n"
+        "  --distributed\n"
+        "      Enable JACCL distributed expert parallelism across RDMA-connected nodes.\n"
+        "      Requires JACCL=1 build and JACCL_RANK/JACCL_COORDINATOR env vars.\n"
         "\n"
         "Prompt and generation:\n"
         "  -p, --prompt TEXT\n"
@@ -1494,6 +1497,8 @@ static cli_config parse_options(int argc, char **argv) {
             c.engine.backend = DS4_BACKEND_METAL;
         } else if (!strcmp(arg, "--cuda")) {
             c.engine.backend = DS4_BACKEND_CUDA;
+        } else if (!strcmp(arg, "--distributed")) {
+            c.engine.distributed = true;
         } else if (!strcmp(arg, "--dump-tokens")) {
             c.gen.dump_tokens = true;
         } else if (!strcmp(arg, "--dump-logits")) {
diff --git a/ds4_gpu.h b/ds4_gpu.h
index 2872b46a4..d2b533fc7 100644
--- a/ds4_gpu.h
+++ b/ds4_gpu.h
@@ -810,6 +810,25 @@ int ds4_gpu_shared_down_hc_expand_q8_0_tensor(
         uint32_t                n_embd,
         uint32_t                n_hc);
 
+/* =========================================================================
+ * Expert Ownership Mask (JACCL distributed).
+ * =========================================================================
+ *
+ * GPU-side masking of router weights for experts not owned by this rank.
+ * Eliminates the command buffer break that the CPU masking path required.
+ */
+
+int ds4_gpu_expert_mask(const ds4_gpu_tensor *selected, ds4_gpu_tensor *weights,
+                        int32_t expert_start, int32_t expert_end, uint32_t n_expert_used);
+
+int ds4_gpu_expert_mask_batch(const ds4_gpu_tensor *selected, ds4_gpu_tensor *weights,
+                              int32_t expert_start, int32_t expert_end,
+                              uint32_t n_expert_used, uint32_t n_tokens);
+
+int ds4_gpu_expert_compact(const ds4_gpu_tensor *selected, ds4_gpu_tensor *weights,
+                           int32_t expert_start, int32_t expert_end,
+                           uint32_t n_expert_used, uint32_t *compacted_count);
+
 int ds4_gpu_matmul_q8_0_hc_expand_tensor(
         ds4_gpu_tensor       *out_hc,
         ds4_gpu_tensor       *block_out,
diff --git a/ds4_metal.m b/ds4_metal.m
index 465fb6294..20196e44e 100644
--- a/ds4_metal.m
+++ b/ds4_metal.m
@@ -101,6 +101,9 @@
 static id<MTLComputePipelineState> g_dsv4_router_finalize_one_pipeline;
 static id<MTLComputePipelineState> g_dsv4_router_weights_one_pipeline;
 static id<MTLComputePipelineState> g_dsv4_hc_expand4_pipeline;
+static id<MTLComputePipelineState> g_expert_mask_pipeline;
+static id<MTLComputePipelineState> g_expert_mask_batch_pipeline;
+static id<MTLComputePipelineState> g_expert_compact_pipeline;
 static NSMutableDictionary<NSString *, id<MTLComputePipelineState>> *g_pipeline_cache;
 static NSMutableDictionary<NSString *, id<MTLBuffer>> *g_model_buffer_cache;
 static NSMutableArray<id<MTLBuffer>> *g_transient_buffers;
@@ -1508,6 +1511,7 @@ void ds4_gpu_set_quality(bool quality) {
         @[@"DS4_METAL_NORM_SOURCE",       @"metal/norm.metal"],
         @[@"DS4_METAL_BIN_SOURCE",        @"metal/bin.metal"],
         @[@"DS4_METAL_SET_ROWS_SOURCE",   @"metal/set_rows.metal"],
+        @[@"DS4_METAL_EXPERT_MASK_SOURCE", @"metal/expert_mask.metal"],
     ];
 
     NSMutableString *source = [NSMutableString stringWithString:base];
@@ -4159,6 +4163,13 @@ int ds4_gpu_init(void) {
             return 0;
         }
 
+        /* Expert mask kernels — non-fatal, only needed for distributed JACCL. */
+        g_expert_mask_pipeline = ds4_gpu_get_pipeline("kernel_expert_mask");
+        g_expert_mask_batch_pipeline = ds4_gpu_get_pipeline("kernel_expert_mask_batch");
+        g_expert_compact_pipeline = ds4_gpu_get_pipeline("kernel_expert_compact");
+        if (!g_expert_mask_pipeline || !g_expert_mask_batch_pipeline)
+            fprintf(stderr, "ds4: Metal expert_mask kernel(s) not found (distributed masking unavailable)\n");
+
         g_initialized = 1;
     }
 
@@ -4522,6 +4533,9 @@ void ds4_gpu_cleanup(void) {
         g_dsv4_router_finalize_one_pipeline = nil;
         g_dsv4_router_weights_one_pipeline = nil;
         g_dsv4_hc_expand4_pipeline = nil;
+        g_expert_mask_pipeline = nil;
+        g_expert_mask_batch_pipeline = nil;
+        g_expert_compact_pipeline = nil;
         g_flash_attn_mask_buffer = nil;
         g_flash_attn_pad_buffer = nil;
         g_flash_attn_tmp_buffer = nil;
@@ -13990,6 +14004,21 @@ int ds4_gpu_routed_moe_one_tensor(
         const NSUInteger gate_smem = ds4_gpu_routed_mv_smem(gate_type);
         const NSUInteger down_smem = ds4_gpu_routed_mv_smem(down_type);
         int ok = 1;
+#ifdef DS4_JACCL
+        if (g_jaccl_group) {
+            if (!ds4_gpu_finish_command_buffer(cb, owned, "pre_moe_mid_zero")) return 0;
+            int blit_owned = 0;
+            id<MTLCommandBuffer> blit_cb = ds4_gpu_command_buffer(&blit_owned);
+            if (!blit_cb) return 0;
+            id<MTLBlitCommandEncoder> blit = [blit_cb blitCommandEncoder];
+            [blit fillBuffer:midbuf range:NSMakeRange(ds4_gpu_tensor_offset(mid), mid_bytes) value:0];
+            [blit endEncoding];
+            if (!ds4_gpu_finish_command_buffer(blit_cb, blit_owned, "mid_zero")) return 0;
+            owned = 0;
+            cb = ds4_gpu_command_buffer(&owned);
+            if (!cb) return 0;
+        }
+#endif
         const bool write_clamped_moe =
             getenv("DS4_METAL_MOE_WRITE_CLAMPED_ACT") != NULL;
         id<MTLComputePipelineState> pair_swiglu_pipeline = nil;
@@ -14413,6 +14442,21 @@ int ds4_gpu_routed_moe_batch_tensor(
             n_tokens <= 4u &&
             down_sum6_pipeline != nil;
         int ok = 0;
+#ifdef DS4_JACCL
+        if (g_jaccl_group) {
+            if (!ds4_gpu_finish_command_buffer(cb, owned, "pre_batch_mid_zero")) return 0;
+            int blit_owned = 0;
+            id<MTLCommandBuffer> blit_cb = ds4_gpu_command_buffer(&blit_owned);
+            if (!blit_cb) return 0;
+            id<MTLBlitCommandEncoder> blit = [blit_cb blitCommandEncoder];
+            [blit fillBuffer:midbuf range:NSMakeRange(ds4_gpu_tensor_offset(mid), mid_bytes) value:0];
+            [blit endEncoding];
+            if (!ds4_gpu_finish_command_buffer(blit_cb, blit_owned, "batch_mid_zero")) return 0;
+            owned = 0;
+            cb = ds4_gpu_command_buffer(&owned);
+            if (!cb) return 0;
+        }
+#endif
         if (use_mm_id) {
             /*
              * The routed pair ids are the same for gate, up, and down. Build
@@ -15811,3 +15855,107 @@ int ds4_gpu_matmul_q8_0_hc_expand_tensor(
 
     return 1;
 }
+
+/* =========================================================================
+ * Expert Ownership Mask — GPU kernel for distributed JACCL inference.
+ * =========================================================================
+ *
+ * Zeros router weights for experts not owned by this rank, entirely on GPU.
+ * Replaces the CPU path that required end_commands/begin_commands around a
+ * StorageModeShared pointer read, saving ~0.5ms per layer per token.
+ */
+
+/* Single-token decode path. */
+int ds4_gpu_expert_mask(const ds4_gpu_tensor *selected, ds4_gpu_tensor *weights,
+                        int32_t expert_start, int32_t expert_end, uint32_t n_expert_used) {
+    if (!g_expert_mask_pipeline || !selected || !weights || n_expert_used == 0) return 0;
+
+    @autoreleasepool {
+        int owned = 0;
+        id<MTLCommandBuffer> cb = ds4_gpu_command_buffer(&owned);
+        if (!cb) return 0;
+
+        struct { int32_t expert_start; int32_t expert_end; uint32_t n_expert_used; } args = {
+            expert_start, expert_end, n_expert_used
+        };
+
+        id<MTLComputeCommandEncoder> enc = ds4_gpu_compute_encoder(cb);
+        if (!enc) return 0;
+        [enc setComputePipelineState:g_expert_mask_pipeline];
+        [enc setBytes:&args length:sizeof(args) atIndex:0];
+        [enc setBuffer:ds4_gpu_tensor_buffer(selected) offset:ds4_gpu_tensor_offset(selected) atIndex:1];
+        [enc setBuffer:ds4_gpu_tensor_buffer(weights)  offset:ds4_gpu_tensor_offset(weights)  atIndex:2];
+        [enc dispatchThreads:MTLSizeMake(n_expert_used, 1, 1)
+            threadsPerThreadgroup:MTLSizeMake(n_expert_used, 1, 1)];
+        ds4_gpu_end_compute_encoder(cb, enc);
+
+        if (!ds4_gpu_finish_command_buffer(cb, owned, "expert_mask")) return 0;
+    }
+    return 1;
+}
+
+/* Batch (prefill) path: mask N tokens' worth of expert weights. */
+int ds4_gpu_expert_mask_batch(const ds4_gpu_tensor *selected, ds4_gpu_tensor *weights,
+                              int32_t expert_start, int32_t expert_end,
+                              uint32_t n_expert_used, uint32_t n_tokens) {
+    if (!g_expert_mask_batch_pipeline || !selected || !weights || n_expert_used == 0 || n_tokens == 0) return 0;
+
+    @autoreleasepool {
+        int owned = 0;
+        id<MTLCommandBuffer> cb = ds4_gpu_command_buffer(&owned);
+        if (!cb) return 0;
+
+        const uint32_t total = n_tokens * n_expert_used;
+        struct { int32_t expert_start; int32_t expert_end; uint32_t n_expert_used; uint32_t total; } args = {
+            expert_start, expert_end, n_expert_used, total
+        };
+
+        id<MTLComputeCommandEncoder> enc = ds4_gpu_compute_encoder(cb);
+        if (!enc) return 0;
+        [enc setComputePipelineState:g_expert_mask_batch_pipeline];
+        [enc setBytes:&args length:sizeof(args) atIndex:0];
+        [enc setBuffer:ds4_gpu_tensor_buffer(selected) offset:ds4_gpu_tensor_offset(selected) atIndex:1];
+        [enc setBuffer:ds4_gpu_tensor_buffer(weights)  offset:ds4_gpu_tensor_offset(weights)  atIndex:2];
+        [enc dispatchThreads:MTLSizeMake(total, 1, 1)
+            threadsPerThreadgroup:MTLSizeMake(MIN(total, (uint32_t)g_expert_mask_batch_pipeline.maxTotalThreadsPerThreadgroup), 1, 1)];
+        ds4_gpu_end_compute_encoder(cb, enc);
+
+        if (!ds4_gpu_finish_command_buffer(cb, owned, "expert_mask_batch")) return 0;
+    }
+    return 1;
+}
+
+int ds4_gpu_expert_compact(const ds4_gpu_tensor *selected, ds4_gpu_tensor *weights,
+                           int32_t expert_start, int32_t expert_end,
+                           uint32_t n_expert_used, uint32_t *compacted_count) {
+    if (!g_expert_compact_pipeline || !selected || !weights || n_expert_used == 0 || !compacted_count) return 0;
+
+    @autoreleasepool {
+        int owned = 0;
+        id<MTLCommandBuffer> cb = ds4_gpu_command_buffer(&owned);
+        if (!cb) return 0;
+
+        id<MTLBuffer> count_buf = ds4_gpu_new_transient_buffer(sizeof(uint32_t), "expert_compact_count");
+        if (!count_buf) return 0;
+
+        struct { int32_t expert_start; int32_t expert_end; uint32_t n_expert_used; } args = {
+            expert_start, expert_end, n_expert_used
+        };
+
+        id<MTLComputeCommandEncoder> enc = ds4_gpu_compute_encoder(cb);
+        if (!enc) return 0;
+        [enc setComputePipelineState:g_expert_compact_pipeline];
+        [enc setBytes:&args length:sizeof(args) atIndex:0];
+        [enc setBuffer:ds4_gpu_tensor_buffer(selected) offset:ds4_gpu_tensor_offset(selected) atIndex:1];
+        [enc setBuffer:ds4_gpu_tensor_buffer(weights)  offset:ds4_gpu_tensor_offset(weights)  atIndex:2];
+        [enc setBuffer:count_buf offset:0 atIndex:3];
+        [enc dispatchThreads:MTLSizeMake(n_expert_used, 1, 1)
+            threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+        ds4_gpu_end_compute_encoder(cb, enc);
+
+        if (!ds4_gpu_finish_command_buffer(cb, owned, "expert_compact")) return 0;
+
+        *compacted_count = *(const uint32_t *)count_buf.contents;
+    }
+    return 1;
+}
diff --git a/ds4_server.c b/ds4_server.c
index a9930d603..253258f5e 100644
--- a/ds4_server.c
+++ b/ds4_server.c
@@ -11392,6 +11392,9 @@ static void usage(FILE *fp) {
         "      Target GPU duty cycle percentage, 1..100. Default: 100\n"
         "  --metal | --cuda | --cpu | --backend NAME\n"
         "      Select backend explicitly. Defaults to Metal on macOS and CUDA on CUDA builds.\n"
+        "  --distributed\n"
+        "      Enable JACCL distributed expert parallelism across RDMA-connected nodes.\n"
+        "      Requires JACCL=1 build and JACCL_RANK/JACCL_COORDINATOR env vars.\n"
         "\n"
         "HTTP API:\n"
         "  --host HOST\n"
@@ -11562,6 +11565,8 @@ static server_config parse_options(int argc, char **argv) {
             c.engine.backend = parse_backend_arg(need_arg(&i, argc, argv, arg), arg);
         } else if (!strcmp(arg, "--cpu")) {
             c.engine.backend = DS4_BACKEND_CPU;
+        } else if (!strcmp(arg, "--distributed")) {
+            c.engine.distributed = true;
         } else {
             server_log(DS4_LOG_DEFAULT, "ds4-server: unknown option: %s", arg);
             usage(stderr);
diff --git a/jaccl_shim.cpp b/jaccl_shim.cpp
new file mode 100644
index 000000000..ff1eaf993
--- /dev/null
+++ b/jaccl_shim.cpp
@@ -0,0 +1,46 @@
+#include "jaccl_shim.h"
+#include <jaccl/jaccl.h>
+#include <memory>
+
+static std::shared_ptr<jaccl::Group> unwrap(jaccl_group_t g) {
+    return *reinterpret_cast<std::shared_ptr<jaccl::Group>*>(g);
+}
+
+extern "C" {
+
+bool jaccl_is_available(void) {
+    return jaccl::is_available();
+}
+
+jaccl_group_t jaccl_init_from_env(bool strict) {
+    auto group = jaccl::init(strict);
+    if (!group) return nullptr;
+    auto *p = new std::shared_ptr<jaccl::Group>(std::move(group));
+    return reinterpret_cast<jaccl_group_t>(p);
+}
+
+void jaccl_group_free(jaccl_group_t g) {
+    if (!g) return;
+    delete reinterpret_cast<std::shared_ptr<jaccl::Group>*>(g);
+}
+
+int jaccl_group_rank(jaccl_group_t g) { return unwrap(g)->rank(); }
+int jaccl_group_size(jaccl_group_t g) { return unwrap(g)->size(); }
+
+void jaccl_group_all_sum(jaccl_group_t g, const void *in, void *out,
+                         size_t n_bytes, int dtype) {
+    unwrap(g)->all_sum(in, out, n_bytes, dtype);
+}
+
+void jaccl_group_barrier(jaccl_group_t g) { unwrap(g)->barrier(); }
+
+void jaccl_group_send(jaccl_group_t g, const void *buf, size_t n_bytes,
+                      int dst) {
+    unwrap(g)->send(buf, n_bytes, dst);
+}
+
+void jaccl_group_recv(jaccl_group_t g, void *buf, size_t n_bytes, int src) {
+    unwrap(g)->recv(buf, n_bytes, src);
+}
+
+} /* extern "C" */
diff --git a/jaccl_shim.h b/jaccl_shim.h
new file mode 100644
index 000000000..c8e1e6174
--- /dev/null
+++ b/jaccl_shim.h
@@ -0,0 +1,37 @@
+#ifndef JACCL_SHIM_H
+#define JACCL_SHIM_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void *jaccl_group_t;
+
+/* Dtype enum matching jaccl::Dtype (group.h). */
+enum jaccl_dtype {
+    JACCL_FLOAT32 = 11, /* jaccl::Dtype::Float32 */
+    JACCL_FLOAT16 = 9,  /* jaccl::Dtype::Float16 */
+};
+
+bool          jaccl_is_available(void);
+jaccl_group_t jaccl_init_from_env(bool strict);
+void          jaccl_group_free(jaccl_group_t g);
+int           jaccl_group_rank(jaccl_group_t g);
+int           jaccl_group_size(jaccl_group_t g);
+void          jaccl_group_all_sum(jaccl_group_t g, const void *in, void *out,
+                                  size_t n_bytes, int dtype);
+void          jaccl_group_barrier(jaccl_group_t g);
+void          jaccl_group_send(jaccl_group_t g, const void *buf,
+                               size_t n_bytes, int dst);
+void          jaccl_group_recv(jaccl_group_t g, void *buf, size_t n_bytes,
+                               int src);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/metal/expert_mask.metal b/metal/expert_mask.metal
new file mode 100644
index 000000000..c90ca7d0e
--- /dev/null
+++ b/metal/expert_mask.metal
@@ -0,0 +1,69 @@
+// DS4 Metal expert ownership mask kernel.
+// Zeros router_weights for experts not owned by this rank.
+// Runs in the same command buffer as the router select kernel —
+// no batch break needed for CPU masking.
+
+struct ds4_metal_args_expert_mask {
+    int32_t expert_start;
+    int32_t expert_end;
+    uint32_t n_expert_used;
+};
+
+// Single-token decode path: zero weights for non-owned experts.
+kernel void kernel_expert_mask(
+        constant ds4_metal_args_expert_mask & args,
+        device const int32_t * selected [[buffer(1)]],
+        device       float   * weights  [[buffer(2)]],
+        uint tid [[thread_position_in_grid]]) {
+    if (tid >= args.n_expert_used) return;
+    const int32_t expert_id = selected[tid];
+    if (expert_id < args.expert_start || expert_id >= args.expert_end) {
+        weights[tid] = 0.0f;
+    }
+}
+
+// Single-token compact: rewrite selected[] and weights[] to only owned experts.
+// Outputs new count to count_out[0]. Single-threaded (6 elements — not worth parallelizing).
+kernel void kernel_expert_compact(
+        constant ds4_metal_args_expert_mask & args,
+        device       int32_t * selected  [[buffer(1)]],
+        device       float   * weights   [[buffer(2)]],
+        device       uint    * count_out [[buffer(3)]],
+        uint tid [[thread_position_in_grid]]) {
+    if (tid != 0) return;
+    uint out_idx = 0;
+    for (uint i = 0; i < args.n_expert_used; i++) {
+        const int32_t expert_id = selected[i];
+        if (expert_id >= args.expert_start && expert_id < args.expert_end) {
+            selected[out_idx] = selected[i];
+            weights[out_idx] = weights[i];
+            out_idx++;
+        }
+    }
+    count_out[0] = out_idx;
+    // Zero remaining slots so kernel doesn't read garbage
+    for (uint i = out_idx; i < args.n_expert_used; i++) {
+        selected[i] = 0;
+        weights[i] = 0.0f;
+    }
+}
+
+struct ds4_metal_args_expert_mask_batch {
+    int32_t expert_start;
+    int32_t expert_end;
+    uint32_t n_expert_used;
+    uint32_t total;          // n_tokens * n_expert_used
+};
+
+// Batch (prefill) path: n_tokens * n_expert_used threads.
+kernel void kernel_expert_mask_batch(
+        constant ds4_metal_args_expert_mask_batch & args,
+        device const int32_t * selected [[buffer(1)]],
+        device       float   * weights  [[buffer(2)]],
+        uint tid [[thread_position_in_grid]]) {
+    if (tid >= args.total) return;
+    const int32_t expert_id = selected[tid];
+    if (expert_id < args.expert_start || expert_id >= args.expert_end) {
+        weights[tid] = 0.0f;
+    }
+}
diff --git a/metal/moe.metal b/metal/moe.metal
index c776e8ddc..9f3837ac9 100644
--- a/metal/moe.metal
+++ b/metal/moe.metal
@@ -1014,6 +1014,23 @@ kernel void kernel_mul_mv_id_iq2_xxs_pair_swiglu_f32(
     const int64_t i11 = idx % args.ne11;
     const int64_t i12 = iid1;
 
+    // Distributed early exit: skip matmul entirely for non-owned experts.
+    // Zero mid[] so the downstream down-projection reads zeros, not stale data.
+    {
+        device const float *route_w_early =
+            (device const float *)(weights + (uint64_t)idx * act.weight_stride);
+        if (route_w_early[0] == 0.0f) {
+            if (tiisg == 0) {
+                const int fr = (tgpig.x * NSG + sgitg) * N_R0_IQ2_XXS;
+                device float *mid_zero =
+                    (device float *)(dst_mid + (uint64_t)idx * act.mid_row_stride);
+                for (int r = 0; r < N_R0_IQ2_XXS && fr + r < args.ne0; ++r)
+                    mid_zero[fr + r] = 0.0f;
+            }
+            return;
+        }
+    }
+
     const int nb = args.ne00 / QK_K;
     const int first_row = (tgpig.x * NSG + sgitg) * N_R0_IQ2_XXS;
     const int nb32 = nb * (QK_K / 32);
@@ -1189,6 +1206,7 @@ kernel void kernel_mul_mv_id_q4_K_pair_f32(
 // for gate and up, then the same lane that wrote each row derives the routed
 // SwiGLU input.  This keeps Q4 behavior aligned with the Q2 optimization while
 // preserving the old pair projection arithmetic.
+// Q4_K pair swiglu — gate+up matmul fused with SwiGLU + expert weight (distributed early-exit)
 kernel void kernel_mul_mv_id_q4_K_pair_swiglu_f32(
         constant ds4_metal_args_mul_mv_id & args,
         constant ds4_metal_dsv4_moe_swiglu_weight_args & act,
@@ -1214,6 +1232,24 @@ kernel void kernel_mul_mv_id_q4_K_pair_swiglu_f32(
     const int64_t i11 = idx % args.ne11;
     const int64_t i12 = iid1;
 
+    // Distributed early exit: skip matmul for non-owned experts.
+    // Zero mid[] so the downstream down-projection reads zeros, not stale data.
+    {
+        device const float *route_w_early =
+            (device const float *)(weights + (uint64_t)idx * act.weight_stride);
+        if (route_w_early[0] == 0.0f) {
+            if (tiisg == 0) {
+                const short NSG_e = FC_mul_mv_nsg;
+                const int fr = (tgpig.x * NSG_e + sgitg) * N_R0_Q4_K;
+                device float *mid_zero =
+                    (device float *)(dst_mid + (uint64_t)idx * act.mid_row_stride);
+                for (int r = 0; r < N_R0_Q4_K && fr + r < args.ne0; ++r)
+                    mid_zero[fr + r] = 0.0f;
+            }
+            return;
+        }
+    }
+
     device const char *src0_gate_cur = src0_gate + i02 * args.nb02;
     device const char *src0_up_cur   = src0_up   + i02 * args.nb02;
     device const char *src1_cur      = src1      + i11 * args.nb11 + i12 * args.nb12;
diff --git a/tests/test_distributed_correctness.sh b/tests/test_distributed_correctness.sh
new file mode 100755
index 000000000..7d860ce41
--- /dev/null
+++ b/tests/test_distributed_correctness.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+# test_distributed_correctness.sh — Verify distributed ds4 produces identical
+# output to single-node.
+#
+# Phase 1 (automated): Run single-node CPU baseline and capture output.
+# Phase 2 (manual):    Run the same prompt distributed, then compare.
+#
+# Requires: ds4 binary built with JACCL=1 (make JACCL=1)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+DS4_ROOT="$(dirname "$SCRIPT_DIR")"
+DS4_BIN="${DS4_ROOT}/ds4"
+MODEL="${DS4_ROOT}/ds4flash.gguf"
+
+PROMPT="The capital of France is"
+CTX=4096
+N_PREDICT=32
+
+BASELINE_FILE="/tmp/ds4_correctness_baseline.txt"
+DISTRIBUTED_FILE="/tmp/ds4_correctness_distributed.txt"
+
+# --- Colors ---
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+NC='\033[0m'
+
+echo "=== ds4 distributed correctness test ==="
+echo "Binary:  $DS4_BIN"
+echo "Model:   $MODEL"
+echo "Prompt:  \"$PROMPT\""
+echo "Context: $CTX"
+echo "Predict: $N_PREDICT tokens"
+echo ""
+
+# --- Check prerequisites ---
+if [[ ! -x "$DS4_BIN" ]]; then
+    echo -e "${RED}Error: ds4 binary not found at $DS4_BIN${NC}"
+    echo "Build with: make JACCL=1"
+    exit 1
+fi
+
+if [[ ! -f "$MODEL" ]]; then
+    echo -e "${RED}Error: model not found at $MODEL${NC}"
+    echo "Symlink or copy your GGUF model to $MODEL"
+    exit 1
+fi
+
+# ========================================
+# Phase 1: Single-node CPU baseline
+# ========================================
+echo "--- Phase 1: Single-node CPU baseline ---"
+echo "Running: $DS4_BIN --cpu -c $CTX -n $N_PREDICT -p \"$PROMPT\""
+echo ""
+
+"$DS4_BIN" --cpu -c "$CTX" -n "$N_PREDICT" -p "$PROMPT" 2>/dev/null \
+    | tee "$BASELINE_FILE"
+
+echo ""
+echo -e "${GREEN}Baseline captured to $BASELINE_FILE${NC}"
+echo ""
+
+# ========================================
+# Phase 2: Distributed (manual instructions)
+# ========================================
+echo "--- Phase 2: Distributed run (manual) ---"
+echo ""
+echo "To run the same prompt distributed across 2 nodes, execute on each node:"
+echo ""
+echo "  Node 0 (coordinator):"
+echo "    export JACCL_RANK=0"
+echo "    export JACCL_WORLD_SIZE=2"
+echo "    export JACCL_COORDINATOR=<rank0_lan_ip>"
+echo "    export JACCL_IBV_DEVICES='[[null, \"rdma_enX\"], [\"rdma_enY\", null]]'"
+echo "    $DS4_BIN --distributed --cpu -c $CTX -n $N_PREDICT -p \"$PROMPT\" > $DISTRIBUTED_FILE 2>/dev/null"
+echo ""
+echo "  Node 1:"
+echo "    export JACCL_RANK=1"
+echo "    export JACCL_WORLD_SIZE=2"
+echo "    export JACCL_COORDINATOR=<rank0_lan_ip>"
+echo "    export JACCL_IBV_DEVICES='[[null, \"rdma_enX\"], [\"rdma_enY\", null]]'"
+echo "    $DS4_BIN --distributed --cpu -c $CTX -n $N_PREDICT -p \"$PROMPT\" > /dev/null 2>/dev/null"
+echo ""
+echo "  Or use the launch script:"
+echo "    ./distributed_launch.sh --nodes hub,m3u4 --model $MODEL --extra \"--cpu -n $N_PREDICT -p '$PROMPT'\""
+echo ""
+echo "Then run this script with --compare to check results:"
+echo "    $0 --compare"
+echo ""
+
+# ========================================
+# Phase 3: Compare outputs
+# ========================================
+if [[ "${1:-}" == "--compare" ]]; then
+    echo "--- Phase 3: Comparing outputs ---"
+
+    if [[ ! -f "$BASELINE_FILE" ]]; then
+        echo -e "${RED}Error: baseline file not found at $BASELINE_FILE${NC}"
+        echo "Run this script without --compare first to generate the baseline."
+        exit 1
+    fi
+
+    if [[ ! -f "$DISTRIBUTED_FILE" ]]; then
+        echo -e "${RED}Error: distributed output not found at $DISTRIBUTED_FILE${NC}"
+        echo "Run the distributed command from Phase 2, saving rank 0's output to $DISTRIBUTED_FILE"
+        exit 1
+    fi
+
+    echo "Baseline output:"
+    cat "$BASELINE_FILE"
+    echo ""
+    echo "Distributed output:"
+    cat "$DISTRIBUTED_FILE"
+    echo ""
+
+    if diff -q "$BASELINE_FILE" "$DISTRIBUTED_FILE" > /dev/null 2>&1; then
+        echo -e "${GREEN}PASS: Outputs are identical.${NC}"
+        exit 0
+    else
+        echo -e "${RED}FAIL: Outputs differ.${NC}"
+        echo ""
+        echo "Diff:"
+        diff --color=auto "$BASELINE_FILE" "$DISTRIBUTED_FILE" || true
+        exit 1
+    fi
+fi
diff --git a/tests/test_jaccl_shim.c b/tests/test_jaccl_shim.c
new file mode 100644
index 000000000..2abb0f5c7
--- /dev/null
+++ b/tests/test_jaccl_shim.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "../jaccl_shim.h"
+
+int main(void) {
+    printf("test_jaccl_shim: checking JACCL availability...\n");
+
+    bool avail = jaccl_is_available();
+    printf("  jaccl_is_available() = %s\n", avail ? "true" : "false");
+
+    if (avail) {
+        printf("  JACCL RDMA is available on this system.\n");
+
+        /* Try non-strict init (returns NULL if env vars not set). */
+        jaccl_group_t g = jaccl_init_from_env(false);
+        if (g) {
+            int rank = jaccl_group_rank(g);
+            int size = jaccl_group_size(g);
+            printf("  Initialized group: rank=%d, size=%d\n", rank, size);
+            jaccl_group_free(g);
+            printf("  Group freed successfully.\n");
+        } else {
+            printf("  Non-strict init returned NULL (env vars not set -- expected in unit test).\n");
+        }
+    } else {
+        printf("  JACCL RDMA not available (no TB5 RDMA hardware or SDK < 26.2).\n");
+    }
+
+    printf("test_jaccl_shim: PASS (linkage verified)\n");
+    return 0;
+}