diff --git a/.gitignore b/.gitignore
index 2c70e4d61..cac9297e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 /ds4
 /ds4-server
 /ds4-bench
+/ds4-bench.exe
 /ds4_native
 /ds4_server_test
 /ds4_test
@@ -8,7 +9,11 @@
 /TODO.md
 /gguf/
 *.o
+*.exe
 *.dSYM/
+# Generated MSVC import lib for the Windows ROCm build (regenerated from the
+# HIP SDK's libhipblas.dll by win/build-rocm.sh).
+/win/third_party/hipblas.lib
 /misc/
 .*.swp
 .DS_Store
diff --git a/Makefile b/Makefile
index 42cde3c35..750de1ae4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,28 @@
 CC ?= cc
 UNAME_S := $(shell uname -s)
 
+# On MinGW/MSYS `uname -s` is e.g. MINGW64_NT-10.0 or MSYS_NT-10.0.
+IS_WINDOWS := $(filter MINGW% MSYS%,$(UNAME_S))
+
+# MinGW has no `cc`; default the compiler to gcc there (still overridable).
+ifneq ($(IS_WINDOWS),)
+ifeq ($(origin CC),default)
+CC := gcc
+endif
+endif
+
 ifeq ($(UNAME_S),Darwin)
 NATIVE_CPU_FLAG ?= -mcpu=native
 else
 NATIVE_CPU_FLAG ?= -march=native
 endif
 
+# Native Windows (MinGW-w64) CPU build flags. ds4.c pulls in the dependency-free
+# POSIX shim (ds4_win.h) behind #ifdef _WIN32; no extra -I/-include is needed.
+WIN_CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -std=c99 -D_GNU_SOURCE \
+	-fno-finite-math-only -DDS4_NO_GPU -D_CRT_SECURE_NO_WARNINGS
+WIN_LDLIBS ?= -lm
+
 CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c99
 OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc
 
@@ -18,7 +34,23 @@ METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
 CORE_OBJS = ds4.o ds4_metal.o
 CPU_CORE_OBJS = ds4_cpu.o
 else
+
 CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
+
+ifeq ($(GPU_BACKEND),rocm)
+ROCM_PATH ?= /opt/rocm
+GPU_CC = $(ROCM_PATH)/bin/hipcc
+ROCM_ARCH ?= gfx1151
+
+GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
+GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas
+
+@echo "ROCM_ARCH: $(ROCM_ARCH)"
+
+EXTRA_DEPS = ds4_rocm.h
+
+else
+
 CUDA_HOME ?= /usr/local/cuda
 NVCC ?= $(CUDA_HOME)/bin/nvcc
 CUDA_ARCH ?=
@@ -27,12 +59,21 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
 endif
 NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
+
+GPU_CC = $(NVCC)
+GPU_CFLAGS = $(NVCCFLAGS)
+GPU_LDLIBS = $(CUDA_LDLIBS)
+
+endif
+
 CORE_OBJS = ds4.o ds4_cuda.o
+EXTRA_DEPS =
 CPU_CORE_OBJS = ds4_cpu.o
 METAL_LDLIBS := $(LDLIBS)
+
 endif
 
-.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
+.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression windows-cpu
 
 ifeq ($(UNAME_S),Darwin)
 all: ds4 ds4-server ds4-bench
@@ -60,6 +101,46 @@ cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE
 
 cuda-regression:
 	@echo "cuda-regression requires a CUDA build"
+
+else ifneq ($(IS_WINDOWS),)
+# ---- Native Windows (MinGW-w64 / HIP-clang) -------------------------------
+# CPU bench builds with MinGW. The GPU (ROCm/HIP) bench builds with the AMD HIP
+# SDK for gfx1151. The CLI (linenoise/termios + sigaction) and server (BSD
+# sockets/poll) still need Windows ports; see win/README.md.
+#
+# Windows ROCm/HIP build settings. hipcc.exe's .bat wrapper splits args on
+# spaces, so the actual compile/link is delegated to win/build-rocm.sh, which
+# relies on the SDK's default include search and a space-free import-lib dir.
+ROCM_PATH ?= C:/Program Files/AMD/ROCm/7.1
+ROCM_ARCH ?= gfx1151
+
+all: help
+
+help:
+	@echo "DS4 build targets (native Windows):"
+	@echo "  make windows-cpu    Build native Windows CPU   ./ds4-bench.exe (MinGW)"
+	@echo "  make windows-rocm   Build native Windows ROCm  ./ds4-bench.exe (HIP, gfx1151)"
+	@echo "  make clean          Remove build outputs"
+	@echo ""
+	@echo "  windows-rocm uses the AMD HIP SDK (ROCM_PATH=$(ROCM_PATH),"
+	@echo "  ROCM_ARCH=$(ROCM_ARCH)). See win/README.md for the rocWMMA vendoring"
+	@echo "  step and run caveats."
+	@echo ""
+	@echo "  ds4 (CLI) and ds4-server are not yet ported to Windows."
+
+windows-cpu: ds4-bench.exe
+
+ds4-bench.exe: ds4_bench.c ds4.c ds4.h ds4_gpu.h ds4_win.h
+	$(CC) $(WIN_CFLAGS) -c -o ds4_cpu.o ds4.c
+	$(CC) $(WIN_CFLAGS) -c -o ds4_bench_cpu.o ds4_bench.c
+	$(CC) $(WIN_CFLAGS) -o $@ ds4_bench_cpu.o ds4_cpu.o $(WIN_LDLIBS)
+
+# Native Windows ROCm/HIP ds4-bench.exe (gfx1151). Delegates to the build
+# script to work around hipcc.exe's space-splitting argument wrapper.
+.PHONY: windows-rocm
+windows-rocm:
+	ROCM_PATH="$(ROCM_PATH)" ROCM_ARCH="$(ROCM_ARCH)" bash win/build-rocm.sh
+
 else
 all: help
 
@@ -69,6 +150,7 @@ help:
 	@echo "  make cuda-generic        Build CUDA for a generic local CUDA GPU"
 	@echo "  make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
 	@echo "  make cpu                 Build CPU-only ./ds4, ./ds4-server, and ./ds4-bench"
+	@echo "  make rocm                Build ROCm"
 	@echo "  make test                Build and run tests"
 	@echo "  make clean               Remove build outputs"
 
@@ -86,14 +168,22 @@ cuda:
 	fi
 	$(MAKE) ds4 ds4-server ds4-bench CUDA_ARCH="$(CUDA_ARCH)"
 
+rocm:
+	@if [ -z "$(strip $(ROCM_ARCH))" ]; then \
+		echo "error: specify ROCM_ARCH, for example: make rocm ROCM_ARCH=gfx1151"; \
+		exit 2; \
+	fi
+	$(MAKE) ds4 ds4-server ds4-bench GPU_BACKEND=rocm ROCM_ARCH=$(ROCM_ARCH)
+
+
 ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4-server: ds4_server.o rax.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4-bench: ds4_bench.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
@@ -143,11 +233,11 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h
 ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
 	$(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m
 
-ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
-	$(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu
+ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS)
+	$(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu
 
 tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4_test: ds4_test.o rax.o $(CORE_OBJS)
 ifeq ($(UNAME_S),Darwin)
@@ -160,4 +250,5 @@ test: ds4_test
 	./ds4_test
 
 clean:
-	rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+	rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o *.exe tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+
diff --git a/ds4.c b/ds4.c
index 51410e335..52f8c978a 100644
--- a/ds4.c
+++ b/ds4.c
@@ -20,19 +20,43 @@
 #include <inttypes.h>
 #include <ctype.h>
 #include <math.h>
+#if defined(_WIN32) && defined(DS4_WIN_PTHREAD)
+/* Native Windows GPU (HIP/MSVC-ABI) build: MSVC has no <pthread.h>; use the
+ * Win32 pthread shim. The MinGW CPU build (no DS4_WIN_PTHREAD) keeps real
+ * winpthreads, so its behavior is unchanged. */
+#include "win/ds4_pthread_win.h"
+#else
 #include <pthread.h>
+#endif
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#ifdef _WIN32
+/* Native Windows CPU (MinGW-w64) and GPU (HIP/clang-MSVC) builds: a small
+ * dependency-free POSIX shim supplies mmap/flock/pread/sysconf/dprintf/
+ * fmemopen. See ds4_win.h. */
+#include "ds4_win.h"
+#include <sys/stat.h>
+#include <stdarg.h>
+#include <time.h>
+#if defined(__MINGW32__)
+#include <unistd.h>          /* MinGW provides POSIX unistd surface */
+#else
+#include <io.h>              /* MSVC-ABI build: read/write/close/lseek/isatty */
+#include <process.h>         /* getpid */
+#include <direct.h>
+#endif
+#else
 #include <sys/file.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <stdarg.h>
 #include <time.h>
 #include <unistd.h>
+#endif
 
 #include "ds4.h"
 
diff --git a/ds4_bench.c b/ds4_bench.c
index 027b2b312..08c295d6f 100644
--- a/ds4_bench.c
+++ b/ds4_bench.c
@@ -19,6 +19,12 @@
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
+#if defined(_WIN32) && !defined(__MINGW32__)
+/* Native Windows GPU (HIP/clang-MSVC) build: MSVC lacks clock_gettime/
+ * CLOCK_MONOTONIC; the shim supplies them. The MinGW CPU build already has
+ * them via <time.h>, so it does not include the shim. See win/ds4_win.h. */
+#include "ds4_win.h"
+#endif
 
 typedef struct {
     const char *model_path;
diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 3b224f99e..8f5936df4 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -1,8 +1,41 @@
+#ifdef __HIP_PLATFORM_AMD__
+#include "ds4_rocm.h"
+
+#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL
+#define MASK_T uint64_t
+
+// Precise transcendentals for the MoE router top-k scores, immune to
+// -fapprox-func / fast-math. Used on expert-selection paths where a small
+// error can flip which experts get picked (a macro-visible effect). On ROCm
+// these bind directly to the OCML library entry points so the compiler cannot
+// substitute the lower-precision approximations.
+extern "C" __device__ __attribute__((pure))  float __ocml_exp_f32(float);
+extern "C" __device__ __attribute__((pure))  float __ocml_log1p_f32(float);
+extern "C" __device__ __attribute__((const)) float __ocml_sqrt_f32(float);
+
+static __device__ __forceinline__ float ds4_precise_expf(float x)   { return __ocml_exp_f32(x); }
+static __device__ __forceinline__ float ds4_precise_log1pf(float x) { return __ocml_log1p_f32(x); }
+static __device__ __forceinline__ float ds4_precise_sqrtf(float x)  { return __ocml_sqrt_f32(x); }
+
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 #include <mma.h>
 #include <cublas_v2.h>
 
+#define FULL_WARP_MASK 0xFFFFFFFFu
+#define MASK_T uint32_t
+
+// Precise transcendentals for the MoE router top-k scores (see ROCm note
+// above). On CUDA the standard library calls already provide the required
+// precision; kept as named wrappers so the expert-selection path is identical
+// across backends.
+static __device__ __forceinline__ float ds4_precise_expf(float x)   { return expf(x); }
+static __device__ __forceinline__ float ds4_precise_log1pf(float x) { return log1pf(x); }
+static __device__ __forceinline__ float ds4_precise_sqrtf(float x)  { return sqrtf(x); }
+
+#endif
+
 #include <stdint.h>
 #include <errno.h>
 #include <limits.h>
@@ -13,7 +46,17 @@
 #include <string.h>
 #include <sys/stat.h>
 #include <time.h>
+#ifdef _WIN32
+/* Native Windows ROCm build: the same dependency-free POSIX shim used by
+ * ds4.c supplies mmap/sysconf/pread/fcntl/flock. <io.h> provides the
+ * _open/_read/_write/_close family; the aliases below map the POSIX names the
+ * device-host code uses. The shim body is guarded by _WIN32, so POSIX/CUDA
+ * builds are byte-for-byte unchanged. See win/README.md and ds4_win.h. */
+#include "ds4_win.h"
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include <unordered_map>
 #include <vector>
 
@@ -1292,9 +1335,22 @@ extern "C" ds4_gpu_tensor *ds4_gpu_tensor_alloc(uint64_t bytes) {
     if (bytes == 0) bytes = 1;
     ds4_gpu_tensor *t = (ds4_gpu_tensor *)calloc(1, sizeof(*t));
     if (!t) return NULL;
-    if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) {
-        free(t);
-        return NULL;
+
+    if (getenv("DS4_CUDA_MANAGED") != NULL) {
+        /* Use cudaMallocManaged with cudaMemAttachGlobal so the allocation
+         * is GPU-accessible across all streams.  On UMA platforms (Strix
+         * Halo, Grace-Hopper) this allocates from the full unified pool,
+         * bypassing the BIOS VRAM carve-out. */
+        if (!cuda_ok(cudaMallocManaged(&t->ptr, (size_t)bytes, cudaMemAttachGlobal),
+                     "managed tensor alloc")) {
+            free(t);
+            return NULL;
+        }
+    } else {
+        if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) {
+            free(t);
+            return NULL;
+        }
     }
     t->bytes = bytes;
     t->owner = 1;
@@ -1420,6 +1476,11 @@ extern "C" int ds4_gpu_set_model_map(const void *model_map, uint64_t model_size)
         }
     }
 
+    const char *chunked_env = getenv("DS4_CUDA_COPY_MODEL_CHUNKED");
+    if (chunked_env && chunked_env[0]) {
+        return 1;
+    }
+
     cudaError_t err = cudaHostRegister((void *)model_map, (size_t)model_size,
                                        cudaHostRegisterMapped | cudaHostRegisterReadOnly);
     if (err == cudaSuccess) {
@@ -1463,7 +1524,12 @@ extern "C" int ds4_gpu_set_model_fd(int fd) {
         struct stat st;
         if (fstat(fd, &st) == 0 && st.st_size > 0) {
             g_model_file_size = (uint64_t)st.st_size;
+#ifndef _WIN32
+            /* MSVC's struct stat has no st_blksize; the direct-I/O alignment
+             * hint is a Linux O_DIRECT optimization (see below) and is unused
+             * on Windows, so leave g_model_direct_align at its default of 1. */
             if (st.st_blksize > 1) g_model_direct_align = (uint64_t)st.st_blksize;
+#endif
         }
 #if defined(__linux__) && defined(O_DIRECT)
         if (getenv("DS4_CUDA_NO_DIRECT_IO") == NULL) {
@@ -1722,14 +1788,67 @@ __global__ static void f32_to_f16_kernel(__half *out, const float *x, uint64_t n
 
 __device__ static float warp_sum_f32(float v) {
     for (int offset = 16; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(0xffffffffu, v, offset);
+        v += __shfl_down_sync(FULL_WARP_MASK, v, offset);
     }
     return v;
 }
 
+#ifdef __HIP_PLATFORM_AMD__
+/* gfx1151 fast path for the paired F16 matmul (ported from upstream PR #311,
+ * commit 9565c07). One warp (32 lanes) computes one output row across both
+ * weight matrices; ROWS_PER_BLOCK warps per block improves occupancy over the
+ * one-block-per-row ordered_chunks kernel. Mathematically identical to that
+ * kernel (a per-row dot product), just a different parallelization. Selected at
+ * the launch site, gated by DS4_ROCM_NO_F16_PAIR_WARP_MATMUL. HIP-only so the
+ * CUDA preprocessor output stays byte-for-byte unchanged. */
+template <uint32_t ROWS_PER_BLOCK>
+__global__ static void matmul_f16_pair_warp_kernel(
+        float *out0,
+        float *out1,
+        const __half *w0,
+        const __half *w1,
+        const float *x,
+        uint64_t in_dim,
+        uint64_t out0_dim,
+        uint64_t out1_dim) {
+
+    const uint64_t row_base = (uint64_t)blockIdx.x * ROWS_PER_BLOCK;
+    const uint32_t tid = threadIdx.x;
+    const uint32_t warp = tid >> 5u;
+    const uint32_t lane = tid & 31u;
+
+    const uint64_t row = row_base + warp;
+    const bool valid0 = row < out0_dim;
+    const bool valid1 = row < out1_dim;
+    if (!valid0 && !valid1) {
+        return;
+    }
+
+    float sum0 = 0.0f;
+    float sum1 = 0.0f;
+
+    const __half *wr0 = valid0 ? w0 + row * in_dim : w0;
+    const __half *wr1 = valid1 ? w1 + row * in_dim : w1;
+
+    for (uint64_t i = lane; i < in_dim; i += 32u) {
+        const float xv = x[i];
+        if (valid0) sum0 += __half2float(wr0[i]) * xv;
+        if (valid1) sum1 += __half2float(wr1[i]) * xv;
+    }
+
+    sum0 = warp_sum_f32(sum0);
+    sum1 = warp_sum_f32(sum1);
+
+    if (lane == 0) {
+        if (valid0) out0[row] = sum0;
+        if (valid1) out1[row] = sum1;
+    }
+}
+#endif
+
 __device__ static float warp_max_f32(float v) {
     for (int offset = 16; offset > 0; offset >>= 1) {
-        v = fmaxf(v, __shfl_down_sync(0xffffffffu, v, offset));
+        v = fmaxf(v, __shfl_down_sync(FULL_WARP_MASK, v, offset));
     }
     return v;
 }
@@ -2846,7 +2965,7 @@ __global__ static void attention_decode_mixed_kernel(
                     for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d];
                     const uint32_t mask = 0xffu << (threadIdx.x & 24u);
                     for (uint32_t off = 4u; off > 0u; off >>= 1u) {
-                        dot += __shfl_down_sync(mask, dot, off, 8);
+                        dot += __shfl_down_sync(static_cast<MASK_T>(mask), dot, off, 8);
                     }
                     s = dot * scale + add;
                 }
@@ -3008,7 +3127,7 @@ __global__ static void attention_indexed_mixed_kernel(
                 for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d];
                 const uint32_t mask = 0xffu << (threadIdx.x & 24u);
                 for (uint32_t off = 4u; off > 0u; off >>= 1u) {
-                    dot += __shfl_down_sync(mask, dot, off, 8);
+                    dot += __shfl_down_sync(static_cast<MASK_T>(mask), dot, off, 8);
                 }
                 if (qlane == 0) scores[row] = dot * scale;
             }
@@ -3189,7 +3308,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel(
         const float *score_row = scores + warp * 768u;
         for (uint32_t i = lane; i < n_score; i += 32u) max_s = fmaxf(max_s, score_row[i]);
         max_s = warp_max_f32(max_s);
-        max_s = __shfl_sync(0xffffffffu, max_s, 0);
+        max_s = __shfl_sync(FULL_WARP_MASK, max_s, 0);
     }
     float den = 0.0f;
     if (valid_head) {
@@ -3201,7 +3320,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel(
         }
         den = warp_sum_f32(den);
         den += expf(sinks[head] - max_s);
-        den = __shfl_sync(0xffffffffu, den, 0);
+        den = __shfl_sync(FULL_WARP_MASK, den, 0);
     }
 
     float4 o0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -3361,7 +3480,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -3485,7 +3604,7 @@ __global__ static void attention_static_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -3650,7 +3769,7 @@ __global__ static void attention_decode_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -4073,8 +4192,8 @@ __global__ static void compressor_shift_ratio4_kernel(float *state_kv, float *st
 
 __device__ static float softplus_dev(float x) {
     if (x > 20.0f) return x;
-    if (x < -20.0f) return expf(x);
-    return log1pf(expf(x));
+    if (x < -20.0f) return ds4_precise_expf(x);
+    return ds4_precise_log1pf(ds4_precise_expf(x));
 }
 
 __global__ static void router_select_kernel(
@@ -4097,7 +4216,7 @@ __global__ static void router_select_kernel(
     int32_t *sel = selected + (uint64_t)t * 6;
     float *w = weights + (uint64_t)t * 6;
 
-    for (int i = 0; i < 256; i++) prob[i] = sqrtf(softplus_dev(log[i]));
+    for (int i = 0; i < 256; i++) prob[i] = ds4_precise_sqrtf(softplus_dev(log[i]));
 
     if (hash_mode) {
         int32_t tok = tokens ? tokens[t] : token_scalar;
@@ -4151,7 +4270,7 @@ __global__ static void router_select_parallel_kernel(
     float *w = weights + (uint64_t)t * 6;
     __shared__ float sprob[256];
 
-    const float p = sqrtf(softplus_dev(log[i]));
+    const float p = ds4_precise_sqrtf(softplus_dev(log[i]));
     sprob[i] = p;
     prob[i] = p;
     __syncthreads();
@@ -4220,7 +4339,7 @@ __global__ static void router_select_warp_topk_kernel(
     #pragma unroll
     for (uint32_t j = 0; j < 8u; j++) {
         const uint32_t e = lane + j * 32u;
-        const float p = sqrtf(softplus_dev(log[e]));
+        const float p = ds4_precise_sqrtf(softplus_dev(log[e]));
         local_prob[j] = p;
         local_score[j] = p + (has_bias ? bias[e] : 0.0f);
         sprob[row_in_block][e] = p;
@@ -4268,9 +4387,9 @@ __global__ static void router_select_warp_topk_kernel(
         }
         #pragma unroll
         for (uint32_t mask = 16u; mask > 0u; mask >>= 1u) {
-            const float other_score = __shfl_xor_sync(0xffffffffu, best_score, mask);
-            const float other_prob = __shfl_xor_sync(0xffffffffu, best_prob, mask);
-            const uint32_t other_idx = __shfl_xor_sync(0xffffffffu, best_idx, mask);
+            const float other_score = __shfl_xor_sync(FULL_WARP_MASK, best_score, mask);
+            const float other_prob = __shfl_xor_sync(FULL_WARP_MASK, best_prob, mask);
+            const uint32_t other_idx = __shfl_xor_sync(FULL_WARP_MASK, best_idx, mask);
             if (router_score_better(other_score, other_idx, best_score, best_idx)) {
                 best_score = other_score;
                 best_prob = other_prob;
@@ -4958,6 +5077,13 @@ static int indexer_scores_launch(
                                                          scale, causal ? 1 : 0);
         return cuda_ok(cudaGetLastError(), "indexer score one direct launch");
     }
+#ifndef __HIP_PLATFORM_AMD__
+    // The WMMA indexer kernel body is gated on __CUDA_ARCH__ >= 700, which
+    // hipcc never defines, so on ROCm its body compiles empty and would leave
+    // `scores` uninitialised. The full rocWMMA matrix API is also not available
+    // in this build (only rocwmma-version.hpp is vendored). Skip the WMMA path
+    // on HIP entirely and fall through to the scalar indexer_scores_kernel
+    // below, which fully initialises `scores`. (On CUDA the WMMA path is kept.)
     if (!g_quality_mode && head_dim == 128u && n_head == 64u &&
         getenv("DS4_CUDA_NO_INDEXER_WMMA") == NULL) {
         dim3 grid((n_comp + 15u) / 16u, (n_tokens + 15u) / 16u, 1);
@@ -4969,6 +5095,7 @@ static int indexer_scores_launch(
                                                  head_dim, ratio, scale, causal ? 1 : 0);
         return cuda_ok(cudaGetLastError(), "indexer scores wmma launch");
     }
+#endif // !__HIP_PLATFORM_AMD__
     dim3 grid(n_comp, n_tokens, 1);
     indexer_scores_kernel<<<grid, 256>>>((float *)scores->ptr,
                                          (const float *)q->ptr,
@@ -5198,7 +5325,7 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode
                                              out->ptr,
                                              CUDA_R_32F,
                                              (int)out_dim,
-                                             CUDA_R_32F,
+                                             CUBLAS_COMPUTE_32F,
                                              CUBLAS_GEMM_DEFAULT);
             if (st == CUBLAS_STATUS_SUCCESS) return 1;
             fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st);
@@ -5440,7 +5567,7 @@ extern "C" int ds4_gpu_matmul_f16_tensor(ds4_gpu_tensor *out, const void *model_
                                          out->ptr,
                                          CUDA_R_32F,
                                          (int)out_dim,
-                                         CUDA_R_32F,
+                                         CUBLAS_COMPUTE_32F,
                                          CUBLAS_GEMM_DEFAULT);
         return cublas_ok(st, "f16 matmul");
     }
@@ -5496,6 +5623,22 @@ extern "C" int ds4_gpu_matmul_f16_pair_tensor(
     const __half *w0 = (const __half *)cuda_model_range_ptr(model_map, weight0_offset, weight_bytes, "f16_pair0");
     const __half *w1 = (const __half *)cuda_model_range_ptr(model_map, weight1_offset, weight_bytes, "f16_pair1");
     if (!w0 || !w1) return 0;
+#ifdef __HIP_PLATFORM_AMD__
+    if (!getenv("DS4_ROCM_NO_F16_PAIR_WARP_MATMUL")) {
+        constexpr uint32_t ROWS_PER_BLOCK = 8u;
+        const uint32_t grid = (uint32_t)((out_dim + ROWS_PER_BLOCK - 1u) / ROWS_PER_BLOCK);
+        matmul_f16_pair_warp_kernel<ROWS_PER_BLOCK><<<grid, ROWS_PER_BLOCK * 32>>>(
+            (float *)out0->ptr,
+            (float *)out1->ptr,
+            w0,
+            w1,
+            (const float *)x->ptr,
+            in_dim,
+            out_dim,
+            out_dim);
+        return cuda_ok(cudaGetLastError(), "matmul_f16_pair_warp launch");
+    }
+#endif
     matmul_f16_pair_ordered_chunks_kernel<<<(unsigned)out_dim, 32>>>(
         (float *)out0->ptr,
         (float *)out1->ptr,
@@ -6158,7 +6301,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads,
         if (!tmp) return 0;
         float *scores = tmp;
         float *out_tmp = (float *)((char *)tmp + out_offset);
-        const float alpha = rsqrtf((float)head_dim);
+        const float alpha = 1.0f / sqrtf((float)head_dim);
         const float beta = 0.0f;
         cublasStatus_t st = cublasSgemmStridedBatched(g_cublas,
                                                       CUBLAS_OP_T,
@@ -6528,7 +6671,7 @@ static int attention_prefill_mixed_launch(
                 n_comp,
                 head_dim);
         if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0;
-        const float alpha = rsqrtf((float)head_dim);
+        const float alpha = 1.0f / sqrtf((float)head_dim);
         const float beta = 0.0f;
         cublasStatus_t st = cublasSgemmStridedBatched(g_cublas,
                                                       CUBLAS_OP_T,
@@ -6733,7 +6876,7 @@ extern "C" int ds4_gpu_attention_output_q8_batch_tensor(
                                                        (int)rank,
                                                        (long long)rank * n_tokens,
                                                        (int)n_groups,
-                                                       CUDA_R_32F,
+                                                       CUBLAS_COMPUTE_32F,
                                                        CUBLAS_GEMM_DEFAULT);
         if (!cublas_ok(st, "attention output a gemm")) return 0;
         attention_unpack_group_low_kernel<<<(low_tmp_count + 255) / 256, 256>>>(
@@ -7395,7 +7538,7 @@ __device__ static void dev_dot_q2_K_q8_K_block8(
 __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
     uint32_t mask = 0xffffu << (threadIdx.x & 16u);
     for (int offset = 8; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(mask, v, offset, 16);
+        v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 16);
     }
     (void)lane16;
     return v;
@@ -7404,7 +7547,7 @@ __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
 __device__ static float quarter_warp_sum_f32(float v, uint32_t lane8) {
     uint32_t mask = 0xffu << (threadIdx.x & 24u);
     for (int offset = 4; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(mask, v, offset, 8);
+        v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 8);
     }
     (void)lane8;
     return v;
diff --git a/ds4_rocm.h b/ds4_rocm.h
new file mode 100644
index 000000000..55907c428
--- /dev/null
+++ b/ds4_rocm.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#include <rocwmma/rocwmma-version.hpp>
+
+#define cudaError_t hipError_t
+#define cudaStream_t hipStream_t
+#define cudaEvent_t hipEvent_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaMemLocation hipMemLocation
+
+#define cudaSuccess hipSuccess
+#define cudaErrorNotSupported hipErrorNotSupported
+#define cudaMemAttachGlobal hipMemAttachGlobal
+#define cudaErrorInvalidValue hipErrorInvalidValue
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaDeviceGetAttribute hipDeviceGetAttribute
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDevAttrPageableMemoryAccess hipDeviceAttributePageableMemoryAccess
+#define cudaMemLocationTypeDevice hipMemLocationTypeDevice
+
+#define cudaMalloc hipMalloc
+#define cudaMallocHost hipHostMalloc
+#define cudaMallocManaged hipMallocManaged
+#define cudaFree hipFree
+#define cudaFreeHost hipFreeHost
+#define cudaMemset hipMemset
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaMemsetAsync hipMemsetAsync
+
+#define cudaHostRegister hipHostRegister
+#define cudaHostUnregister hipHostUnregister
+#define cudaHostGetDevicePointer hipHostGetDevicePointer
+#define cudaHostRegisterMapped hipHostRegisterMapped
+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
+
+#define cudaMemAdvise(p1, p2, p3, p4) hipMemAdvise(p1, p2, p3, p4.id)
+#define cudaMemPrefetchAsync(devPtr, count, location, flags, stream) hipMemPrefetchAsync(devPtr, count, location.id, stream)
+#define cudaMemAdviseSetReadMostly hipMemAdviseSetReadMostly
+#define cudaMemAdviseSetPreferredLocation hipMemAdviseSetPreferredLocation
+
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamNonBlocking hipStreamNonBlocking
+
+#define cudaEventCreate hipEventCreate
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDestroy hipEventDestroy
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEventElapsedTime hipEventElapsedTime
+#define cudaEventDisableTiming hipEventDisableTiming
+
+#define cublasHandle_t hipblasHandle_t
+#define cublasStatus_t hipblasStatus_t
+#define cublasMath_t hipblasMath_t
+
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_DEFAULT_MATH HIPBLAS_DEFAULT_MATH
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_TF32_TENSOR_OP_MATH HIPBLAS_TF32_TENSOR_OP_MATH
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+
+#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
+#define cublasSetMathMode hipblasSetMathMode
+#define cublasSgemm hipblasSgemm
+#define cublasSgemmStridedBatched hipblasSgemmStridedBatched
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+
+static __device__ __forceinline__ int32_t __vcmpne4(uint32_t a, uint32_t b) {
+    // For each byte: 0xFF if a != b, 0x00 if a == b
+    uint32_t diff = a ^ b;
+    // Spread any set bit in each byte to fill the whole byte
+    diff |= (diff >> 1); diff |= (diff >> 2); diff |= (diff >> 4);
+    diff &= 0x01010101u;
+    diff *= 0xFFu; // 0x01 -> 0xFF per byte
+    return (int32_t)diff;
+}
+
+static __device__ __forceinline__ int32_t __vsub4(int32_t a, int32_t b) {
+    // Per-byte subtraction (wrapping, not saturating)
+    uint32_t ua = (uint32_t)a, ub = (uint32_t)b;
+    // Trick: subtract bytes in parallel avoiding cross-byte borrows
+    uint32_t diff = ((ua | 0x80808080u) - (ub & 0x7F7F7F7Fu)) ^ ((ua ^ ~ub) & 0x80808080u);
+    return (int32_t)diff;
+}
+
+// __dp4a: dot product of 4 signed int8s packed in an int32
+static __device__ __forceinline__ int32_t __dp4a(int32_t a, int32_t b, int32_t c) {
+    const int8_t *a_bytes = reinterpret_cast<const int8_t*>(&a);
+    const int8_t *b_bytes = reinterpret_cast<const int8_t*>(&b);
+    return c + (int32_t)a_bytes[0] * b_bytes[0]
+             + (int32_t)a_bytes[1] * b_bytes[1]
+             + (int32_t)a_bytes[2] * b_bytes[2]
+             + (int32_t)a_bytes[3] * b_bytes[3];
+}
+
diff --git a/ds4_win.h b/ds4_win.h
new file mode 100644
index 000000000..ce2356c13
--- /dev/null
+++ b/ds4_win.h
@@ -0,0 +1,312 @@
+/* ds4_win.h — minimal POSIX compatibility layer for native Windows builds.
+ *
+ * Provides just the POSIX surface ds4.c relies on that MinGW/UCRT lacks:
+ *   - mmap / munmap / madvise (read-only file mappings)
+ *   - sysconf(_SC_NPROCESSORS_ONLN / _SC_PAGESIZE)
+ *   - flock / fcntl(F_SETFD,FD_CLOEXEC) / pread / ftruncate / dprintf  (instance lock)
+ *   - fmemopen (fixed-buffer "wb"/"rb", temp-file backed with copy-back on close)
+ *
+ * Header-only, self-contained, no third-party deps. The whole body is guarded by
+ * _WIN32, so this header is inert on POSIX platforms. ds4.c includes it in place
+ * of <sys/mman.h> (and the other POSIX-only surface) behind #ifdef _WIN32, so the
+ * native MinGW-w64 CPU build needs no extra include/search-path flags. MinGW
+ * already provides pthread, clock_gettime and ftruncate.
+ */
+#ifndef DS4_WIN_H
+#define DS4_WIN_H
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <io.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/types.h>
+
+/* ---- mmap ---------------------------------------------------------------- */
+#define PROT_NONE  0x0
+#define PROT_READ  0x1
+#define PROT_WRITE 0x2
+#define PROT_EXEC  0x4
+#define MAP_SHARED  0x01
+#define MAP_PRIVATE 0x02
+#define MAP_FAILED  ((void *)-1)
+#define POSIX_MADV_NORMAL     0
+#define POSIX_MADV_RANDOM     1
+#define POSIX_MADV_SEQUENTIAL 2
+#define POSIX_MADV_WILLNEED   3
+#define POSIX_MADV_DONTNEED   4
+#define MADV_WILLNEED POSIX_MADV_WILLNEED
+
+#ifndef _SC_PAGESIZE
+#define _SC_PAGESIZE         0x1
+#endif
+#ifndef _SC_NPROCESSORS_ONLN
+#define _SC_NPROCESSORS_ONLN 0x2
+#endif
+
+/* ---- misc POSIX surface used by the GPU (HIP) host code ------------------ */
+/* MinGW/UCRT supplies these; the clang-MSVC HIP toolchain (ds4_cuda.cu build)
+ * does not. Guard each so the MinGW CPU build is unaffected. */
+#ifndef STDIN_FILENO
+#define STDIN_FILENO  0
+#endif
+#ifndef STDOUT_FILENO
+#define STDOUT_FILENO 1
+#endif
+#ifndef STDERR_FILENO
+#define STDERR_FILENO 2
+#endif
+
+#ifndef SSIZE_MAX
+#define SSIZE_MAX ((ssize_t)(((size_t)-1) >> 1))
+#endif
+
+/* ssize_t / off_t: MinGW defines these via <sys/types.h>; MSVC does not.
+ * MSVC exposes _SSIZE_T_DEFINED once <BaseTsd.h> (pulled in by windows.h) and
+ * the CRT have declared SSIZE_T; provide ssize_t/off_t only when absent. */
+#if !defined(_SSIZE_T_DEFINED) && !defined(__MINGW32__) && !defined(_SSIZE_T_)
+typedef SSIZE_T ssize_t;
+#define _SSIZE_T_DEFINED
+#endif
+#if !defined(_OFF_T_DEFINED) && !defined(__MINGW32__)
+/* MSVC <sys/types.h> already typedefs off_t to long; only define if missing. */
+#ifndef _OFF_T_
+typedef long long off_t;
+#endif
+#endif
+
+/* ---- 64-bit file stat --------------------------------------------------- */
+/* The Windows CRT's default `struct stat` / stat() / fstat() carry a 32-bit
+ * st_size, so stat'ing a file larger than 2 GB fails with EOVERFLOW
+ * ("value too large") — fatal for the ~80 GB DeepSeek V4 GGUF. Remap the bare
+ * names to the 64-bit `_stat64` family (which names both the struct and the
+ * functions, with an __int64 st_size).
+ *
+ * <sys/stat.h> is pulled in here first so its own real declarations are parsed
+ * before the macros exist; thanks to its include guard, any later
+ * `#include <sys/stat.h>` in a translation unit (e.g. ds4.c includes it after
+ * this header; ds4_cuda.cu includes it before) is a no-op but still sees the
+ * remap. #undef first in case the CRT already exposes stat/fstat as macros. */
+#include <sys/stat.h>
+#undef stat
+#undef fstat
+#define stat  _stat64
+#define fstat _fstat64
+
+/* clock_gettime / CLOCK_MONOTONIC: present in MinGW, absent in clang-MSVC.
+ * MSVC's <time.h> already declares struct timespec, so we only supply the
+ * clock id macros and the function. */
+#if !defined(CLOCK_MONOTONIC) && !defined(__MINGW32__)
+#include <time.h>
+#define CLOCK_REALTIME  0
+#define CLOCK_MONOTONIC 1
+static inline int clock_gettime(int clk, struct timespec *ts)
+{
+    (void)clk;
+    LARGE_INTEGER freq, cnt;
+    QueryPerformanceFrequency(&freq);
+    QueryPerformanceCounter(&cnt);
+    ts->tv_sec  = (long long)(cnt.QuadPart / freq.QuadPart);
+    long long rem = cnt.QuadPart % freq.QuadPart;
+    ts->tv_nsec = (long)((rem * 1000000000LL) / freq.QuadPart);
+    return 0;
+}
+#endif
+
+/* ---- file locking / fd flags -------------------------------------------- */
+#ifndef F_SETFD
+#define F_SETFD    2
+#endif
+#ifndef FD_CLOEXEC
+#define FD_CLOEXEC 1
+#endif
+#define LOCK_SH 1
+#define LOCK_EX 2
+#define LOCK_NB 4
+#define LOCK_UN 8
+
+static inline void *mmap(void *addr, size_t length, int prot, int flags,
+                         int fd, long long offset)
+{
+    (void)addr; (void)flags; (void)prot;
+    HANDLE fh = (HANDLE)_get_osfhandle(fd);
+    if (fh == INVALID_HANDLE_VALUE) { errno = EBADF; return MAP_FAILED; }
+    HANDLE mh = CreateFileMappingA(fh, NULL, PAGE_READONLY, 0, 0, NULL);
+    if (mh == NULL) { errno = ENOMEM; return MAP_FAILED; }
+    DWORD off_hi = (DWORD)((uint64_t)offset >> 32);
+    DWORD off_lo = (DWORD)((uint64_t)offset & 0xFFFFFFFFu);
+    void *p = MapViewOfFile(mh, FILE_MAP_READ, off_hi, off_lo, length);
+    CloseHandle(mh); /* view keeps the section alive */
+    if (p == NULL) { errno = ENOMEM; return MAP_FAILED; }
+    return p;
+}
+
+static inline int munmap(void *addr, size_t length)
+{
+    (void)length;
+    return UnmapViewOfFile(addr) ? 0 : -1;
+}
+
+static inline int posix_madvise(void *addr, size_t length, int advice)
+{
+    (void)addr; (void)length; (void)advice;
+    return 0; /* advisory only */
+}
+static inline int madvise(void *addr, size_t length, int advice)
+{
+    return posix_madvise(addr, length, advice);
+}
+
+static inline long sysconf(int name)
+{
+    SYSTEM_INFO si;
+    GetSystemInfo(&si);
+    if (name == _SC_NPROCESSORS_ONLN) return (long)si.dwNumberOfProcessors;
+    if (name == _SC_PAGESIZE)         return (long)si.dwPageSize;
+    errno = EINVAL;
+    return -1;
+}
+
+static inline int flock(int fd, int op)
+{
+    HANDLE h = (HANDLE)_get_osfhandle(fd);
+    if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; }
+    OVERLAPPED ov; memset(&ov, 0, sizeof(ov));
+    if (op & LOCK_UN) {
+        return UnlockFileEx(h, 0, MAXDWORD, MAXDWORD, &ov) ? 0 : -1;
+    }
+    DWORD f = 0;
+    if (op & LOCK_EX) f |= LOCKFILE_EXCLUSIVE_LOCK;
+    if (op & LOCK_NB) f |= LOCKFILE_FAIL_IMMEDIATELY;
+    if (!LockFileEx(h, f, 0, MAXDWORD, MAXDWORD, &ov)) {
+        errno = (GetLastError() == ERROR_LOCK_VIOLATION) ? EWOULDBLOCK : EACCES;
+        return -1;
+    }
+    return 0;
+}
+
+static inline int fcntl(int fd, int cmd, ...)
+{
+    (void)fd; (void)cmd;
+    return 0; /* F_SETFD/FD_CLOEXEC is a no-op: Windows handles aren't inherited by default */
+}
+
+static inline long long ds4_pread(int fd, void *buf, size_t count, long long offset)
+{
+    HANDLE h = (HANDLE)_get_osfhandle(fd);
+    if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; }
+    OVERLAPPED ov; memset(&ov, 0, sizeof(ov));
+    ov.Offset     = (DWORD)((uint64_t)offset & 0xFFFFFFFFu);
+    ov.OffsetHigh = (DWORD)((uint64_t)offset >> 32);
+    DWORD got = 0;
+    if (!ReadFile(h, buf, (DWORD)count, &got, &ov)) {
+        if (GetLastError() == ERROR_HANDLE_EOF) return 0;
+        errno = EIO; return -1;
+    }
+    return (long long)got;
+}
+#define pread(fd, buf, count, offset) ds4_pread((fd), (buf), (size_t)(count), (long long)(offset))
+
+/* ftruncate: provided by MinGW <unistd.h>; absent in the MSVC ABI build. */
+#if !defined(__MINGW32__)
+static inline int ftruncate(int fd, long long length)
+{
+    HANDLE h = (HANDLE)_get_osfhandle(fd);
+    if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; }
+    LARGE_INTEGER li; li.QuadPart = length;
+    if (!SetFilePointerEx(h, li, NULL, FILE_BEGIN)) { errno = EINVAL; return -1; }
+    if (!SetEndOfFile(h)) { errno = EIO; return -1; }
+    return 0;
+}
+#endif
+
+static inline int dprintf(int fd, const char *fmt, ...)
+{
+    char buf[512];
+    va_list ap; va_start(ap, fmt);
+    int n = vsnprintf(buf, sizeof(buf), fmt, ap);
+    va_end(ap);
+    if (n < 0) return -1;
+    if (n > (int)sizeof(buf)) n = (int)sizeof(buf);
+    return _write(fd, buf, (unsigned)n);
+}
+
+/* ---- fmemopen (temp-file backed, fixed buffer) --------------------------- */
+typedef struct { FILE *fp; void *buf; size_t cap; int writeback; } ds4_memstream;
+#define DS4_MEMSTREAM_MAX 16
+static ds4_memstream ds4_ms_tab[DS4_MEMSTREAM_MAX];
+static CRITICAL_SECTION ds4_ms_cs;
+static volatile LONG ds4_ms_init = 0;
+
+static inline void ds4_ms_ensure(void)
+{
+    if (InterlockedCompareExchange(&ds4_ms_init, 1, 0) == 0)
+        InitializeCriticalSection(&ds4_ms_cs);
+}
+
+static inline FILE *ds4_tmpfile(void)
+{
+    char dir[MAX_PATH], path[MAX_PATH];
+    if (!GetTempPathA(sizeof(dir), dir)) return NULL;
+    if (!GetTempFileNameA(dir, "ds4", 0, path)) return NULL;
+    /* open read/write, delete on close */
+    return fopen(path, "wb+TD"); /* T=temporary, D=delete-on-close (MSVCRT ext) */
+}
+
+static inline FILE *fmemopen(void *buf, size_t size, const char *mode)
+{
+    ds4_ms_ensure();
+    int writing = (mode && (strchr(mode, 'w') || strchr(mode, 'a') || strchr(mode, '+')));
+    FILE *fp = ds4_tmpfile();
+    if (!fp) return NULL;
+    if (!writing && buf && size) {
+        if (fwrite(buf, 1, size, fp) != size) { fclose(fp); return NULL; }
+        rewind(fp);
+    }
+    EnterCriticalSection(&ds4_ms_cs);
+    for (int i = 0; i < DS4_MEMSTREAM_MAX; i++) {
+        if (ds4_ms_tab[i].fp == NULL) {
+            ds4_ms_tab[i].fp = fp; ds4_ms_tab[i].buf = buf;
+            ds4_ms_tab[i].cap = size; ds4_ms_tab[i].writeback = writing ? 1 : 0;
+            break;
+        }
+    }
+    LeaveCriticalSection(&ds4_ms_cs);
+    return fp;
+}
+
+static inline int ds4_win_fclose(FILE *fp)
+{
+    if (fp && ds4_ms_init) {
+        EnterCriticalSection(&ds4_ms_cs);
+        for (int i = 0; i < DS4_MEMSTREAM_MAX; i++) {
+            if (ds4_ms_tab[i].fp == fp) {
+                if (ds4_ms_tab[i].writeback && ds4_ms_tab[i].buf && ds4_ms_tab[i].cap) {
+                    fflush(fp); rewind(fp);
+                    fread(ds4_ms_tab[i].buf, 1, ds4_ms_tab[i].cap, fp); /* copy back */
+                }
+                ds4_ms_tab[i].fp = NULL; ds4_ms_tab[i].buf = NULL;
+                ds4_ms_tab[i].cap = 0;   ds4_ms_tab[i].writeback = 0;
+                break;
+            }
+        }
+        LeaveCriticalSection(&ds4_ms_cs);
+    }
+    return fclose(fp); /* real fclose — macro defined only after this header */
+}
+
+#endif /* _WIN32 */
+
+/* Redirect fclose AFTER all helpers above so ds4_win_fclose's own call hits the
+ * real fclose. Source files including this header get the memory-stream-aware one. */
+#ifdef _WIN32
+#define fclose(fp) ds4_win_fclose(fp)
+#endif
+
+#endif /* DS4_WIN_H */
diff --git a/win/README.md b/win/README.md
new file mode 100644
index 000000000..ecf4189a3
--- /dev/null
+++ b/win/README.md
@@ -0,0 +1,165 @@
+# Native Windows build (experimental)
+
+DS4 assumes a POSIX environment. `ds4_win.h` (top level) is a small,
+dependency-free compatibility shim that lets the **CPU backend** build with
+native MinGW-w64 GCC (no WSL, no Cygwin, no MSVC). It supplies the POSIX surface
+MinGW/UCRT lacks: `mmap`/`munmap`/`madvise`, `sysconf`, `flock`/`fcntl`/`pread`/
+`dprintf`, and a temp-file-backed `fmemopen`. MinGW already provides `pthread`,
+`clock_gettime`, and `ftruncate`.
+
+The shim is wired in-tree: `ds4.c` includes `ds4_win.h` in place of
+`<sys/mman.h>` (and the other POSIX-only headers) behind `#ifdef _WIN32`. The
+header's entire body is guarded by `_WIN32`, so POSIX builds are byte-for-byte
+unchanged. No special include/search-path flags are needed.
+
+## Build `ds4-bench.exe` (CPU)
+
+### With make (MinGW/MSYS)
+
+```sh
+make windows-cpu
+```
+
+`uname -s` on MinGW/MSYS reports `MINGW64_NT*` / `MSYS_NT*`; the Makefile detects
+this and selects a Windows branch that defaults `CC` to `gcc`.
+
+### Direct gcc (no make)
+
+```sh
+CF="-O3 -ffast-math -march=native -std=c99 -D_GNU_SOURCE -fno-finite-math-only \
+    -DDS4_NO_GPU -D_CRT_SECURE_NO_WARNINGS"
+gcc $CF -c ds4.c       -o ds4_cpu.o
+gcc $CF -c ds4_bench.c -o ds4_bench_cpu.o
+gcc $CF -o ds4-bench.exe ds4_bench_cpu.o ds4_cpu.o -lm
+```
+
+Toolchain used: `x86_64-w64-mingw32` GCC 15.2.0.
+
+### Verify it runs
+
+```sh
+$ ./ds4-bench.exe
+ds4-bench: specify exactly one of --prompt-file or --chat-prompt-file
+```
+
+(Exit code 2 — argument validation fires, proving the binary executes.)
+
+## Build `ds4-bench.exe` (ROCm / HIP, AMD GPU — gfx1151)
+
+The GPU backend (`ds4_cuda.cu`, the unified CUDA/HIP source) builds **natively on
+Windows** with the AMD HIP SDK — no WSL, no MSVC on `PATH`, and no full Visual
+Studio install needed at the command line. Target: AMD Strix Halo (gfx1151).
+
+### Prerequisites
+
+- **AMD HIP SDK** (default `C:/Program Files/AMD/ROCm/7.1`), providing
+  `bin/hipcc.exe`, `bin/clang.exe`, `include/hipblas/…`, `bin/libhipblas.dll`,
+  and the gfx1151 device bitcode (`amdgcn/bitcode/oclc_isa_version_1151.bc`).
+- **`llvm-dlltool`** reachable on `PATH` (or a scoop LLVM install at
+  `~/scoop/apps/llvm/current`). Used **once** to synthesize an MSVC-style
+  `hipblas.lib` from the SDK's `libhipblas.dll` (see "Why a generated import
+  lib" below). After first run the lib is cached in `win/third_party/`.
+- **Vendored rocWMMA header** (in-tree, already committed). The Windows HIP SDK
+  does **not** ship the header-only rocWMMA library, but `ds4_rocm.h` includes
+  `<rocwmma/rocwmma-version.hpp>`. A faithful copy of the CMake-configured
+  version header (rocWMMA 2.2.1, the release that ships with ROCm 7.x) lives at
+  `win/third_party/rocwmma/rocwmma/rocwmma-version.hpp`. rocWMMA is MIT-licensed
+  and header-only; only the version header is needed because DS4's WMMA kernel
+  path is CUDA-only (guarded by `__CUDA_ARCH__`) and is not compiled for HIP.
+
+### With make (MinGW/MSYS)
+
+```sh
+make windows-rocm
+# overridable:
+make windows-rocm ROCM_PATH="C:/Program Files/AMD/ROCm/7.1" ROCM_ARCH=gfx1151
+```
+
+### Direct (the script make calls)
+
+```sh
+ROCM_PATH="C:/Program Files/AMD/ROCm/7.1" ROCM_ARCH=gfx1151 win/build-rocm.sh
+```
+
+The build:
+
+1. Generates `win/third_party/hipblas.lib` from `libhipblas.dll` (first run only).
+2. Compiles `ds4_cuda.cu` with `hipcc.exe --offload-arch=gfx1151
+   -D__HIP_PLATFORM_AMD__ -I win/third_party/rocwmma`.
+3. Compiles the host C files (`ds4.c`, `ds4_bench.c`) with
+   `clang --target=x86_64-pc-windows-msvc … -DDS4_WIN_PTHREAD` so they share the
+   **MSVC ABI** with the hipcc-built `ds4_cuda.o` (mixing the MinGW and MSVC C
+   runtimes across the `FILE*`/heap boundary is unsafe, so the GPU build uses
+   MSVC ABI throughout — unlike the MinGW CPU build above).
+4. Links `ds4-bench.exe` with `hipcc.exe`, pulling in `amdhip64_7.dll` and
+   `libhipblas.dll`.
+
+### Verify it runs
+
+```sh
+$ PATH="C:/Program Files/AMD/ROCm/7.1/bin:$PATH" \
+  DS4_LOCK_FILE="$TEMP/ds4.lock" ./ds4-bench.exe -m model.gguf --prompt-file p.txt
+ds4-bench: context buffers 753.89 MiB (ctx=32897, backend=cuda, …)
+```
+
+The SDK `bin` must be on `PATH` at runtime for the HIP/hipBLAS DLLs. The binary
+initializes the GPU backend (reported as `backend=cuda`, the unified GPU path
+name) before touching the model file. Full inference is not yet runtime-verified
+here — the ~80 GB model needs a BIOS UMA memory re-split + reboot — but the HIP
+runtime and hipBLAS load and the gfx1151 binary links and starts cleanly.
+
+### Why a generated import lib
+
+The Windows HIP SDK ships hipBLAS with only the **MinGW-style**
+`lib/libhipblas.dll.a`, which the MSVC linker (`lld-link`, used by `hipcc` on
+Windows) cannot consume. The HIP runtime itself has a proper MSVC import lib
+(`lib/amdhip64.lib`), but hipBLAS does not. `win/build-rocm.sh` therefore dumps
+`libhipblas.dll`'s export table and synthesizes a COFF `hipblas.lib` with
+`llvm-dlltool`. The generated lib is git-ignored and regenerated on demand.
+
+### Native-Windows portability shims (all behind `#ifdef _WIN32`)
+
+- `ds4_win.h` — the existing POSIX shim, extended for the MSVC-ABI GPU build
+  with `STDERR_FILENO`/`ssize_t`/`SSIZE_MAX`/`off_t`, `clock_gettime` +
+  `CLOCK_MONOTONIC`, and `ftruncate` (MinGW already supplies these; the
+  additions are guarded `!defined(__MINGW32__)` so the CPU build is unchanged).
+- `win/ds4_pthread_win.h` — a header-only Win32 pthread shim (threads, mutex,
+  condition variable, once) used when `DS4_WIN_PTHREAD` is defined, because the
+  MSVC toolchain has no `<pthread.h>`. The MinGW CPU build keeps winpthreads.
+- `st_blksize` (a Linux `O_DIRECT` alignment hint) is skipped on Windows; the
+  whole direct-I/O block is already `#if defined(__linux__) && defined(O_DIRECT)`.
+
+All edits to `ds4.c`, `ds4_bench.c`, and `ds4_cuda.cu` are behind `_WIN32`
+(further sub-guarded by `__MINGW32__` / `DS4_WIN_PTHREAD`), so the POSIX, macOS,
+CUDA, and Linux-ROCm builds produce byte-identical preprocessor output.
+
+## Status
+
+| Target               | Native Windows build | Notes |
+|----------------------|----------------------|-------|
+| `ds4-bench` (CPU)    | builds & runs        | MinGW-w64, no terminal/socket deps |
+| `ds4-bench` (ROCm)   | builds, links, starts| HIP/clang-MSVC, gfx1151; full inference pending model + UMA re-split |
+| `ds4` (CLI)          | not yet              | `linenoise.c` uses POSIX `termios`, plus `sigaction` in `ds4_cli.c` (needs Win console raw-mode port) |
+| `ds4-server`         | not yet              | BSD sockets / `poll` / `arpa/inet.h` (needs Winsock port) |
+
+## Runtime note
+
+Set `DS4_LOCK_FILE` to a Windows path (the default is `/tmp/ds4.lock`, which does
+not exist on Windows) before running, e.g.:
+
+```sh
+export DS4_LOCK_FILE="$TEMP/ds4.lock"
+```
+
+For the **ROCm** build, also:
+
+- Put the HIP SDK `bin` on `PATH` at runtime (`amdhip64_7.dll`, `libhipblas.dll`).
+- Set `DS4_CUDA_MANAGED=1` to use the full UMA pool (managed memory) once a model
+  is being loaded — required for the large Strix Halo UMA allocation.
+
+## Deferred work
+
+- **CLI (`ds4`)**: port `linenoise.c` to the Windows console (raw mode via
+  `SetConsoleMode`) and replace the `sigaction`/`SIGINT` handling in `ds4_cli.c`.
+- **Server (`ds4-server`)**: port the BSD sockets / `poll(2)` event loop to
+  Winsock 2 (`WSAStartup`, `WSAPoll`, `closesocket`).
diff --git a/win/build-rocm.sh b/win/build-rocm.sh
new file mode 100644
index 000000000..0f629424b
--- /dev/null
+++ b/win/build-rocm.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+# build-rocm.sh — native Windows ROCm/HIP build of ds4-bench.exe for gfx1151.
+#
+# Builds DS4's GPU (HIP) backend natively on Windows with the AMD HIP SDK — no
+# WSL, no MSVC on PATH, no full Visual Studio install required at the command
+# line (hipcc auto-discovers the VS build tools' headers). Produces a gfx1151
+# ROCm binary: ds4-bench.exe.
+#
+# Why a script instead of pure Make: hipcc.exe's .bat wrapper splits arguments
+# on spaces, so paths like "C:/Program Files/AMD/ROCm/7.1" break -I/-L flags.
+# This script relies on the SDK's default include/lib search (hipcc adds the SDK
+# include via -idirafter automatically) and a space-free path for the hipblas
+# import lib, sidestepping the quoting problem.
+#
+# Usage:
+#   win/build-rocm.sh                 # uses defaults below
+#   ROCM_PATH="C:/Program Files/AMD/ROCm/7.1" ROCM_ARCH=gfx1151 win/build-rocm.sh
+#
+# Requirements:
+#   - AMD HIP SDK (default C:/Program Files/AMD/ROCm/7.1) with hipcc.exe + clang.
+#   - Vendored rocWMMA version header at win/third_party/rocwmma (in-tree).
+#   - hipblas.lib (MSVC import lib). Generated on the fly from libhipblas.dll if
+#     win/third_party/hipblas.lib is absent (needs llvm-dlltool on PATH or in the
+#     scoop LLVM install).
+set -euo pipefail
+
+ROCM_PATH="${ROCM_PATH:-C:/Program Files/AMD/ROCm/7.1}"
+ROCM_ARCH="${ROCM_ARCH:-gfx1151}"
+
+# Resolve the repo root (this script lives in win/).
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$HERE"
+
+HIPCC="$ROCM_PATH/bin/hipcc.exe"
+CLANG="$ROCM_PATH/bin/clang.exe"
+THIRD="win/third_party"
+ROCWMMA_INC="$THIRD/rocwmma"
+HIPBLAS_LIB="$THIRD/hipblas.lib"
+
+if [ ! -x "$HIPCC" ]; then
+    echo "error: hipcc.exe not found at '$HIPCC' (set ROCM_PATH)" >&2
+    exit 2
+fi
+
+# --- ensure an MSVC-style hipblas import lib exists -------------------------
+# The Windows HIP SDK ships only the MinGW-style libhipblas.dll.a, which the
+# MSVC linker (lld-link) cannot consume. Generate hipblas.lib from the DLL's
+# export table once and cache it in win/third_party/.
+if [ ! -f "$HIPBLAS_LIB" ]; then
+    echo "==> generating $HIPBLAS_LIB from libhipblas.dll"
+    DLLTOOL="$(command -v llvm-dlltool.exe 2>/dev/null || true)"
+    if [ -z "$DLLTOOL" ] && [ -x "$HOME/scoop/apps/llvm/current/bin/llvm-dlltool.exe" ]; then
+        DLLTOOL="$HOME/scoop/apps/llvm/current/bin/llvm-dlltool.exe"
+    fi
+    if [ -z "$DLLTOOL" ]; then
+        echo "error: need llvm-dlltool to build hipblas.lib (install LLVM or scoop llvm)" >&2
+        exit 2
+    fi
+    OBJDUMP="$ROCM_PATH/bin/llvm-objdump.exe"
+    DEF="$(mktemp)"
+    { echo "LIBRARY libhipblas.dll"; echo "EXPORTS"; \
+      "$OBJDUMP" -p "$ROCM_PATH/bin/libhipblas.dll" \
+        | awk '/Export Table:/{f=1} f&&/^[[:space:]]+[0-9]+[[:space:]]+0x[0-9a-f]+[[:space:]]+/{print $NF}' \
+        | sort -u; } > "$DEF"
+    "$DLLTOOL" -m i386:x86-64 -d "$DEF" -l "$HIPBLAS_LIB" -D libhipblas.dll
+    rm -f "$DEF"
+fi
+
+# --- common flags ----------------------------------------------------------
+# Host C files are compiled in the MSVC ABI (clang --target=...-windows-msvc) so
+# they link against the hipcc-produced (MSVC-ABI) ds4_cuda.o. DS4_WIN_PTHREAD
+# selects the Win32 pthread shim (MSVC has no <pthread.h>).
+HOSTFLAGS="--target=x86_64-pc-windows-msvc -O3 -ffast-math -fno-finite-math-only \
+  -DDS4_WIN_PTHREAD -D_CRT_SECURE_NO_WARNINGS \
+  -Wno-deprecated-declarations -Wno-unused-command-line-argument"
+
+GPUFLAGS="--offload-arch=$ROCM_ARCH -O3 -fno-finite-math-only \
+  -D__HIP_PLATFORM_AMD__ -D_CRT_SECURE_NO_WARNINGS \
+  -Wno-deprecated-declarations -Wno-unused-command-line-argument -I$ROCWMMA_INC"
+
+echo "==> compiling ds4_cuda.cu (HIP, $ROCM_ARCH)"
+"$HIPCC" $GPUFLAGS -c ds4_cuda.cu -o ds4_cuda.o
+
+echo "==> compiling ds4.c (host, MSVC ABI)"
+"$CLANG" $HOSTFLAGS -c ds4.c -o ds4.o
+
+echo "==> compiling ds4_bench.c (host, MSVC ABI)"
+"$CLANG" $HOSTFLAGS -c ds4_bench.c -o ds4_bench.o
+
+echo "==> linking ds4-bench.exe"
+"$HIPCC" --offload-arch="$ROCM_ARCH" ds4_bench.o ds4.o ds4_cuda.o \
+    -o ds4-bench.exe -L"$THIRD" -lhipblas
+
+echo "==> done: ds4-bench.exe"
+echo "    Run with the SDK bin on PATH, e.g.:"
+echo "      PATH=\"$ROCM_PATH/bin:\$PATH\" ./ds4-bench.exe --prompt-file FILE -m MODEL.gguf"
diff --git a/win/ds4_pthread_win.h b/win/ds4_pthread_win.h
new file mode 100644
index 000000000..8eab83ee3
--- /dev/null
+++ b/win/ds4_pthread_win.h
@@ -0,0 +1,146 @@
+/* ds4_pthread_win.h — minimal pthread shim for native Windows GPU builds.
+ *
+ * The native-Windows CPU build (MinGW-w64) gets pthreads from winpthreads, but
+ * the native-Windows ROCm/HIP build compiles the C host code with clang in the
+ * MSVC ABI (to match the hipcc-built ds4_cuda.o), and the MSVC toolchain has no
+ * <pthread.h>. This header implements exactly the pthread subset DS4 uses on
+ * top of the Win32 threading primitives:
+ *
+ *   pthread_t, pthread_create, pthread_join
+ *   pthread_mutex_t / _init / _lock / _unlock / _destroy
+ *   pthread_cond_t  / _init / _wait / _signal / _broadcast / _destroy
+ *   pthread_once_t  / pthread_once / PTHREAD_ONCE_INIT
+ *
+ * Header-only and self-contained. The entire body is guarded by _WIN32, and it
+ * is only pulled in for the Windows GPU build (not the MinGW CPU build, which
+ * already has real pthreads), so POSIX builds are completely unaffected.
+ *
+ * Only included from ds4_win.h, and only when DS4_WIN_PTHREAD is requested, so
+ * the MinGW CPU build keeps using winpthreads.
+ */
+#ifndef DS4_PTHREAD_WIN_H
+#define DS4_PTHREAD_WIN_H
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <process.h>
+#include <errno.h>
+
+/* ---- threads ------------------------------------------------------------- */
+typedef struct {
+    HANDLE        handle;
+    void         *(*start)(void *);
+    void         *arg;
+    void         *retval;
+} ds4_pthread_state;
+typedef ds4_pthread_state *pthread_t;
+
+static unsigned __stdcall ds4_pthread_trampoline(void *p)
+{
+    ds4_pthread_state *st = (ds4_pthread_state *)p;
+    st->retval = st->start(st->arg);
+    return 0;
+}
+
+static inline int pthread_create(pthread_t *thread, const void *attr,
+                                 void *(*start)(void *), void *arg)
+{
+    (void)attr;
+    ds4_pthread_state *st = (ds4_pthread_state *)calloc(1, sizeof(*st));
+    if (!st) return EAGAIN;
+    st->start = start;
+    st->arg   = arg;
+    uintptr_t h = _beginthreadex(NULL, 0, ds4_pthread_trampoline, st, 0, NULL);
+    if (h == 0) { free(st); return EAGAIN; }
+    st->handle = (HANDLE)h;
+    *thread = st;
+    return 0;
+}
+
+static inline int pthread_join(pthread_t thread, void **retval)
+{
+    if (!thread) return EINVAL;
+    WaitForSingleObject(thread->handle, INFINITE);
+    if (retval) *retval = thread->retval;
+    CloseHandle(thread->handle);
+    free(thread);
+    return 0;
+}
+
+/* ---- mutex (non-recursive; matches PTHREAD default) ---------------------- */
+typedef SRWLOCK pthread_mutex_t;
+#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT
+
+static inline int pthread_mutex_init(pthread_mutex_t *m, const void *attr)
+{
+    (void)attr;
+    InitializeSRWLock(m);
+    return 0;
+}
+static inline int pthread_mutex_lock(pthread_mutex_t *m)
+{
+    AcquireSRWLockExclusive(m);
+    return 0;
+}
+static inline int pthread_mutex_unlock(pthread_mutex_t *m)
+{
+    ReleaseSRWLockExclusive(m);
+    return 0;
+}
+static inline int pthread_mutex_destroy(pthread_mutex_t *m)
+{
+    (void)m; /* SRWLOCK needs no teardown */
+    return 0;
+}
+
+/* ---- condition variable -------------------------------------------------- */
+typedef CONDITION_VARIABLE pthread_cond_t;
+#define PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT
+
+static inline int pthread_cond_init(pthread_cond_t *c, const void *attr)
+{
+    (void)attr;
+    InitializeConditionVariable(c);
+    return 0;
+}
+static inline int pthread_cond_wait(pthread_cond_t *c, pthread_mutex_t *m)
+{
+    /* SRWLOCK held exclusively → CONDITION_VARIABLE_LOCKMODE default (0). */
+    return SleepConditionVariableSRW(c, m, INFINITE, 0) ? 0 : EINVAL;
+}
+static inline int pthread_cond_signal(pthread_cond_t *c)
+{
+    WakeConditionVariable(c);
+    return 0;
+}
+static inline int pthread_cond_broadcast(pthread_cond_t *c)
+{
+    WakeAllConditionVariable(c);
+    return 0;
+}
+static inline int pthread_cond_destroy(pthread_cond_t *c)
+{
+    (void)c; /* CONDITION_VARIABLE needs no teardown */
+    return 0;
+}
+
+/* ---- one-time init ------------------------------------------------------- */
+typedef INIT_ONCE pthread_once_t;
+#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT
+
+static void (*ds4_once_fn)(void);
+static BOOL CALLBACK ds4_once_trampoline(PINIT_ONCE io, PVOID param, PVOID *ctx)
+{
+    (void)io; (void)ctx;
+    ((void (*)(void))param)();
+    return TRUE;
+}
+static inline int pthread_once(pthread_once_t *once, void (*init)(void))
+{
+    InitOnceExecuteOnce(once, ds4_once_trampoline, (PVOID)init, NULL);
+    return 0;
+}
+
+#endif /* _WIN32 */
+#endif /* DS4_PTHREAD_WIN_H */
diff --git a/win/third_party/rocwmma/rocwmma/rocwmma-version.hpp b/win/third_party/rocwmma/rocwmma/rocwmma-version.hpp
new file mode 100644
index 000000000..ec7554058
--- /dev/null
+++ b/win/third_party/rocwmma/rocwmma/rocwmma-version.hpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+//! @file
+//! @brief rocwmma-version.hpp provides the configured version and settings
+//!
+//! Vendored for the native-Windows ROCm build of DS4. The Windows HIP SDK
+//! (C:/Program Files/AMD/ROCm/7.1) does NOT ship the header-only rocWMMA
+//! library, but ds4_rocm.h includes this version header. The values below are
+//! the CMake-configured output of
+//! library/include/rocwmma/internal/rocwmma-version.hpp.in from
+//! github.com/ROCm/rocWMMA (VERSION_STRING 2.2.1, the rocWMMA release that
+//! ships with ROCm 7.x). rocWMMA is header-only and MIT-licensed; only the
+//! version header is required because DS4's wmma path is CUDA-only
+//! (guarded by __CUDA_ARCH__) and not compiled for HIP.
+
+#ifndef ROCWMMA_API_VERSION_HPP
+#define ROCWMMA_API_VERSION_HPP
+
+#include <string>
+
+// clang-format off
+#define ROCWMMA_VERSION_MAJOR       2
+#define ROCWMMA_VERSION_MINOR       2
+#define ROCWMMA_VERSION_PATCH       1
+// clang-format on
+
+inline std::string rocwmma_get_version()
+{
+    return std::to_string(ROCWMMA_VERSION_MAJOR) + "." + std::to_string(ROCWMMA_VERSION_MINOR) + "."
+           + std::to_string(ROCWMMA_VERSION_PATCH);
+}
+
+#endif // ROCWMMA_API_VERSION_HPP