diff --git a/.gitignore b/.gitignore
index 2c70e4d61..3aa15bbfa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 /ds4
 /ds4-server
 /ds4-bench
+/ds4-bench.exe
 /ds4_native
 /ds4_server_test
 /ds4_test
@@ -8,6 +9,7 @@
 /TODO.md
 /gguf/
 *.o
+*.exe
 *.dSYM/
 /misc/
 .*.swp
diff --git a/Makefile b/Makefile
index 42cde3c35..4629e8d76 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,28 @@
 CC ?= cc
 UNAME_S := $(shell uname -s)
 
+# On MinGW/MSYS `uname -s` is e.g. MINGW64_NT-10.0 or MSYS_NT-10.0.
+IS_WINDOWS := $(filter MINGW% MSYS%,$(UNAME_S))
+
+# MinGW has no `cc`; default the compiler to gcc there (still overridable).
+ifneq ($(IS_WINDOWS),)
+ifeq ($(origin CC),default)
+CC := gcc
+endif
+endif
+
 ifeq ($(UNAME_S),Darwin)
 NATIVE_CPU_FLAG ?= -mcpu=native
 else
 NATIVE_CPU_FLAG ?= -march=native
 endif
 
+# Native Windows (MinGW-w64) CPU build flags. ds4.c pulls in the dependency-free
+# POSIX shim (ds4_win.h) behind #ifdef _WIN32; no extra -I/-include is needed.
+WIN_CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -std=c99 -D_GNU_SOURCE \
+	-fno-finite-math-only -DDS4_NO_GPU -D_CRT_SECURE_NO_WARNINGS
+WIN_LDLIBS ?= -lm
+
 CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c99
 OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc
 
@@ -18,7 +34,23 @@ METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
 CORE_OBJS = ds4.o ds4_metal.o
 CPU_CORE_OBJS = ds4_cpu.o
 else
+
 CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
+
+ifeq ($(GPU_BACKEND),rocm)
+ROCM_PATH ?= /opt/rocm
+GPU_CC = $(ROCM_PATH)/bin/hipcc
+ROCM_ARCH ?= gfx1151
+
+GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
+GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas
+
+@echo "ROCM_ARCH: $(ROCM_ARCH)"
+
+EXTRA_DEPS = ds4_rocm.h
+
+else
+
 CUDA_HOME ?= /usr/local/cuda
 NVCC ?= $(CUDA_HOME)/bin/nvcc
 CUDA_ARCH ?=
@@ -27,12 +59,21 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
 endif
 NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
+
+GPU_CC = $(NVCC)
+GPU_CFLAGS = $(NVCCFLAGS)
+GPU_LDLIBS = $(CUDA_LDLIBS)
+
+endif
+
 CORE_OBJS = ds4.o ds4_cuda.o
+EXTRA_DEPS =
 CPU_CORE_OBJS = ds4_cpu.o
 METAL_LDLIBS := $(LDLIBS)
+
 endif
 
-.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
+.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression windows-cpu
 
 ifeq ($(UNAME_S),Darwin)
 all: ds4 ds4-server ds4-bench
@@ -60,6 +101,27 @@ cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE
 
 cuda-regression:
 	@echo "cuda-regression requires a CUDA build"
+
+else ifneq ($(IS_WINDOWS),)
+# ---- Native Windows (MinGW-w64) ------------------------------------------
+# Only the CPU bench is portable today. The CLI (linenoise/termios + sigaction)
+# and server (BSD sockets/poll) still need Windows ports; see win/README.md.
+all: help
+
+help:
+	@echo "DS4 build targets (native Windows / MinGW-w64):"
+	@echo "  make windows-cpu   Build native Windows CPU ./ds4-bench.exe"
+	@echo "  make clean         Remove build outputs"
+	@echo ""
+	@echo "  ds4 (CLI) and ds4-server are not yet ported to Windows."
+
+windows-cpu: ds4-bench.exe
+
+ds4-bench.exe: ds4_bench.c ds4.c ds4.h ds4_gpu.h ds4_win.h
+	$(CC) $(WIN_CFLAGS) -c -o ds4_cpu.o ds4.c
+	$(CC) $(WIN_CFLAGS) -c -o ds4_bench_cpu.o ds4_bench.c
+	$(CC) $(WIN_CFLAGS) -o $@ ds4_bench_cpu.o ds4_cpu.o $(WIN_LDLIBS)
+
 else
 all: help
 
@@ -69,6 +131,7 @@ help:
 	@echo "  make cuda-generic        Build CUDA for a generic local CUDA GPU"
 	@echo "  make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
 	@echo "  make cpu                 Build CPU-only ./ds4, ./ds4-server, and ./ds4-bench"
+	@echo "  make rocm                Build ROCm"
 	@echo "  make test                Build and run tests"
 	@echo "  make clean               Remove build outputs"
 
@@ -86,14 +149,22 @@ cuda:
 	fi
 	$(MAKE) ds4 ds4-server ds4-bench CUDA_ARCH="$(CUDA_ARCH)"
 
+rocm:
+	@if [ -z "$(strip $(ROCM_ARCH))" ]; then \
+		echo "error: specify ROCM_ARCH, for example: make rocm ROCM_ARCH=gfx1151"; \
+		exit 2; \
+	fi
+	$(MAKE) ds4 ds4-server ds4-bench GPU_BACKEND=rocm ROCM_ARCH=$(ROCM_ARCH)
+
+
 ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4-server: ds4_server.o rax.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4-bench: ds4_bench.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
@@ -143,11 +214,11 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h
 ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
 	$(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m
 
-ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
-	$(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu
+ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS)
+	$(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu
 
 tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4_test: ds4_test.o rax.o $(CORE_OBJS)
 ifeq ($(UNAME_S),Darwin)
@@ -160,4 +231,5 @@ test: ds4_test
 	./ds4_test
 
 clean:
-	rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+	rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o *.exe tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+
diff --git a/ds4.c b/ds4.c
index 51410e335..80c4ed320 100644
--- a/ds4.c
+++ b/ds4.c
@@ -27,12 +27,22 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#ifdef _WIN32
+/* Native Windows (MinGW-w64) CPU build: a small dependency-free POSIX shim
+ * supplies mmap/flock/pread/sysconf/dprintf/fmemopen. See ds4_win.h. */
+#include "ds4_win.h"
+#include <sys/stat.h>
+#include <stdarg.h>
+#include <time.h>
+#include <unistd.h>
+#else
 #include <sys/file.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <stdarg.h>
 #include <time.h>
 #include <unistd.h>
+#endif
 
 #include "ds4.h"
 
diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 3b224f99e..8b6241ca3 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -1,8 +1,18 @@
+#ifdef __HIP_PLATFORM_AMD__
+#include "ds4_rocm.h"
+
+#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL
+#define MASK_T uint64_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 #include <mma.h>
 #include <cublas_v2.h>
 
+#define FULL_WARP_MASK 0xFFFFFFFFu
+#define MASK_T uint32_t
+#endif
+
 #include <stdint.h>
 #include <errno.h>
 #include <limits.h>
@@ -1722,14 +1732,14 @@ __global__ static void f32_to_f16_kernel(__half *out, const float *x, uint64_t n
 
 __device__ static float warp_sum_f32(float v) {
     for (int offset = 16; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(0xffffffffu, v, offset);
+        v += __shfl_down_sync(FULL_WARP_MASK, v, offset);
     }
     return v;
 }
 
 __device__ static float warp_max_f32(float v) {
     for (int offset = 16; offset > 0; offset >>= 1) {
-        v = fmaxf(v, __shfl_down_sync(0xffffffffu, v, offset));
+        v = fmaxf(v, __shfl_down_sync(FULL_WARP_MASK, v, offset));
     }
     return v;
 }
@@ -2846,7 +2856,7 @@ __global__ static void attention_decode_mixed_kernel(
                     for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d];
                     const uint32_t mask = 0xffu << (threadIdx.x & 24u);
                     for (uint32_t off = 4u; off > 0u; off >>= 1u) {
-                        dot += __shfl_down_sync(mask, dot, off, 8);
+                        dot += __shfl_down_sync(static_cast<MASK_T>(mask), dot, off, 8);
                     }
                     s = dot * scale + add;
                 }
@@ -3008,7 +3018,7 @@ __global__ static void attention_indexed_mixed_kernel(
                 for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d];
                 const uint32_t mask = 0xffu << (threadIdx.x & 24u);
                 for (uint32_t off = 4u; off > 0u; off >>= 1u) {
-                    dot += __shfl_down_sync(mask, dot, off, 8);
+                    dot += __shfl_down_sync(static_cast<MASK_T>(mask), dot, off, 8);
                 }
                 if (qlane == 0) scores[row] = dot * scale;
             }
@@ -3189,7 +3199,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel(
         const float *score_row = scores + warp * 768u;
         for (uint32_t i = lane; i < n_score; i += 32u) max_s = fmaxf(max_s, score_row[i]);
         max_s = warp_max_f32(max_s);
-        max_s = __shfl_sync(0xffffffffu, max_s, 0);
+        max_s = __shfl_sync(FULL_WARP_MASK, max_s, 0);
     }
     float den = 0.0f;
     if (valid_head) {
@@ -3201,7 +3211,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel(
         }
         den = warp_sum_f32(den);
         den += expf(sinks[head] - max_s);
-        den = __shfl_sync(0xffffffffu, den, 0);
+        den = __shfl_sync(FULL_WARP_MASK, den, 0);
     }
 
     float4 o0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -3361,7 +3371,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -3485,7 +3495,7 @@ __global__ static void attention_static_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -3650,7 +3660,7 @@ __global__ static void attention_decode_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -4268,9 +4278,9 @@ __global__ static void router_select_warp_topk_kernel(
         }
         #pragma unroll
         for (uint32_t mask = 16u; mask > 0u; mask >>= 1u) {
-            const float other_score = __shfl_xor_sync(0xffffffffu, best_score, mask);
-            const float other_prob = __shfl_xor_sync(0xffffffffu, best_prob, mask);
-            const uint32_t other_idx = __shfl_xor_sync(0xffffffffu, best_idx, mask);
+            const float other_score = __shfl_xor_sync(FULL_WARP_MASK, best_score, mask);
+            const float other_prob = __shfl_xor_sync(FULL_WARP_MASK, best_prob, mask);
+            const uint32_t other_idx = __shfl_xor_sync(FULL_WARP_MASK, best_idx, mask);
             if (router_score_better(other_score, other_idx, best_score, best_idx)) {
                 best_score = other_score;
                 best_prob = other_prob;
@@ -5198,7 +5208,7 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode
                                              out->ptr,
                                              CUDA_R_32F,
                                              (int)out_dim,
-                                             CUDA_R_32F,
+                                             CUBLAS_COMPUTE_32F,
                                              CUBLAS_GEMM_DEFAULT);
             if (st == CUBLAS_STATUS_SUCCESS) return 1;
             fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st);
@@ -5440,7 +5450,7 @@ extern "C" int ds4_gpu_matmul_f16_tensor(ds4_gpu_tensor *out, const void *model_
                                          out->ptr,
                                          CUDA_R_32F,
                                          (int)out_dim,
-                                         CUDA_R_32F,
+                                         CUBLAS_COMPUTE_32F,
                                          CUBLAS_GEMM_DEFAULT);
         return cublas_ok(st, "f16 matmul");
     }
@@ -6733,7 +6743,7 @@ extern "C" int ds4_gpu_attention_output_q8_batch_tensor(
                                                        (int)rank,
                                                        (long long)rank * n_tokens,
                                                        (int)n_groups,
-                                                       CUDA_R_32F,
+                                                       CUBLAS_COMPUTE_32F,
                                                        CUBLAS_GEMM_DEFAULT);
         if (!cublas_ok(st, "attention output a gemm")) return 0;
         attention_unpack_group_low_kernel<<<(low_tmp_count + 255) / 256, 256>>>(
@@ -7395,7 +7405,7 @@ __device__ static void dev_dot_q2_K_q8_K_block8(
 __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
     uint32_t mask = 0xffffu << (threadIdx.x & 16u);
     for (int offset = 8; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(mask, v, offset, 16);
+        v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 16);
     }
     (void)lane16;
     return v;
@@ -7404,7 +7414,7 @@ __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
 __device__ static float quarter_warp_sum_f32(float v, uint32_t lane8) {
     uint32_t mask = 0xffu << (threadIdx.x & 24u);
     for (int offset = 4; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(mask, v, offset, 8);
+        v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 8);
     }
     (void)lane8;
     return v;
diff --git a/ds4_rocm.h b/ds4_rocm.h
new file mode 100644
index 000000000..0400910df
--- /dev/null
+++ b/ds4_rocm.h
@@ -0,0 +1,115 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#include <rocwmma/rocwmma-version.hpp>
+
+#define cudaError_t hipError_t
+#define cudaStream_t hipStream_t
+#define cudaEvent_t hipEvent_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaMemLocation hipMemLocation
+
+#define cudaSuccess hipSuccess
+#define cudaErrorNotSupported hipErrorNotSupported
+#define cudaErrorInvalidValue hipErrorInvalidValue
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaDeviceGetAttribute hipDeviceGetAttribute
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDevAttrPageableMemoryAccess hipDeviceAttributePageableMemoryAccess
+#define cudaMemLocationTypeDevice hipMemLocationTypeDevice
+
+#define cudaMalloc hipMalloc
+#define cudaMallocHost hipHostMalloc
+#define cudaMallocManaged hipMallocManaged
+#define cudaFree hipFree
+#define cudaFreeHost hipFreeHost
+#define cudaMemset hipMemset
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaMemsetAsync hipMemsetAsync
+
+#define cudaHostRegister hipHostRegister
+#define cudaHostUnregister hipHostUnregister
+#define cudaHostGetDevicePointer hipHostGetDevicePointer
+#define cudaHostRegisterMapped hipHostRegisterMapped
+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
+
+#define cudaMemAdvise(p1, p2, p3, p4) hipMemAdvise(p1, p2, p3, p4.id)
+#define cudaMemPrefetchAsync(devPtr, count, location, flags, stream) hipMemPrefetchAsync(devPtr, count, location.id, stream)
+#define cudaMemAdviseSetReadMostly hipMemAdviseSetReadMostly
+#define cudaMemAdviseSetPreferredLocation hipMemAdviseSetPreferredLocation
+
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamNonBlocking hipStreamNonBlocking
+
+#define cudaEventCreate hipEventCreate
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDestroy hipEventDestroy
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEventElapsedTime hipEventElapsedTime
+#define cudaEventDisableTiming hipEventDisableTiming
+
+#define cublasHandle_t hipblasHandle_t
+#define cublasStatus_t hipblasStatus_t
+#define cublasMath_t hipblasMath_t
+
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_DEFAULT_MATH HIPBLAS_DEFAULT_MATH
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_TF32_TENSOR_OP_MATH HIPBLAS_TF32_TENSOR_OP_MATH
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+
+#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
+#define cublasSetMathMode hipblasSetMathMode
+#define cublasSgemm hipblasSgemm
+#define cublasSgemmStridedBatched hipblasSgemmStridedBatched
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+
+static __device__ __forceinline__ int32_t __vcmpne4(uint32_t a, uint32_t b) {
+    // For each byte: 0xFF if a != b, 0x00 if a == b
+    uint32_t diff = a ^ b;
+    // Spread any set bit in each byte to fill the whole byte
+    diff |= (diff >> 1); diff |= (diff >> 2); diff |= (diff >> 4);
+    diff &= 0x01010101u;
+    diff *= 0xFFu; // 0x01 -> 0xFF per byte
+    return (int32_t)diff;
+}
+
+static __device__ __forceinline__ int32_t __vsub4(int32_t a, int32_t b) {
+    // Per-byte subtraction (wrapping, not saturating)
+    uint32_t ua = (uint32_t)a, ub = (uint32_t)b;
+    // Trick: subtract bytes in parallel avoiding cross-byte borrows
+    uint32_t diff = ((ua | 0x80808080u) - (ub & 0x7F7F7F7Fu)) ^ ((ua ^ ~ub) & 0x80808080u);
+    return (int32_t)diff;
+}
+
+// __dp4a: dot product of 4 signed int8s packed in an int32
+static __device__ __forceinline__ int32_t __dp4a(int32_t a, int32_t b, int32_t c) {
+    const int8_t *a_bytes = reinterpret_cast<const int8_t*>(&a);
+    const int8_t *b_bytes = reinterpret_cast<const int8_t*>(&b);
+    return c + (int32_t)a_bytes[0] * b_bytes[0]
+             + (int32_t)a_bytes[1] * b_bytes[1]
+             + (int32_t)a_bytes[2] * b_bytes[2]
+             + (int32_t)a_bytes[3] * b_bytes[3];
+}
+
diff --git a/ds4_win.h b/ds4_win.h
new file mode 100644
index 000000000..bc3009663
--- /dev/null
+++ b/ds4_win.h
@@ -0,0 +1,232 @@
+/* ds4_win.h — minimal POSIX compatibility layer for native Windows builds.
+ *
+ * Provides just the POSIX surface ds4.c relies on that MinGW/UCRT lacks:
+ *   - mmap / munmap / madvise (read-only file mappings)
+ *   - sysconf(_SC_NPROCESSORS_ONLN / _SC_PAGESIZE)
+ *   - flock / fcntl(F_SETFD,FD_CLOEXEC) / pread / ftruncate / dprintf  (instance lock)
+ *   - fmemopen (fixed-buffer "wb"/"rb", temp-file backed with copy-back on close)
+ *
+ * Header-only, self-contained, no third-party deps. The whole body is guarded by
+ * _WIN32, so this header is inert on POSIX platforms. ds4.c includes it in place
+ * of <sys/mman.h> (and the other POSIX-only surface) behind #ifdef _WIN32, so the
+ * native MinGW-w64 CPU build needs no extra include/search-path flags. MinGW
+ * already provides pthread, clock_gettime and ftruncate.
+ */
+#ifndef DS4_WIN_H
+#define DS4_WIN_H
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <io.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/types.h>
+
+/* ---- mmap ---------------------------------------------------------------- */
+#define PROT_NONE  0x0
+#define PROT_READ  0x1
+#define PROT_WRITE 0x2
+#define PROT_EXEC  0x4
+#define MAP_SHARED  0x01
+#define MAP_PRIVATE 0x02
+#define MAP_FAILED  ((void *)-1)
+#define POSIX_MADV_NORMAL     0
+#define POSIX_MADV_RANDOM     1
+#define POSIX_MADV_SEQUENTIAL 2
+#define POSIX_MADV_WILLNEED   3
+#define POSIX_MADV_DONTNEED   4
+#define MADV_WILLNEED POSIX_MADV_WILLNEED
+
+#ifndef _SC_PAGESIZE
+#define _SC_PAGESIZE         0x1
+#endif
+#ifndef _SC_NPROCESSORS_ONLN
+#define _SC_NPROCESSORS_ONLN 0x2
+#endif
+
+/* ---- file locking / fd flags -------------------------------------------- */
+#ifndef F_SETFD
+#define F_SETFD    2
+#endif
+#ifndef FD_CLOEXEC
+#define FD_CLOEXEC 1
+#endif
+#define LOCK_SH 1
+#define LOCK_EX 2
+#define LOCK_NB 4
+#define LOCK_UN 8
+
+static inline void *mmap(void *addr, size_t length, int prot, int flags,
+                         int fd, long long offset)
+{
+    (void)addr; (void)flags; (void)prot;
+    HANDLE fh = (HANDLE)_get_osfhandle(fd);
+    if (fh == INVALID_HANDLE_VALUE) { errno = EBADF; return MAP_FAILED; }
+    HANDLE mh = CreateFileMappingA(fh, NULL, PAGE_READONLY, 0, 0, NULL);
+    if (mh == NULL) { errno = ENOMEM; return MAP_FAILED; }
+    DWORD off_hi = (DWORD)((uint64_t)offset >> 32);
+    DWORD off_lo = (DWORD)((uint64_t)offset & 0xFFFFFFFFu);
+    void *p = MapViewOfFile(mh, FILE_MAP_READ, off_hi, off_lo, length);
+    CloseHandle(mh); /* view keeps the section alive */
+    if (p == NULL) { errno = ENOMEM; return MAP_FAILED; }
+    return p;
+}
+
+static inline int munmap(void *addr, size_t length)
+{
+    (void)length;
+    return UnmapViewOfFile(addr) ? 0 : -1;
+}
+
+static inline int posix_madvise(void *addr, size_t length, int advice)
+{
+    (void)addr; (void)length; (void)advice;
+    return 0; /* advisory only */
+}
+static inline int madvise(void *addr, size_t length, int advice)
+{
+    return posix_madvise(addr, length, advice);
+}
+
+static inline long sysconf(int name)
+{
+    SYSTEM_INFO si;
+    GetSystemInfo(&si);
+    if (name == _SC_NPROCESSORS_ONLN) return (long)si.dwNumberOfProcessors;
+    if (name == _SC_PAGESIZE)         return (long)si.dwPageSize;
+    errno = EINVAL;
+    return -1;
+}
+
+static inline int flock(int fd, int op)
+{
+    HANDLE h = (HANDLE)_get_osfhandle(fd);
+    if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; }
+    OVERLAPPED ov; memset(&ov, 0, sizeof(ov));
+    if (op & LOCK_UN) {
+        return UnlockFileEx(h, 0, MAXDWORD, MAXDWORD, &ov) ? 0 : -1;
+    }
+    DWORD f = 0;
+    if (op & LOCK_EX) f |= LOCKFILE_EXCLUSIVE_LOCK;
+    if (op & LOCK_NB) f |= LOCKFILE_FAIL_IMMEDIATELY;
+    if (!LockFileEx(h, f, 0, MAXDWORD, MAXDWORD, &ov)) {
+        errno = (GetLastError() == ERROR_LOCK_VIOLATION) ? EWOULDBLOCK : EACCES;
+        return -1;
+    }
+    return 0;
+}
+
+static inline int fcntl(int fd, int cmd, ...)
+{
+    (void)fd; (void)cmd;
+    return 0; /* F_SETFD/FD_CLOEXEC is a no-op: Windows handles aren't inherited by default */
+}
+
+static inline long long ds4_pread(int fd, void *buf, size_t count, long long offset)
+{
+    HANDLE h = (HANDLE)_get_osfhandle(fd);
+    if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; }
+    OVERLAPPED ov; memset(&ov, 0, sizeof(ov));
+    ov.Offset     = (DWORD)((uint64_t)offset & 0xFFFFFFFFu);
+    ov.OffsetHigh = (DWORD)((uint64_t)offset >> 32);
+    DWORD got = 0;
+    if (!ReadFile(h, buf, (DWORD)count, &got, &ov)) {
+        if (GetLastError() == ERROR_HANDLE_EOF) return 0;
+        errno = EIO; return -1;
+    }
+    return (long long)got;
+}
+#define pread(fd, buf, count, offset) ds4_pread((fd), (buf), (size_t)(count), (long long)(offset))
+
+/* ftruncate is already provided by MinGW <unistd.h>. */
+
+static inline int dprintf(int fd, const char *fmt, ...)
+{
+    char buf[512];
+    va_list ap; va_start(ap, fmt);
+    int n = vsnprintf(buf, sizeof(buf), fmt, ap);
+    va_end(ap);
+    if (n < 0) return -1;
+    if (n > (int)sizeof(buf)) n = (int)sizeof(buf);
+    return _write(fd, buf, (unsigned)n);
+}
+
+/* ---- fmemopen (temp-file backed, fixed buffer) --------------------------- */
+typedef struct { FILE *fp; void *buf; size_t cap; int writeback; } ds4_memstream;
+#define DS4_MEMSTREAM_MAX 16
+static ds4_memstream ds4_ms_tab[DS4_MEMSTREAM_MAX];
+static CRITICAL_SECTION ds4_ms_cs;
+static volatile LONG ds4_ms_init = 0;
+
+static inline void ds4_ms_ensure(void)
+{
+    if (InterlockedCompareExchange(&ds4_ms_init, 1, 0) == 0)
+        InitializeCriticalSection(&ds4_ms_cs);
+}
+
+static inline FILE *ds4_tmpfile(void)
+{
+    char dir[MAX_PATH], path[MAX_PATH];
+    if (!GetTempPathA(sizeof(dir), dir)) return NULL;
+    if (!GetTempFileNameA(dir, "ds4", 0, path)) return NULL;
+    /* open read/write, delete on close */
+    return fopen(path, "wb+TD"); /* T=temporary, D=delete-on-close (MSVCRT ext) */
+}
+
+static inline FILE *fmemopen(void *buf, size_t size, const char *mode)
+{
+    ds4_ms_ensure();
+    int writing = (mode && (strchr(mode, 'w') || strchr(mode, 'a') || strchr(mode, '+')));
+    FILE *fp = ds4_tmpfile();
+    if (!fp) return NULL;
+    if (!writing && buf && size) {
+        if (fwrite(buf, 1, size, fp) != size) { fclose(fp); return NULL; }
+        rewind(fp);
+    }
+    EnterCriticalSection(&ds4_ms_cs);
+    for (int i = 0; i < DS4_MEMSTREAM_MAX; i++) {
+        if (ds4_ms_tab[i].fp == NULL) {
+            ds4_ms_tab[i].fp = fp; ds4_ms_tab[i].buf = buf;
+            ds4_ms_tab[i].cap = size; ds4_ms_tab[i].writeback = writing ? 1 : 0;
+            break;
+        }
+    }
+    LeaveCriticalSection(&ds4_ms_cs);
+    return fp;
+}
+
+static inline int ds4_win_fclose(FILE *fp)
+{
+    if (fp && ds4_ms_init) {
+        EnterCriticalSection(&ds4_ms_cs);
+        for (int i = 0; i < DS4_MEMSTREAM_MAX; i++) {
+            if (ds4_ms_tab[i].fp == fp) {
+                if (ds4_ms_tab[i].writeback && ds4_ms_tab[i].buf && ds4_ms_tab[i].cap) {
+                    fflush(fp); rewind(fp);
+                    fread(ds4_ms_tab[i].buf, 1, ds4_ms_tab[i].cap, fp); /* copy back */
+                }
+                ds4_ms_tab[i].fp = NULL; ds4_ms_tab[i].buf = NULL;
+                ds4_ms_tab[i].cap = 0;   ds4_ms_tab[i].writeback = 0;
+                break;
+            }
+        }
+        LeaveCriticalSection(&ds4_ms_cs);
+    }
+    return fclose(fp); /* real fclose — macro defined only after this header */
+}
+
+#endif /* _WIN32 */
+
+/* Redirect fclose AFTER all helpers above so ds4_win_fclose's own call hits the
+ * real fclose. Source files including this header get the memory-stream-aware one. */
+#ifdef _WIN32
+#define fclose(fp) ds4_win_fclose(fp)
+#endif
+
+#endif /* DS4_WIN_H */
diff --git a/win/README.md b/win/README.md
new file mode 100644
index 000000000..12c7f3232
--- /dev/null
+++ b/win/README.md
@@ -0,0 +1,69 @@
+# Native Windows build (experimental)
+
+DS4 assumes a POSIX environment. `ds4_win.h` (top level) is a small,
+dependency-free compatibility shim that lets the **CPU backend** build with
+native MinGW-w64 GCC (no WSL, no Cygwin, no MSVC). It supplies the POSIX surface
+MinGW/UCRT lacks: `mmap`/`munmap`/`madvise`, `sysconf`, `flock`/`fcntl`/`pread`/
+`dprintf`, and a temp-file-backed `fmemopen`. MinGW already provides `pthread`,
+`clock_gettime`, and `ftruncate`.
+
+The shim is wired in-tree: `ds4.c` includes `ds4_win.h` in place of
+`<sys/mman.h>` (and the other POSIX-only headers) behind `#ifdef _WIN32`. The
+header's entire body is guarded by `_WIN32`, so POSIX builds are byte-for-byte
+unchanged. No special include/search-path flags are needed.
+
+## Build `ds4-bench.exe` (CPU)
+
+### With make (MinGW/MSYS)
+
+```sh
+make windows-cpu
+```
+
+`uname -s` on MinGW/MSYS reports `MINGW64_NT*` / `MSYS_NT*`; the Makefile detects
+this and selects a Windows branch that defaults `CC` to `gcc`.
+
+### Direct gcc (no make)
+
+```sh
+CF="-O3 -ffast-math -march=native -std=c99 -D_GNU_SOURCE -fno-finite-math-only \
+    -DDS4_NO_GPU -D_CRT_SECURE_NO_WARNINGS"
+gcc $CF -c ds4.c       -o ds4_cpu.o
+gcc $CF -c ds4_bench.c -o ds4_bench_cpu.o
+gcc $CF -o ds4-bench.exe ds4_bench_cpu.o ds4_cpu.o -lm
+```
+
+Toolchain used: `x86_64-w64-mingw32` GCC 15.2.0.
+
+### Verify it runs
+
+```sh
+$ ./ds4-bench.exe
+ds4-bench: specify exactly one of --prompt-file or --chat-prompt-file
+```
+
+(Exit code 2 — argument validation fires, proving the binary executes.)
+
+## Status
+
+| Target      | Native Windows CPU build | Notes |
+|-------------|--------------------------|-------|
+| `ds4-bench` | builds & runs            | no terminal/socket deps |
+| `ds4` (CLI) | not yet                  | `linenoise.c` uses POSIX `termios`, plus `sigaction` in `ds4_cli.c` (needs Win console raw-mode port) |
+| `ds4-server`| not yet                  | BSD sockets / `poll` / `arpa/inet.h` (needs Winsock port) |
+
+## Runtime note
+
+Set `DS4_LOCK_FILE` to a Windows path (the default is `/tmp/ds4.lock`, which does
+not exist on Windows) before running, e.g.:
+
+```sh
+export DS4_LOCK_FILE="$TEMP/ds4.lock"
+```
+
+## Deferred work
+
+- **CLI (`ds4`)**: port `linenoise.c` to the Windows console (raw mode via
+  `SetConsoleMode`) and replace the `sigaction`/`SIGINT` handling in `ds4_cli.c`.
+- **Server (`ds4-server`)**: port the BSD sockets / `poll(2)` event loop to
+  Winsock 2 (`WSAStartup`, `WSAPoll`, `closesocket`).