diff --git a/.gitignore b/.gitignore index 2c70e4d61..cac9297e4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ /ds4 /ds4-server /ds4-bench +/ds4-bench.exe /ds4_native /ds4_server_test /ds4_test @@ -8,7 +9,11 @@ /TODO.md /gguf/ *.o +*.exe *.dSYM/ +# Generated MSVC import lib for the Windows ROCm build (regenerated from the +# HIP SDK's libhipblas.dll by win/build-rocm.sh). +/win/third_party/hipblas.lib /misc/ .*.swp .DS_Store diff --git a/Makefile b/Makefile index 42cde3c35..750de1ae4 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,28 @@ CC ?= cc UNAME_S := $(shell uname -s) +# On MinGW/MSYS `uname -s` is e.g. MINGW64_NT-10.0 or MSYS_NT-10.0. +IS_WINDOWS := $(filter MINGW% MSYS%,$(UNAME_S)) + +# MinGW has no `cc`; default the compiler to gcc there (still overridable). +ifneq ($(IS_WINDOWS),) +ifeq ($(origin CC),default) +CC := gcc +endif +endif + ifeq ($(UNAME_S),Darwin) NATIVE_CPU_FLAG ?= -mcpu=native else NATIVE_CPU_FLAG ?= -march=native endif +# Native Windows (MinGW-w64) CPU build flags. ds4.c pulls in the dependency-free +# POSIX shim (ds4_win.h) behind #ifdef _WIN32; no extra -I/-include is needed. +WIN_CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -std=c99 -D_GNU_SOURCE \ + -fno-finite-math-only -DDS4_NO_GPU -D_CRT_SECURE_NO_WARNINGS +WIN_LDLIBS ?= -lm + CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c99 OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc @@ -18,7 +34,23 @@ METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal CORE_OBJS = ds4.o ds4_metal.o CPU_CORE_OBJS = ds4_cpu.o else + CFLAGS += -D_GNU_SOURCE -fno-finite-math-only + +ifeq ($(GPU_BACKEND),rocm) +ROCM_PATH ?= /opt/rocm +GPU_CC = $(ROCM_PATH)/bin/hipcc +ROCM_ARCH ?= gfx1151 + +GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH) +GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas + +@echo "ROCM_ARCH: $(ROCM_ARCH)" + +EXTRA_DEPS = ds4_rocm.h + +else + CUDA_HOME ?= /usr/local/cuda NVCC ?= $(CUDA_HOME)/bin/nvcc CUDA_ARCH ?= @@ -27,12 +59,21 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH) endif NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas + +GPU_CC = $(NVCC) +GPU_CFLAGS = $(NVCCFLAGS) +GPU_LDLIBS = $(CUDA_LDLIBS) + +endif + CORE_OBJS = ds4.o ds4_cuda.o +EXTRA_DEPS = CPU_CORE_OBJS = ds4_cpu.o METAL_LDLIBS := $(LDLIBS) + endif -.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression +.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression windows-cpu ifeq ($(UNAME_S),Darwin) all: ds4 ds4-server ds4-bench @@ -60,6 +101,46 @@ cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE cuda-regression: @echo "cuda-regression requires a CUDA build" + +else ifneq ($(IS_WINDOWS),) +# ---- Native Windows (MinGW-w64 / HIP-clang) ------------------------------- +# CPU bench builds with MinGW. The GPU (ROCm/HIP) bench builds with the AMD HIP +# SDK for gfx1151. The CLI (linenoise/termios + sigaction) and server (BSD +# sockets/poll) still need Windows ports; see win/README.md. +# +# Windows ROCm/HIP build settings. hipcc.exe's .bat wrapper splits args on +# spaces, so the actual compile/link is delegated to win/build-rocm.sh, which +# relies on the SDK's default include search and a space-free import-lib dir. +ROCM_PATH ?= C:/Program Files/AMD/ROCm/7.1 +ROCM_ARCH ?= gfx1151 + +all: help + +help: + @echo "DS4 build targets (native Windows):" + @echo " make windows-cpu Build native Windows CPU ./ds4-bench.exe (MinGW)" + @echo " make windows-rocm Build native Windows ROCm ./ds4-bench.exe (HIP, gfx1151)" + @echo " make clean Remove build outputs" + @echo "" + @echo " windows-rocm uses the AMD HIP SDK (ROCM_PATH=$(ROCM_PATH)," + @echo " ROCM_ARCH=$(ROCM_ARCH)). See win/README.md for the rocWMMA vendoring" + @echo " step and run caveats." + @echo "" + @echo " ds4 (CLI) and ds4-server are not yet ported to Windows." + +windows-cpu: ds4-bench.exe + +ds4-bench.exe: ds4_bench.c ds4.c ds4.h ds4_gpu.h ds4_win.h + $(CC) $(WIN_CFLAGS) -c -o ds4_cpu.o ds4.c + $(CC) $(WIN_CFLAGS) -c -o ds4_bench_cpu.o ds4_bench.c + $(CC) $(WIN_CFLAGS) -o $@ ds4_bench_cpu.o ds4_cpu.o $(WIN_LDLIBS) + +# Native Windows ROCm/HIP ds4-bench.exe (gfx1151). Delegates to the build +# script to work around hipcc.exe's space-splitting argument wrapper. +.PHONY: windows-rocm +windows-rocm: + ROCM_PATH="$(ROCM_PATH)" ROCM_ARCH="$(ROCM_ARCH)" bash win/build-rocm.sh + else all: help @@ -69,6 +150,7 @@ help: @echo " make cuda-generic Build CUDA for a generic local CUDA GPU" @echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value" @echo " make cpu Build CPU-only ./ds4, ./ds4-server, and ./ds4-bench" + @echo " make rocm Build ROCm" @echo " make test Build and run tests" @echo " make clean Remove build outputs" @@ -86,14 +168,22 @@ cuda: fi $(MAKE) ds4 ds4-server ds4-bench CUDA_ARCH="$(CUDA_ARCH)" +rocm: + @if [ -z "$(strip $(ROCM_ARCH))" ]; then \ + echo "error: specify ROCM_ARCH, for example: make rocm ROCM_ARCH=gfx1151"; \ + exit 2; \ + fi + $(MAKE) ds4 ds4-server ds4-bench GPU_BACKEND=rocm ROCM_ARCH=$(ROCM_ARCH) + + ds4: ds4_cli.o linenoise.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4-server: ds4_server.o rax.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4-bench: ds4_bench.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS) $(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) @@ -143,11 +233,11 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS) $(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m -ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc - $(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu +ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS) + $(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4_test: ds4_test.o rax.o $(CORE_OBJS) ifeq ($(UNAME_S),Darwin) @@ -160,4 +250,5 @@ test: ds4_test ./ds4_test clean: - rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o + rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o *.exe tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o + diff --git a/ds4.c b/ds4.c index 51410e335..52f8c978a 100644 --- a/ds4.c +++ b/ds4.c @@ -20,19 +20,43 @@ #include #include #include +#if defined(_WIN32) && defined(DS4_WIN_PTHREAD) +/* Native Windows GPU (HIP/MSVC-ABI) build: MSVC has no ; use the + * Win32 pthread shim. The MinGW CPU build (no DS4_WIN_PTHREAD) keeps real + * winpthreads, so its behavior is unchanged. */ +#include "win/ds4_pthread_win.h" +#else #include +#endif #include #include #include #include #include #include +#ifdef _WIN32 +/* Native Windows CPU (MinGW-w64) and GPU (HIP/clang-MSVC) builds: a small + * dependency-free POSIX shim supplies mmap/flock/pread/sysconf/dprintf/ + * fmemopen. See ds4_win.h. */ +#include "ds4_win.h" +#include +#include +#include +#if defined(__MINGW32__) +#include /* MinGW provides POSIX unistd surface */ +#else +#include /* MSVC-ABI build: read/write/close/lseek/isatty */ +#include /* getpid */ +#include +#endif +#else #include #include #include #include #include #include +#endif #include "ds4.h" diff --git a/ds4_bench.c b/ds4_bench.c index 027b2b312..08c295d6f 100644 --- a/ds4_bench.c +++ b/ds4_bench.c @@ -19,6 +19,12 @@ #include #include #include +#if defined(_WIN32) && !defined(__MINGW32__) +/* Native Windows GPU (HIP/clang-MSVC) build: MSVC lacks clock_gettime/ + * CLOCK_MONOTONIC; the shim supplies them. The MinGW CPU build already has + * them via , so it does not include the shim. See win/ds4_win.h. */ +#include "ds4_win.h" +#endif typedef struct { const char *model_path; diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 3b224f99e..8f5936df4 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -1,8 +1,41 @@ +#ifdef __HIP_PLATFORM_AMD__ +#include "ds4_rocm.h" + +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL +#define MASK_T uint64_t + +// Precise transcendentals for the MoE router top-k scores, immune to +// -fapprox-func / fast-math. Used on expert-selection paths where a small +// error can flip which experts get picked (a macro-visible effect). On ROCm +// these bind directly to the OCML library entry points so the compiler cannot +// substitute the lower-precision approximations. +extern "C" __device__ __attribute__((pure)) float __ocml_exp_f32(float); +extern "C" __device__ __attribute__((pure)) float __ocml_log1p_f32(float); +extern "C" __device__ __attribute__((const)) float __ocml_sqrt_f32(float); + +static __device__ __forceinline__ float ds4_precise_expf(float x) { return __ocml_exp_f32(x); } +static __device__ __forceinline__ float ds4_precise_log1pf(float x) { return __ocml_log1p_f32(x); } +static __device__ __forceinline__ float ds4_precise_sqrtf(float x) { return __ocml_sqrt_f32(x); } + +#else #include #include #include #include +#define FULL_WARP_MASK 0xFFFFFFFFu +#define MASK_T uint32_t + +// Precise transcendentals for the MoE router top-k scores (see ROCm note +// above). On CUDA the standard library calls already provide the required +// precision; kept as named wrappers so the expert-selection path is identical +// across backends. +static __device__ __forceinline__ float ds4_precise_expf(float x) { return expf(x); } +static __device__ __forceinline__ float ds4_precise_log1pf(float x) { return log1pf(x); } +static __device__ __forceinline__ float ds4_precise_sqrtf(float x) { return sqrtf(x); } + +#endif + #include #include #include @@ -13,7 +46,17 @@ #include #include #include +#ifdef _WIN32 +/* Native Windows ROCm build: the same dependency-free POSIX shim used by + * ds4.c supplies mmap/sysconf/pread/fcntl/flock. provides the + * _open/_read/_write/_close family; the aliases below map the POSIX names the + * device-host code uses. The shim body is guarded by _WIN32, so POSIX/CUDA + * builds are byte-for-byte unchanged. See win/README.md and ds4_win.h. */ +#include "ds4_win.h" +#include +#else #include +#endif #include #include @@ -1292,9 +1335,22 @@ extern "C" ds4_gpu_tensor *ds4_gpu_tensor_alloc(uint64_t bytes) { if (bytes == 0) bytes = 1; ds4_gpu_tensor *t = (ds4_gpu_tensor *)calloc(1, sizeof(*t)); if (!t) return NULL; - if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) { - free(t); - return NULL; + + if (getenv("DS4_CUDA_MANAGED") != NULL) { + /* Use cudaMallocManaged with cudaMemAttachGlobal so the allocation + * is GPU-accessible across all streams. On UMA platforms (Strix + * Halo, Grace-Hopper) this allocates from the full unified pool, + * bypassing the BIOS VRAM carve-out. */ + if (!cuda_ok(cudaMallocManaged(&t->ptr, (size_t)bytes, cudaMemAttachGlobal), + "managed tensor alloc")) { + free(t); + return NULL; + } + } else { + if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) { + free(t); + return NULL; + } } t->bytes = bytes; t->owner = 1; @@ -1420,6 +1476,11 @@ extern "C" int ds4_gpu_set_model_map(const void *model_map, uint64_t model_size) } } + const char *chunked_env = getenv("DS4_CUDA_COPY_MODEL_CHUNKED"); + if (chunked_env && chunked_env[0]) { + return 1; + } + cudaError_t err = cudaHostRegister((void *)model_map, (size_t)model_size, cudaHostRegisterMapped | cudaHostRegisterReadOnly); if (err == cudaSuccess) { @@ -1463,7 +1524,12 @@ extern "C" int ds4_gpu_set_model_fd(int fd) { struct stat st; if (fstat(fd, &st) == 0 && st.st_size > 0) { g_model_file_size = (uint64_t)st.st_size; +#ifndef _WIN32 + /* MSVC's struct stat has no st_blksize; the direct-I/O alignment + * hint is a Linux O_DIRECT optimization (see below) and is unused + * on Windows, so leave g_model_direct_align at its default of 1. */ if (st.st_blksize > 1) g_model_direct_align = (uint64_t)st.st_blksize; +#endif } #if defined(__linux__) && defined(O_DIRECT) if (getenv("DS4_CUDA_NO_DIRECT_IO") == NULL) { @@ -1722,14 +1788,67 @@ __global__ static void f32_to_f16_kernel(__half *out, const float *x, uint64_t n __device__ static float warp_sum_f32(float v) { for (int offset = 16; offset > 0; offset >>= 1) { - v += __shfl_down_sync(0xffffffffu, v, offset); + v += __shfl_down_sync(FULL_WARP_MASK, v, offset); } return v; } +#ifdef __HIP_PLATFORM_AMD__ +/* gfx1151 fast path for the paired F16 matmul (ported from upstream PR #311, + * commit 9565c07). One warp (32 lanes) computes one output row across both + * weight matrices; ROWS_PER_BLOCK warps per block improves occupancy over the + * one-block-per-row ordered_chunks kernel. Mathematically identical to that + * kernel (a per-row dot product), just a different parallelization. Selected at + * the launch site, gated by DS4_ROCM_NO_F16_PAIR_WARP_MATMUL. HIP-only so the + * CUDA preprocessor output stays byte-for-byte unchanged. */ +template +__global__ static void matmul_f16_pair_warp_kernel( + float *out0, + float *out1, + const __half *w0, + const __half *w1, + const float *x, + uint64_t in_dim, + uint64_t out0_dim, + uint64_t out1_dim) { + + const uint64_t row_base = (uint64_t)blockIdx.x * ROWS_PER_BLOCK; + const uint32_t tid = threadIdx.x; + const uint32_t warp = tid >> 5u; + const uint32_t lane = tid & 31u; + + const uint64_t row = row_base + warp; + const bool valid0 = row < out0_dim; + const bool valid1 = row < out1_dim; + if (!valid0 && !valid1) { + return; + } + + float sum0 = 0.0f; + float sum1 = 0.0f; + + const __half *wr0 = valid0 ? w0 + row * in_dim : w0; + const __half *wr1 = valid1 ? w1 + row * in_dim : w1; + + for (uint64_t i = lane; i < in_dim; i += 32u) { + const float xv = x[i]; + if (valid0) sum0 += __half2float(wr0[i]) * xv; + if (valid1) sum1 += __half2float(wr1[i]) * xv; + } + + sum0 = warp_sum_f32(sum0); + sum1 = warp_sum_f32(sum1); + + if (lane == 0) { + if (valid0) out0[row] = sum0; + if (valid1) out1[row] = sum1; + } +} +#endif + __device__ static float warp_max_f32(float v) { for (int offset = 16; offset > 0; offset >>= 1) { - v = fmaxf(v, __shfl_down_sync(0xffffffffu, v, offset)); + v = fmaxf(v, __shfl_down_sync(FULL_WARP_MASK, v, offset)); } return v; } @@ -2846,7 +2965,7 @@ __global__ static void attention_decode_mixed_kernel( for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d]; const uint32_t mask = 0xffu << (threadIdx.x & 24u); for (uint32_t off = 4u; off > 0u; off >>= 1u) { - dot += __shfl_down_sync(mask, dot, off, 8); + dot += __shfl_down_sync(static_cast(mask), dot, off, 8); } s = dot * scale + add; } @@ -3008,7 +3127,7 @@ __global__ static void attention_indexed_mixed_kernel( for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d]; const uint32_t mask = 0xffu << (threadIdx.x & 24u); for (uint32_t off = 4u; off > 0u; off >>= 1u) { - dot += __shfl_down_sync(mask, dot, off, 8); + dot += __shfl_down_sync(static_cast(mask), dot, off, 8); } if (qlane == 0) scores[row] = dot * scale; } @@ -3189,7 +3308,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel( const float *score_row = scores + warp * 768u; for (uint32_t i = lane; i < n_score; i += 32u) max_s = fmaxf(max_s, score_row[i]); max_s = warp_max_f32(max_s); - max_s = __shfl_sync(0xffffffffu, max_s, 0); + max_s = __shfl_sync(FULL_WARP_MASK, max_s, 0); } float den = 0.0f; if (valid_head) { @@ -3201,7 +3320,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel( } den = warp_sum_f32(den); den += expf(sinks[head] - max_s); - den = __shfl_sync(0xffffffffu, den, 0); + den = __shfl_sync(FULL_WARP_MASK, den, 0); } float4 o0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -3361,7 +3480,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -3485,7 +3604,7 @@ __global__ static void attention_static_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -3650,7 +3769,7 @@ __global__ static void attention_decode_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -4073,8 +4192,8 @@ __global__ static void compressor_shift_ratio4_kernel(float *state_kv, float *st __device__ static float softplus_dev(float x) { if (x > 20.0f) return x; - if (x < -20.0f) return expf(x); - return log1pf(expf(x)); + if (x < -20.0f) return ds4_precise_expf(x); + return ds4_precise_log1pf(ds4_precise_expf(x)); } __global__ static void router_select_kernel( @@ -4097,7 +4216,7 @@ __global__ static void router_select_kernel( int32_t *sel = selected + (uint64_t)t * 6; float *w = weights + (uint64_t)t * 6; - for (int i = 0; i < 256; i++) prob[i] = sqrtf(softplus_dev(log[i])); + for (int i = 0; i < 256; i++) prob[i] = ds4_precise_sqrtf(softplus_dev(log[i])); if (hash_mode) { int32_t tok = tokens ? tokens[t] : token_scalar; @@ -4151,7 +4270,7 @@ __global__ static void router_select_parallel_kernel( float *w = weights + (uint64_t)t * 6; __shared__ float sprob[256]; - const float p = sqrtf(softplus_dev(log[i])); + const float p = ds4_precise_sqrtf(softplus_dev(log[i])); sprob[i] = p; prob[i] = p; __syncthreads(); @@ -4220,7 +4339,7 @@ __global__ static void router_select_warp_topk_kernel( #pragma unroll for (uint32_t j = 0; j < 8u; j++) { const uint32_t e = lane + j * 32u; - const float p = sqrtf(softplus_dev(log[e])); + const float p = ds4_precise_sqrtf(softplus_dev(log[e])); local_prob[j] = p; local_score[j] = p + (has_bias ? bias[e] : 0.0f); sprob[row_in_block][e] = p; @@ -4268,9 +4387,9 @@ __global__ static void router_select_warp_topk_kernel( } #pragma unroll for (uint32_t mask = 16u; mask > 0u; mask >>= 1u) { - const float other_score = __shfl_xor_sync(0xffffffffu, best_score, mask); - const float other_prob = __shfl_xor_sync(0xffffffffu, best_prob, mask); - const uint32_t other_idx = __shfl_xor_sync(0xffffffffu, best_idx, mask); + const float other_score = __shfl_xor_sync(FULL_WARP_MASK, best_score, mask); + const float other_prob = __shfl_xor_sync(FULL_WARP_MASK, best_prob, mask); + const uint32_t other_idx = __shfl_xor_sync(FULL_WARP_MASK, best_idx, mask); if (router_score_better(other_score, other_idx, best_score, best_idx)) { best_score = other_score; best_prob = other_prob; @@ -4958,6 +5077,13 @@ static int indexer_scores_launch( scale, causal ? 1 : 0); return cuda_ok(cudaGetLastError(), "indexer score one direct launch"); } +#ifndef __HIP_PLATFORM_AMD__ + // The WMMA indexer kernel body is gated on __CUDA_ARCH__ >= 700, which + // hipcc never defines, so on ROCm its body compiles empty and would leave + // `scores` uninitialised. The full rocWMMA matrix API is also not available + // in this build (only rocwmma-version.hpp is vendored). Skip the WMMA path + // on HIP entirely and fall through to the scalar indexer_scores_kernel + // below, which fully initialises `scores`. (On CUDA the WMMA path is kept.) if (!g_quality_mode && head_dim == 128u && n_head == 64u && getenv("DS4_CUDA_NO_INDEXER_WMMA") == NULL) { dim3 grid((n_comp + 15u) / 16u, (n_tokens + 15u) / 16u, 1); @@ -4969,6 +5095,7 @@ static int indexer_scores_launch( head_dim, ratio, scale, causal ? 1 : 0); return cuda_ok(cudaGetLastError(), "indexer scores wmma launch"); } +#endif // !__HIP_PLATFORM_AMD__ dim3 grid(n_comp, n_tokens, 1); indexer_scores_kernel<<>>((float *)scores->ptr, (const float *)q->ptr, @@ -5198,7 +5325,7 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode out->ptr, CUDA_R_32F, (int)out_dim, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); if (st == CUBLAS_STATUS_SUCCESS) return 1; fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st); @@ -5440,7 +5567,7 @@ extern "C" int ds4_gpu_matmul_f16_tensor(ds4_gpu_tensor *out, const void *model_ out->ptr, CUDA_R_32F, (int)out_dim, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); return cublas_ok(st, "f16 matmul"); } @@ -5496,6 +5623,22 @@ extern "C" int ds4_gpu_matmul_f16_pair_tensor( const __half *w0 = (const __half *)cuda_model_range_ptr(model_map, weight0_offset, weight_bytes, "f16_pair0"); const __half *w1 = (const __half *)cuda_model_range_ptr(model_map, weight1_offset, weight_bytes, "f16_pair1"); if (!w0 || !w1) return 0; +#ifdef __HIP_PLATFORM_AMD__ + if (!getenv("DS4_ROCM_NO_F16_PAIR_WARP_MATMUL")) { + constexpr uint32_t ROWS_PER_BLOCK = 8u; + const uint32_t grid = (uint32_t)((out_dim + ROWS_PER_BLOCK - 1u) / ROWS_PER_BLOCK); + matmul_f16_pair_warp_kernel<<>>( + (float *)out0->ptr, + (float *)out1->ptr, + w0, + w1, + (const float *)x->ptr, + in_dim, + out_dim, + out_dim); + return cuda_ok(cudaGetLastError(), "matmul_f16_pair_warp launch"); + } +#endif matmul_f16_pair_ordered_chunks_kernel<<<(unsigned)out_dim, 32>>>( (float *)out0->ptr, (float *)out1->ptr, @@ -6158,7 +6301,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads, if (!tmp) return 0; float *scores = tmp; float *out_tmp = (float *)((char *)tmp + out_offset); - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -6528,7 +6671,7 @@ static int attention_prefill_mixed_launch( n_comp, head_dim); if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0; - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -6733,7 +6876,7 @@ extern "C" int ds4_gpu_attention_output_q8_batch_tensor( (int)rank, (long long)rank * n_tokens, (int)n_groups, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); if (!cublas_ok(st, "attention output a gemm")) return 0; attention_unpack_group_low_kernel<<<(low_tmp_count + 255) / 256, 256>>>( @@ -7395,7 +7538,7 @@ __device__ static void dev_dot_q2_K_q8_K_block8( __device__ static float half_warp_sum_f32(float v, uint32_t lane16) { uint32_t mask = 0xffffu << (threadIdx.x & 16u); for (int offset = 8; offset > 0; offset >>= 1) { - v += __shfl_down_sync(mask, v, offset, 16); + v += __shfl_down_sync(static_cast(mask), v, offset, 16); } (void)lane16; return v; @@ -7404,7 +7547,7 @@ __device__ static float half_warp_sum_f32(float v, uint32_t lane16) { __device__ static float quarter_warp_sum_f32(float v, uint32_t lane8) { uint32_t mask = 0xffu << (threadIdx.x & 24u); for (int offset = 4; offset > 0; offset >>= 1) { - v += __shfl_down_sync(mask, v, offset, 8); + v += __shfl_down_sync(static_cast(mask), v, offset, 8); } (void)lane8; return v; diff --git a/ds4_rocm.h b/ds4_rocm.h new file mode 100644 index 000000000..55907c428 --- /dev/null +++ b/ds4_rocm.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include +#include + +#define cudaError_t hipError_t +#define cudaStream_t hipStream_t +#define cudaEvent_t hipEvent_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaMemLocation hipMemLocation + +#define cudaSuccess hipSuccess +#define cudaErrorNotSupported hipErrorNotSupported +#define cudaMemAttachGlobal hipMemAttachGlobal +#define cudaErrorInvalidValue hipErrorInvalidValue +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString + +#define cudaGetDevice hipGetDevice +#define cudaSetDevice hipSetDevice +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDevAttrPageableMemoryAccess hipDeviceAttributePageableMemoryAccess +#define cudaMemLocationTypeDevice hipMemLocationTypeDevice + +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaMallocManaged hipMallocManaged +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaMemset hipMemset +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemGetInfo hipMemGetInfo +#define cudaMemsetAsync hipMemsetAsync + +#define cudaHostRegister hipHostRegister +#define cudaHostUnregister hipHostUnregister +#define cudaHostGetDevicePointer hipHostGetDevicePointer +#define cudaHostRegisterMapped hipHostRegisterMapped +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly + +#define cudaMemAdvise(p1, p2, p3, p4) hipMemAdvise(p1, p2, p3, p4.id) +#define cudaMemPrefetchAsync(devPtr, count, location, flags, stream) hipMemPrefetchAsync(devPtr, count, location.id, stream) +#define cudaMemAdviseSetReadMostly hipMemAdviseSetReadMostly +#define cudaMemAdviseSetPreferredLocation hipMemAdviseSetPreferredLocation + +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamNonBlocking hipStreamNonBlocking + +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaEventDisableTiming hipEventDisableTiming + +#define cublasHandle_t hipblasHandle_t +#define cublasStatus_t hipblasStatus_t +#define cublasMath_t hipblasMath_t + +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_DEFAULT_MATH HIPBLAS_DEFAULT_MATH +#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F +#define CUBLAS_TF32_TENSOR_OP_MATH HIPBLAS_TF32_TENSOR_OP_MATH +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F + +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasSetMathMode hipblasSetMathMode +#define cublasSgemm hipblasSgemm +#define cublasSgemmStridedBatched hipblasSgemmStridedBatched +#define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx + +static __device__ __forceinline__ int32_t __vcmpne4(uint32_t a, uint32_t b) { + // For each byte: 0xFF if a != b, 0x00 if a == b + uint32_t diff = a ^ b; + // Spread any set bit in each byte to fill the whole byte + diff |= (diff >> 1); diff |= (diff >> 2); diff |= (diff >> 4); + diff &= 0x01010101u; + diff *= 0xFFu; // 0x01 -> 0xFF per byte + return (int32_t)diff; +} + +static __device__ __forceinline__ int32_t __vsub4(int32_t a, int32_t b) { + // Per-byte subtraction (wrapping, not saturating) + uint32_t ua = (uint32_t)a, ub = (uint32_t)b; + // Trick: subtract bytes in parallel avoiding cross-byte borrows + uint32_t diff = ((ua | 0x80808080u) - (ub & 0x7F7F7F7Fu)) ^ ((ua ^ ~ub) & 0x80808080u); + return (int32_t)diff; +} + +// __dp4a: dot product of 4 signed int8s packed in an int32 +static __device__ __forceinline__ int32_t __dp4a(int32_t a, int32_t b, int32_t c) { + const int8_t *a_bytes = reinterpret_cast(&a); + const int8_t *b_bytes = reinterpret_cast(&b); + return c + (int32_t)a_bytes[0] * b_bytes[0] + + (int32_t)a_bytes[1] * b_bytes[1] + + (int32_t)a_bytes[2] * b_bytes[2] + + (int32_t)a_bytes[3] * b_bytes[3]; +} + diff --git a/ds4_win.h b/ds4_win.h new file mode 100644 index 000000000..ce2356c13 --- /dev/null +++ b/ds4_win.h @@ -0,0 +1,312 @@ +/* ds4_win.h — minimal POSIX compatibility layer for native Windows builds. + * + * Provides just the POSIX surface ds4.c relies on that MinGW/UCRT lacks: + * - mmap / munmap / madvise (read-only file mappings) + * - sysconf(_SC_NPROCESSORS_ONLN / _SC_PAGESIZE) + * - flock / fcntl(F_SETFD,FD_CLOEXEC) / pread / ftruncate / dprintf (instance lock) + * - fmemopen (fixed-buffer "wb"/"rb", temp-file backed with copy-back on close) + * + * Header-only, self-contained, no third-party deps. The whole body is guarded by + * _WIN32, so this header is inert on POSIX platforms. ds4.c includes it in place + * of (and the other POSIX-only surface) behind #ifdef _WIN32, so the + * native MinGW-w64 CPU build needs no extra include/search-path flags. MinGW + * already provides pthread, clock_gettime and ftruncate. + */ +#ifndef DS4_WIN_H +#define DS4_WIN_H + +#ifdef _WIN32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ---- mmap ---------------------------------------------------------------- */ +#define PROT_NONE 0x0 +#define PROT_READ 0x1 +#define PROT_WRITE 0x2 +#define PROT_EXEC 0x4 +#define MAP_SHARED 0x01 +#define MAP_PRIVATE 0x02 +#define MAP_FAILED ((void *)-1) +#define POSIX_MADV_NORMAL 0 +#define POSIX_MADV_RANDOM 1 +#define POSIX_MADV_SEQUENTIAL 2 +#define POSIX_MADV_WILLNEED 3 +#define POSIX_MADV_DONTNEED 4 +#define MADV_WILLNEED POSIX_MADV_WILLNEED + +#ifndef _SC_PAGESIZE +#define _SC_PAGESIZE 0x1 +#endif +#ifndef _SC_NPROCESSORS_ONLN +#define _SC_NPROCESSORS_ONLN 0x2 +#endif + +/* ---- misc POSIX surface used by the GPU (HIP) host code ------------------ */ +/* MinGW/UCRT supplies these; the clang-MSVC HIP toolchain (ds4_cuda.cu build) + * does not. Guard each so the MinGW CPU build is unaffected. */ +#ifndef STDIN_FILENO +#define STDIN_FILENO 0 +#endif +#ifndef STDOUT_FILENO +#define STDOUT_FILENO 1 +#endif +#ifndef STDERR_FILENO +#define STDERR_FILENO 2 +#endif + +#ifndef SSIZE_MAX +#define SSIZE_MAX ((ssize_t)(((size_t)-1) >> 1)) +#endif + +/* ssize_t / off_t: MinGW defines these via ; MSVC does not. + * MSVC exposes _SSIZE_T_DEFINED once (pulled in by windows.h) and + * the CRT have declared SSIZE_T; provide ssize_t/off_t only when absent. */ +#if !defined(_SSIZE_T_DEFINED) && !defined(__MINGW32__) && !defined(_SSIZE_T_) +typedef SSIZE_T ssize_t; +#define _SSIZE_T_DEFINED +#endif +#if !defined(_OFF_T_DEFINED) && !defined(__MINGW32__) +/* MSVC already typedefs off_t to long; only define if missing. */ +#ifndef _OFF_T_ +typedef long long off_t; +#endif +#endif + +/* ---- 64-bit file stat --------------------------------------------------- */ +/* The Windows CRT's default `struct stat` / stat() / fstat() carry a 32-bit + * st_size, so stat'ing a file larger than 2 GB fails with EOVERFLOW + * ("value too large") — fatal for the ~80 GB DeepSeek V4 GGUF. Remap the bare + * names to the 64-bit `_stat64` family (which names both the struct and the + * functions, with an __int64 st_size). + * + * is pulled in here first so its own real declarations are parsed + * before the macros exist; thanks to its include guard, any later + * `#include ` in a translation unit (e.g. ds4.c includes it after + * this header; ds4_cuda.cu includes it before) is a no-op but still sees the + * remap. #undef first in case the CRT already exposes stat/fstat as macros. */ +#include +#undef stat +#undef fstat +#define stat _stat64 +#define fstat _fstat64 + +/* clock_gettime / CLOCK_MONOTONIC: present in MinGW, absent in clang-MSVC. + * MSVC's already declares struct timespec, so we only supply the + * clock id macros and the function. */ +#if !defined(CLOCK_MONOTONIC) && !defined(__MINGW32__) +#include +#define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 1 +static inline int clock_gettime(int clk, struct timespec *ts) +{ + (void)clk; + LARGE_INTEGER freq, cnt; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&cnt); + ts->tv_sec = (long long)(cnt.QuadPart / freq.QuadPart); + long long rem = cnt.QuadPart % freq.QuadPart; + ts->tv_nsec = (long)((rem * 1000000000LL) / freq.QuadPart); + return 0; +} +#endif + +/* ---- file locking / fd flags -------------------------------------------- */ +#ifndef F_SETFD +#define F_SETFD 2 +#endif +#ifndef FD_CLOEXEC +#define FD_CLOEXEC 1 +#endif +#define LOCK_SH 1 +#define LOCK_EX 2 +#define LOCK_NB 4 +#define LOCK_UN 8 + +static inline void *mmap(void *addr, size_t length, int prot, int flags, + int fd, long long offset) +{ + (void)addr; (void)flags; (void)prot; + HANDLE fh = (HANDLE)_get_osfhandle(fd); + if (fh == INVALID_HANDLE_VALUE) { errno = EBADF; return MAP_FAILED; } + HANDLE mh = CreateFileMappingA(fh, NULL, PAGE_READONLY, 0, 0, NULL); + if (mh == NULL) { errno = ENOMEM; return MAP_FAILED; } + DWORD off_hi = (DWORD)((uint64_t)offset >> 32); + DWORD off_lo = (DWORD)((uint64_t)offset & 0xFFFFFFFFu); + void *p = MapViewOfFile(mh, FILE_MAP_READ, off_hi, off_lo, length); + CloseHandle(mh); /* view keeps the section alive */ + if (p == NULL) { errno = ENOMEM; return MAP_FAILED; } + return p; +} + +static inline int munmap(void *addr, size_t length) +{ + (void)length; + return UnmapViewOfFile(addr) ? 0 : -1; +} + +static inline int posix_madvise(void *addr, size_t length, int advice) +{ + (void)addr; (void)length; (void)advice; + return 0; /* advisory only */ +} +static inline int madvise(void *addr, size_t length, int advice) +{ + return posix_madvise(addr, length, advice); +} + +static inline long sysconf(int name) +{ + SYSTEM_INFO si; + GetSystemInfo(&si); + if (name == _SC_NPROCESSORS_ONLN) return (long)si.dwNumberOfProcessors; + if (name == _SC_PAGESIZE) return (long)si.dwPageSize; + errno = EINVAL; + return -1; +} + +static inline int flock(int fd, int op) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; } + OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); + if (op & LOCK_UN) { + return UnlockFileEx(h, 0, MAXDWORD, MAXDWORD, &ov) ? 0 : -1; + } + DWORD f = 0; + if (op & LOCK_EX) f |= LOCKFILE_EXCLUSIVE_LOCK; + if (op & LOCK_NB) f |= LOCKFILE_FAIL_IMMEDIATELY; + if (!LockFileEx(h, f, 0, MAXDWORD, MAXDWORD, &ov)) { + errno = (GetLastError() == ERROR_LOCK_VIOLATION) ? EWOULDBLOCK : EACCES; + return -1; + } + return 0; +} + +static inline int fcntl(int fd, int cmd, ...) +{ + (void)fd; (void)cmd; + return 0; /* F_SETFD/FD_CLOEXEC is a no-op: Windows handles aren't inherited by default */ +} + +static inline long long ds4_pread(int fd, void *buf, size_t count, long long offset) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; } + OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); + ov.Offset = (DWORD)((uint64_t)offset & 0xFFFFFFFFu); + ov.OffsetHigh = (DWORD)((uint64_t)offset >> 32); + DWORD got = 0; + if (!ReadFile(h, buf, (DWORD)count, &got, &ov)) { + if (GetLastError() == ERROR_HANDLE_EOF) return 0; + errno = EIO; return -1; + } + return (long long)got; +} +#define pread(fd, buf, count, offset) ds4_pread((fd), (buf), (size_t)(count), (long long)(offset)) + +/* ftruncate: provided by MinGW ; absent in the MSVC ABI build. */ +#if !defined(__MINGW32__) +static inline int ftruncate(int fd, long long length) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; } + LARGE_INTEGER li; li.QuadPart = length; + if (!SetFilePointerEx(h, li, NULL, FILE_BEGIN)) { errno = EINVAL; return -1; } + if (!SetEndOfFile(h)) { errno = EIO; return -1; } + return 0; +} +#endif + +static inline int dprintf(int fd, const char *fmt, ...) +{ + char buf[512]; + va_list ap; va_start(ap, fmt); + int n = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + if (n < 0) return -1; + if (n > (int)sizeof(buf)) n = (int)sizeof(buf); + return _write(fd, buf, (unsigned)n); +} + +/* ---- fmemopen (temp-file backed, fixed buffer) --------------------------- */ +typedef struct { FILE *fp; void *buf; size_t cap; int writeback; } ds4_memstream; +#define DS4_MEMSTREAM_MAX 16 +static ds4_memstream ds4_ms_tab[DS4_MEMSTREAM_MAX]; +static CRITICAL_SECTION ds4_ms_cs; +static volatile LONG ds4_ms_init = 0; + +static inline void ds4_ms_ensure(void) +{ + if (InterlockedCompareExchange(&ds4_ms_init, 1, 0) == 0) + InitializeCriticalSection(&ds4_ms_cs); +} + +static inline FILE *ds4_tmpfile(void) +{ + char dir[MAX_PATH], path[MAX_PATH]; + if (!GetTempPathA(sizeof(dir), dir)) return NULL; + if (!GetTempFileNameA(dir, "ds4", 0, path)) return NULL; + /* open read/write, delete on close */ + return fopen(path, "wb+TD"); /* T=temporary, D=delete-on-close (MSVCRT ext) */ +} + +static inline FILE *fmemopen(void *buf, size_t size, const char *mode) +{ + ds4_ms_ensure(); + int writing = (mode && (strchr(mode, 'w') || strchr(mode, 'a') || strchr(mode, '+'))); + FILE *fp = ds4_tmpfile(); + if (!fp) return NULL; + if (!writing && buf && size) { + if (fwrite(buf, 1, size, fp) != size) { fclose(fp); return NULL; } + rewind(fp); + } + EnterCriticalSection(&ds4_ms_cs); + for (int i = 0; i < DS4_MEMSTREAM_MAX; i++) { + if (ds4_ms_tab[i].fp == NULL) { + ds4_ms_tab[i].fp = fp; ds4_ms_tab[i].buf = buf; + ds4_ms_tab[i].cap = size; ds4_ms_tab[i].writeback = writing ? 1 : 0; + break; + } + } + LeaveCriticalSection(&ds4_ms_cs); + return fp; +} + +static inline int ds4_win_fclose(FILE *fp) +{ + if (fp && ds4_ms_init) { + EnterCriticalSection(&ds4_ms_cs); + for (int i = 0; i < DS4_MEMSTREAM_MAX; i++) { + if (ds4_ms_tab[i].fp == fp) { + if (ds4_ms_tab[i].writeback && ds4_ms_tab[i].buf && ds4_ms_tab[i].cap) { + fflush(fp); rewind(fp); + fread(ds4_ms_tab[i].buf, 1, ds4_ms_tab[i].cap, fp); /* copy back */ + } + ds4_ms_tab[i].fp = NULL; ds4_ms_tab[i].buf = NULL; + ds4_ms_tab[i].cap = 0; ds4_ms_tab[i].writeback = 0; + break; + } + } + LeaveCriticalSection(&ds4_ms_cs); + } + return fclose(fp); /* real fclose — macro defined only after this header */ +} + +#endif /* _WIN32 */ + +/* Redirect fclose AFTER all helpers above so ds4_win_fclose's own call hits the + * real fclose. Source files including this header get the memory-stream-aware one. */ +#ifdef _WIN32 +#define fclose(fp) ds4_win_fclose(fp) +#endif + +#endif /* DS4_WIN_H */ diff --git a/win/README.md b/win/README.md new file mode 100644 index 000000000..ecf4189a3 --- /dev/null +++ b/win/README.md @@ -0,0 +1,165 @@ +# Native Windows build (experimental) + +DS4 assumes a POSIX environment. `ds4_win.h` (top level) is a small, +dependency-free compatibility shim that lets the **CPU backend** build with +native MinGW-w64 GCC (no WSL, no Cygwin, no MSVC). It supplies the POSIX surface +MinGW/UCRT lacks: `mmap`/`munmap`/`madvise`, `sysconf`, `flock`/`fcntl`/`pread`/ +`dprintf`, and a temp-file-backed `fmemopen`. MinGW already provides `pthread`, +`clock_gettime`, and `ftruncate`. + +The shim is wired in-tree: `ds4.c` includes `ds4_win.h` in place of +`` (and the other POSIX-only headers) behind `#ifdef _WIN32`. The +header's entire body is guarded by `_WIN32`, so POSIX builds are byte-for-byte +unchanged. No special include/search-path flags are needed. + +## Build `ds4-bench.exe` (CPU) + +### With make (MinGW/MSYS) + +```sh +make windows-cpu +``` + +`uname -s` on MinGW/MSYS reports `MINGW64_NT*` / `MSYS_NT*`; the Makefile detects +this and selects a Windows branch that defaults `CC` to `gcc`. + +### Direct gcc (no make) + +```sh +CF="-O3 -ffast-math -march=native -std=c99 -D_GNU_SOURCE -fno-finite-math-only \ + -DDS4_NO_GPU -D_CRT_SECURE_NO_WARNINGS" +gcc $CF -c ds4.c -o ds4_cpu.o +gcc $CF -c ds4_bench.c -o ds4_bench_cpu.o +gcc $CF -o ds4-bench.exe ds4_bench_cpu.o ds4_cpu.o -lm +``` + +Toolchain used: `x86_64-w64-mingw32` GCC 15.2.0. + +### Verify it runs + +```sh +$ ./ds4-bench.exe +ds4-bench: specify exactly one of --prompt-file or --chat-prompt-file +``` + +(Exit code 2 — argument validation fires, proving the binary executes.) + +## Build `ds4-bench.exe` (ROCm / HIP, AMD GPU — gfx1151) + +The GPU backend (`ds4_cuda.cu`, the unified CUDA/HIP source) builds **natively on +Windows** with the AMD HIP SDK — no WSL, no MSVC on `PATH`, and no full Visual +Studio install needed at the command line. Target: AMD Strix Halo (gfx1151). + +### Prerequisites + +- **AMD HIP SDK** (default `C:/Program Files/AMD/ROCm/7.1`), providing + `bin/hipcc.exe`, `bin/clang.exe`, `include/hipblas/…`, `bin/libhipblas.dll`, + and the gfx1151 device bitcode (`amdgcn/bitcode/oclc_isa_version_1151.bc`). +- **`llvm-dlltool`** reachable on `PATH` (or a scoop LLVM install at + `~/scoop/apps/llvm/current`). Used **once** to synthesize an MSVC-style + `hipblas.lib` from the SDK's `libhipblas.dll` (see "Why a generated import + lib" below). After first run the lib is cached in `win/third_party/`. +- **Vendored rocWMMA header** (in-tree, already committed). The Windows HIP SDK + does **not** ship the header-only rocWMMA library, but `ds4_rocm.h` includes + ``. A faithful copy of the CMake-configured + version header (rocWMMA 2.2.1, the release that ships with ROCm 7.x) lives at + `win/third_party/rocwmma/rocwmma/rocwmma-version.hpp`. rocWMMA is MIT-licensed + and header-only; only the version header is needed because DS4's WMMA kernel + path is CUDA-only (guarded by `__CUDA_ARCH__`) and is not compiled for HIP. + +### With make (MinGW/MSYS) + +```sh +make windows-rocm +# overridable: +make windows-rocm ROCM_PATH="C:/Program Files/AMD/ROCm/7.1" ROCM_ARCH=gfx1151 +``` + +### Direct (the script make calls) + +```sh +ROCM_PATH="C:/Program Files/AMD/ROCm/7.1" ROCM_ARCH=gfx1151 win/build-rocm.sh +``` + +The build: + +1. Generates `win/third_party/hipblas.lib` from `libhipblas.dll` (first run only). +2. Compiles `ds4_cuda.cu` with `hipcc.exe --offload-arch=gfx1151 + -D__HIP_PLATFORM_AMD__ -I win/third_party/rocwmma`. +3. Compiles the host C files (`ds4.c`, `ds4_bench.c`) with + `clang --target=x86_64-pc-windows-msvc … -DDS4_WIN_PTHREAD` so they share the + **MSVC ABI** with the hipcc-built `ds4_cuda.o` (mixing the MinGW and MSVC C + runtimes across the `FILE*`/heap boundary is unsafe, so the GPU build uses + MSVC ABI throughout — unlike the MinGW CPU build above). +4. Links `ds4-bench.exe` with `hipcc.exe`, pulling in `amdhip64_7.dll` and + `libhipblas.dll`. + +### Verify it runs + +```sh +$ PATH="C:/Program Files/AMD/ROCm/7.1/bin:$PATH" \ + DS4_LOCK_FILE="$TEMP/ds4.lock" ./ds4-bench.exe -m model.gguf --prompt-file p.txt +ds4-bench: context buffers 753.89 MiB (ctx=32897, backend=cuda, …) +``` + +The SDK `bin` must be on `PATH` at runtime for the HIP/hipBLAS DLLs. The binary +initializes the GPU backend (reported as `backend=cuda`, the unified GPU path +name) before touching the model file. Full inference is not yet runtime-verified +here — the ~80 GB model needs a BIOS UMA memory re-split + reboot — but the HIP +runtime and hipBLAS load and the gfx1151 binary links and starts cleanly. + +### Why a generated import lib + +The Windows HIP SDK ships hipBLAS with only the **MinGW-style** +`lib/libhipblas.dll.a`, which the MSVC linker (`lld-link`, used by `hipcc` on +Windows) cannot consume. The HIP runtime itself has a proper MSVC import lib +(`lib/amdhip64.lib`), but hipBLAS does not. `win/build-rocm.sh` therefore dumps +`libhipblas.dll`'s export table and synthesizes a COFF `hipblas.lib` with +`llvm-dlltool`. The generated lib is git-ignored and regenerated on demand. + +### Native-Windows portability shims (all behind `#ifdef _WIN32`) + +- `ds4_win.h` — the existing POSIX shim, extended for the MSVC-ABI GPU build + with `STDERR_FILENO`/`ssize_t`/`SSIZE_MAX`/`off_t`, `clock_gettime` + + `CLOCK_MONOTONIC`, and `ftruncate` (MinGW already supplies these; the + additions are guarded `!defined(__MINGW32__)` so the CPU build is unchanged). +- `win/ds4_pthread_win.h` — a header-only Win32 pthread shim (threads, mutex, + condition variable, once) used when `DS4_WIN_PTHREAD` is defined, because the + MSVC toolchain has no ``. The MinGW CPU build keeps winpthreads. +- `st_blksize` (a Linux `O_DIRECT` alignment hint) is skipped on Windows; the + whole direct-I/O block is already `#if defined(__linux__) && defined(O_DIRECT)`. + +All edits to `ds4.c`, `ds4_bench.c`, and `ds4_cuda.cu` are behind `_WIN32` +(further sub-guarded by `__MINGW32__` / `DS4_WIN_PTHREAD`), so the POSIX, macOS, +CUDA, and Linux-ROCm builds produce byte-identical preprocessor output. + +## Status + +| Target | Native Windows build | Notes | +|----------------------|----------------------|-------| +| `ds4-bench` (CPU) | builds & runs | MinGW-w64, no terminal/socket deps | +| `ds4-bench` (ROCm) | builds, links, starts| HIP/clang-MSVC, gfx1151; full inference pending model + UMA re-split | +| `ds4` (CLI) | not yet | `linenoise.c` uses POSIX `termios`, plus `sigaction` in `ds4_cli.c` (needs Win console raw-mode port) | +| `ds4-server` | not yet | BSD sockets / `poll` / `arpa/inet.h` (needs Winsock port) | + +## Runtime note + +Set `DS4_LOCK_FILE` to a Windows path (the default is `/tmp/ds4.lock`, which does +not exist on Windows) before running, e.g.: + +```sh +export DS4_LOCK_FILE="$TEMP/ds4.lock" +``` + +For the **ROCm** build, also: + +- Put the HIP SDK `bin` on `PATH` at runtime (`amdhip64_7.dll`, `libhipblas.dll`). +- Set `DS4_CUDA_MANAGED=1` to use the full UMA pool (managed memory) once a model + is being loaded — required for the large Strix Halo UMA allocation. + +## Deferred work + +- **CLI (`ds4`)**: port `linenoise.c` to the Windows console (raw mode via + `SetConsoleMode`) and replace the `sigaction`/`SIGINT` handling in `ds4_cli.c`. +- **Server (`ds4-server`)**: port the BSD sockets / `poll(2)` event loop to + Winsock 2 (`WSAStartup`, `WSAPoll`, `closesocket`). diff --git a/win/build-rocm.sh b/win/build-rocm.sh new file mode 100644 index 000000000..0f629424b --- /dev/null +++ b/win/build-rocm.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +# build-rocm.sh — native Windows ROCm/HIP build of ds4-bench.exe for gfx1151. +# +# Builds DS4's GPU (HIP) backend natively on Windows with the AMD HIP SDK — no +# WSL, no MSVC on PATH, no full Visual Studio install required at the command +# line (hipcc auto-discovers the VS build tools' headers). Produces a gfx1151 +# ROCm binary: ds4-bench.exe. +# +# Why a script instead of pure Make: hipcc.exe's .bat wrapper splits arguments +# on spaces, so paths like "C:/Program Files/AMD/ROCm/7.1" break -I/-L flags. +# This script relies on the SDK's default include/lib search (hipcc adds the SDK +# include via -idirafter automatically) and a space-free path for the hipblas +# import lib, sidestepping the quoting problem. +# +# Usage: +# win/build-rocm.sh # uses defaults below +# ROCM_PATH="C:/Program Files/AMD/ROCm/7.1" ROCM_ARCH=gfx1151 win/build-rocm.sh +# +# Requirements: +# - AMD HIP SDK (default C:/Program Files/AMD/ROCm/7.1) with hipcc.exe + clang. +# - Vendored rocWMMA version header at win/third_party/rocwmma (in-tree). +# - hipblas.lib (MSVC import lib). Generated on the fly from libhipblas.dll if +# win/third_party/hipblas.lib is absent (needs llvm-dlltool on PATH or in the +# scoop LLVM install). +set -euo pipefail + +ROCM_PATH="${ROCM_PATH:-C:/Program Files/AMD/ROCm/7.1}" +ROCM_ARCH="${ROCM_ARCH:-gfx1151}" + +# Resolve the repo root (this script lives in win/). +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$HERE" + +HIPCC="$ROCM_PATH/bin/hipcc.exe" +CLANG="$ROCM_PATH/bin/clang.exe" +THIRD="win/third_party" +ROCWMMA_INC="$THIRD/rocwmma" +HIPBLAS_LIB="$THIRD/hipblas.lib" + +if [ ! -x "$HIPCC" ]; then + echo "error: hipcc.exe not found at '$HIPCC' (set ROCM_PATH)" >&2 + exit 2 +fi + +# --- ensure an MSVC-style hipblas import lib exists ------------------------- +# The Windows HIP SDK ships only the MinGW-style libhipblas.dll.a, which the +# MSVC linker (lld-link) cannot consume. Generate hipblas.lib from the DLL's +# export table once and cache it in win/third_party/. +if [ ! -f "$HIPBLAS_LIB" ]; then + echo "==> generating $HIPBLAS_LIB from libhipblas.dll" + DLLTOOL="$(command -v llvm-dlltool.exe 2>/dev/null || true)" + if [ -z "$DLLTOOL" ] && [ -x "$HOME/scoop/apps/llvm/current/bin/llvm-dlltool.exe" ]; then + DLLTOOL="$HOME/scoop/apps/llvm/current/bin/llvm-dlltool.exe" + fi + if [ -z "$DLLTOOL" ]; then + echo "error: need llvm-dlltool to build hipblas.lib (install LLVM or scoop llvm)" >&2 + exit 2 + fi + OBJDUMP="$ROCM_PATH/bin/llvm-objdump.exe" + DEF="$(mktemp)" + { echo "LIBRARY libhipblas.dll"; echo "EXPORTS"; \ + "$OBJDUMP" -p "$ROCM_PATH/bin/libhipblas.dll" \ + | awk '/Export Table:/{f=1} f&&/^[[:space:]]+[0-9]+[[:space:]]+0x[0-9a-f]+[[:space:]]+/{print $NF}' \ + | sort -u; } > "$DEF" + "$DLLTOOL" -m i386:x86-64 -d "$DEF" -l "$HIPBLAS_LIB" -D libhipblas.dll + rm -f "$DEF" +fi + +# --- common flags ---------------------------------------------------------- +# Host C files are compiled in the MSVC ABI (clang --target=...-windows-msvc) so +# they link against the hipcc-produced (MSVC-ABI) ds4_cuda.o. DS4_WIN_PTHREAD +# selects the Win32 pthread shim (MSVC has no ). +HOSTFLAGS="--target=x86_64-pc-windows-msvc -O3 -ffast-math -fno-finite-math-only \ + -DDS4_WIN_PTHREAD -D_CRT_SECURE_NO_WARNINGS \ + -Wno-deprecated-declarations -Wno-unused-command-line-argument" + +GPUFLAGS="--offload-arch=$ROCM_ARCH -O3 -fno-finite-math-only \ + -D__HIP_PLATFORM_AMD__ -D_CRT_SECURE_NO_WARNINGS \ + -Wno-deprecated-declarations -Wno-unused-command-line-argument -I$ROCWMMA_INC" + +echo "==> compiling ds4_cuda.cu (HIP, $ROCM_ARCH)" +"$HIPCC" $GPUFLAGS -c ds4_cuda.cu -o ds4_cuda.o + +echo "==> compiling ds4.c (host, MSVC ABI)" +"$CLANG" $HOSTFLAGS -c ds4.c -o ds4.o + +echo "==> compiling ds4_bench.c (host, MSVC ABI)" +"$CLANG" $HOSTFLAGS -c ds4_bench.c -o ds4_bench.o + +echo "==> linking ds4-bench.exe" +"$HIPCC" --offload-arch="$ROCM_ARCH" ds4_bench.o ds4.o ds4_cuda.o \ + -o ds4-bench.exe -L"$THIRD" -lhipblas + +echo "==> done: ds4-bench.exe" +echo " Run with the SDK bin on PATH, e.g.:" +echo " PATH=\"$ROCM_PATH/bin:\$PATH\" ./ds4-bench.exe --prompt-file FILE -m MODEL.gguf" diff --git a/win/ds4_pthread_win.h b/win/ds4_pthread_win.h new file mode 100644 index 000000000..8eab83ee3 --- /dev/null +++ b/win/ds4_pthread_win.h @@ -0,0 +1,146 @@ +/* ds4_pthread_win.h — minimal pthread shim for native Windows GPU builds. + * + * The native-Windows CPU build (MinGW-w64) gets pthreads from winpthreads, but + * the native-Windows ROCm/HIP build compiles the C host code with clang in the + * MSVC ABI (to match the hipcc-built ds4_cuda.o), and the MSVC toolchain has no + * . This header implements exactly the pthread subset DS4 uses on + * top of the Win32 threading primitives: + * + * pthread_t, pthread_create, pthread_join + * pthread_mutex_t / _init / _lock / _unlock / _destroy + * pthread_cond_t / _init / _wait / _signal / _broadcast / _destroy + * pthread_once_t / pthread_once / PTHREAD_ONCE_INIT + * + * Header-only and self-contained. The entire body is guarded by _WIN32, and it + * is only pulled in for the Windows GPU build (not the MinGW CPU build, which + * already has real pthreads), so POSIX builds are completely unaffected. + * + * Only included from ds4_win.h, and only when DS4_WIN_PTHREAD is requested, so + * the MinGW CPU build keeps using winpthreads. + */ +#ifndef DS4_PTHREAD_WIN_H +#define DS4_PTHREAD_WIN_H + +#ifdef _WIN32 + +#include +#include +#include + +/* ---- threads ------------------------------------------------------------- */ +typedef struct { + HANDLE handle; + void *(*start)(void *); + void *arg; + void *retval; +} ds4_pthread_state; +typedef ds4_pthread_state *pthread_t; + +static unsigned __stdcall ds4_pthread_trampoline(void *p) +{ + ds4_pthread_state *st = (ds4_pthread_state *)p; + st->retval = st->start(st->arg); + return 0; +} + +static inline int pthread_create(pthread_t *thread, const void *attr, + void *(*start)(void *), void *arg) +{ + (void)attr; + ds4_pthread_state *st = (ds4_pthread_state *)calloc(1, sizeof(*st)); + if (!st) return EAGAIN; + st->start = start; + st->arg = arg; + uintptr_t h = _beginthreadex(NULL, 0, ds4_pthread_trampoline, st, 0, NULL); + if (h == 0) { free(st); return EAGAIN; } + st->handle = (HANDLE)h; + *thread = st; + return 0; +} + +static inline int pthread_join(pthread_t thread, void **retval) +{ + if (!thread) return EINVAL; + WaitForSingleObject(thread->handle, INFINITE); + if (retval) *retval = thread->retval; + CloseHandle(thread->handle); + free(thread); + return 0; +} + +/* ---- mutex (non-recursive; matches PTHREAD default) ---------------------- */ +typedef SRWLOCK pthread_mutex_t; +#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT + +static inline int pthread_mutex_init(pthread_mutex_t *m, const void *attr) +{ + (void)attr; + InitializeSRWLock(m); + return 0; +} +static inline int pthread_mutex_lock(pthread_mutex_t *m) +{ + AcquireSRWLockExclusive(m); + return 0; +} +static inline int pthread_mutex_unlock(pthread_mutex_t *m) +{ + ReleaseSRWLockExclusive(m); + return 0; +} +static inline int pthread_mutex_destroy(pthread_mutex_t *m) +{ + (void)m; /* SRWLOCK needs no teardown */ + return 0; +} + +/* ---- condition variable -------------------------------------------------- */ +typedef CONDITION_VARIABLE pthread_cond_t; +#define PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT + +static inline int pthread_cond_init(pthread_cond_t *c, const void *attr) +{ + (void)attr; + InitializeConditionVariable(c); + return 0; +} +static inline int pthread_cond_wait(pthread_cond_t *c, pthread_mutex_t *m) +{ + /* SRWLOCK held exclusively → CONDITION_VARIABLE_LOCKMODE default (0). */ + return SleepConditionVariableSRW(c, m, INFINITE, 0) ? 0 : EINVAL; +} +static inline int pthread_cond_signal(pthread_cond_t *c) +{ + WakeConditionVariable(c); + return 0; +} +static inline int pthread_cond_broadcast(pthread_cond_t *c) +{ + WakeAllConditionVariable(c); + return 0; +} +static inline int pthread_cond_destroy(pthread_cond_t *c) +{ + (void)c; /* CONDITION_VARIABLE needs no teardown */ + return 0; +} + +/* ---- one-time init ------------------------------------------------------- */ +typedef INIT_ONCE pthread_once_t; +#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT + +static void (*ds4_once_fn)(void); +static BOOL CALLBACK ds4_once_trampoline(PINIT_ONCE io, PVOID param, PVOID *ctx) +{ + (void)io; (void)ctx; + ((void (*)(void))param)(); + return TRUE; +} +static inline int pthread_once(pthread_once_t *once, void (*init)(void)) +{ + InitOnceExecuteOnce(once, ds4_once_trampoline, (PVOID)init, NULL); + return 0; +} + +#endif /* _WIN32 */ +#endif /* DS4_PTHREAD_WIN_H */ diff --git a/win/third_party/rocwmma/rocwmma/rocwmma-version.hpp b/win/third_party/rocwmma/rocwmma/rocwmma-version.hpp new file mode 100644 index 000000000..ec7554058 --- /dev/null +++ b/win/third_party/rocwmma/rocwmma/rocwmma-version.hpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +//! @file +//! @brief rocwmma-version.hpp provides the configured version and settings +//! +//! Vendored for the native-Windows ROCm build of DS4. The Windows HIP SDK +//! (C:/Program Files/AMD/ROCm/7.1) does NOT ship the header-only rocWMMA +//! library, but ds4_rocm.h includes this version header. The values below are +//! the CMake-configured output of +//! library/include/rocwmma/internal/rocwmma-version.hpp.in from +//! github.com/ROCm/rocWMMA (VERSION_STRING 2.2.1, the rocWMMA release that +//! ships with ROCm 7.x). rocWMMA is header-only and MIT-licensed; only the +//! version header is required because DS4's wmma path is CUDA-only +//! (guarded by __CUDA_ARCH__) and not compiled for HIP. + +#ifndef ROCWMMA_API_VERSION_HPP +#define ROCWMMA_API_VERSION_HPP + +#include + +// clang-format off +#define ROCWMMA_VERSION_MAJOR 2 +#define ROCWMMA_VERSION_MINOR 2 +#define ROCWMMA_VERSION_PATCH 1 +// clang-format on + +inline std::string rocwmma_get_version() +{ + return std::to_string(ROCWMMA_VERSION_MAJOR) + "." + std::to_string(ROCWMMA_VERSION_MINOR) + "." + + std::to_string(ROCWMMA_VERSION_PATCH); +} + +#endif // ROCWMMA_API_VERSION_HPP