diff --git a/.gitignore b/.gitignore index 2c70e4d61..3aa15bbfa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ /ds4 /ds4-server /ds4-bench +/ds4-bench.exe /ds4_native /ds4_server_test /ds4_test @@ -8,6 +9,7 @@ /TODO.md /gguf/ *.o +*.exe *.dSYM/ /misc/ .*.swp diff --git a/Makefile b/Makefile index 42cde3c35..4629e8d76 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,28 @@ CC ?= cc UNAME_S := $(shell uname -s) +# On MinGW/MSYS `uname -s` is e.g. MINGW64_NT-10.0 or MSYS_NT-10.0. +IS_WINDOWS := $(filter MINGW% MSYS%,$(UNAME_S)) + +# MinGW has no `cc`; default the compiler to gcc there (still overridable). +ifneq ($(IS_WINDOWS),) +ifeq ($(origin CC),default) +CC := gcc +endif +endif + ifeq ($(UNAME_S),Darwin) NATIVE_CPU_FLAG ?= -mcpu=native else NATIVE_CPU_FLAG ?= -march=native endif +# Native Windows (MinGW-w64) CPU build flags. ds4.c pulls in the dependency-free +# POSIX shim (ds4_win.h) behind #ifdef _WIN32; no extra -I/-include is needed. +WIN_CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -std=c99 -D_GNU_SOURCE \ + -fno-finite-math-only -DDS4_NO_GPU -D_CRT_SECURE_NO_WARNINGS +WIN_LDLIBS ?= -lm + CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c99 OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc @@ -18,7 +34,23 @@ METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal CORE_OBJS = ds4.o ds4_metal.o CPU_CORE_OBJS = ds4_cpu.o else + CFLAGS += -D_GNU_SOURCE -fno-finite-math-only + +ifeq ($(GPU_BACKEND),rocm) +ROCM_PATH ?= /opt/rocm +GPU_CC = $(ROCM_PATH)/bin/hipcc +ROCM_ARCH ?= gfx1151 + +GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH) +GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas + +@echo "ROCM_ARCH: $(ROCM_ARCH)" + +EXTRA_DEPS = ds4_rocm.h + +else + CUDA_HOME ?= /usr/local/cuda NVCC ?= $(CUDA_HOME)/bin/nvcc CUDA_ARCH ?= @@ -27,12 +59,21 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH) endif NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas + +GPU_CC = $(NVCC) +GPU_CFLAGS = $(NVCCFLAGS) +GPU_LDLIBS = $(CUDA_LDLIBS) + +endif + CORE_OBJS = ds4.o ds4_cuda.o +EXTRA_DEPS = CPU_CORE_OBJS = ds4_cpu.o METAL_LDLIBS := $(LDLIBS) + endif -.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression +.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression windows-cpu ifeq ($(UNAME_S),Darwin) all: ds4 ds4-server ds4-bench @@ -60,6 +101,27 @@ cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE cuda-regression: @echo "cuda-regression requires a CUDA build" + +else ifneq ($(IS_WINDOWS),) +# ---- Native Windows (MinGW-w64) ------------------------------------------ +# Only the CPU bench is portable today. The CLI (linenoise/termios + sigaction) +# and server (BSD sockets/poll) still need Windows ports; see win/README.md. +all: help + +help: + @echo "DS4 build targets (native Windows / MinGW-w64):" + @echo " make windows-cpu Build native Windows CPU ./ds4-bench.exe" + @echo " make clean Remove build outputs" + @echo "" + @echo " ds4 (CLI) and ds4-server are not yet ported to Windows." + +windows-cpu: ds4-bench.exe + +ds4-bench.exe: ds4_bench.c ds4.c ds4.h ds4_gpu.h ds4_win.h + $(CC) $(WIN_CFLAGS) -c -o ds4_cpu.o ds4.c + $(CC) $(WIN_CFLAGS) -c -o ds4_bench_cpu.o ds4_bench.c + $(CC) $(WIN_CFLAGS) -o $@ ds4_bench_cpu.o ds4_cpu.o $(WIN_LDLIBS) + else all: help @@ -69,6 +131,7 @@ help: @echo " make cuda-generic Build CUDA for a generic local CUDA GPU" @echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value" @echo " make cpu Build CPU-only ./ds4, ./ds4-server, and ./ds4-bench" + @echo " make rocm Build ROCm" @echo " make test Build and run tests" @echo " make clean Remove build outputs" @@ -86,14 +149,22 @@ cuda: fi $(MAKE) ds4 ds4-server ds4-bench CUDA_ARCH="$(CUDA_ARCH)" +rocm: + @if [ -z "$(strip $(ROCM_ARCH))" ]; then \ + echo "error: specify ROCM_ARCH, for example: make rocm ROCM_ARCH=gfx1151"; \ + exit 2; \ + fi + $(MAKE) ds4 ds4-server ds4-bench GPU_BACKEND=rocm ROCM_ARCH=$(ROCM_ARCH) + + ds4: ds4_cli.o linenoise.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4-server: ds4_server.o rax.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4-bench: ds4_bench.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS) $(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) @@ -143,11 +214,11 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS) $(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m -ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc - $(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu +ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS) + $(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4_test: ds4_test.o rax.o $(CORE_OBJS) ifeq ($(UNAME_S),Darwin) @@ -160,4 +231,5 @@ test: ds4_test ./ds4_test clean: - rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o + rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o *.exe tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o + diff --git a/ds4.c b/ds4.c index 51410e335..80c4ed320 100644 --- a/ds4.c +++ b/ds4.c @@ -27,12 +27,22 @@ #include #include #include +#ifdef _WIN32 +/* Native Windows (MinGW-w64) CPU build: a small dependency-free POSIX shim + * supplies mmap/flock/pread/sysconf/dprintf/fmemopen. See ds4_win.h. */ +#include "ds4_win.h" +#include +#include +#include +#include +#else #include #include #include #include #include #include +#endif #include "ds4.h" diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 3b224f99e..8b6241ca3 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -1,8 +1,18 @@ +#ifdef __HIP_PLATFORM_AMD__ +#include "ds4_rocm.h" + +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL +#define MASK_T uint64_t +#else #include #include #include #include +#define FULL_WARP_MASK 0xFFFFFFFFu +#define MASK_T uint32_t +#endif + #include #include #include @@ -1722,14 +1732,14 @@ __global__ static void f32_to_f16_kernel(__half *out, const float *x, uint64_t n __device__ static float warp_sum_f32(float v) { for (int offset = 16; offset > 0; offset >>= 1) { - v += __shfl_down_sync(0xffffffffu, v, offset); + v += __shfl_down_sync(FULL_WARP_MASK, v, offset); } return v; } __device__ static float warp_max_f32(float v) { for (int offset = 16; offset > 0; offset >>= 1) { - v = fmaxf(v, __shfl_down_sync(0xffffffffu, v, offset)); + v = fmaxf(v, __shfl_down_sync(FULL_WARP_MASK, v, offset)); } return v; } @@ -2846,7 +2856,7 @@ __global__ static void attention_decode_mixed_kernel( for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d]; const uint32_t mask = 0xffu << (threadIdx.x & 24u); for (uint32_t off = 4u; off > 0u; off >>= 1u) { - dot += __shfl_down_sync(mask, dot, off, 8); + dot += __shfl_down_sync(static_cast(mask), dot, off, 8); } s = dot * scale + add; } @@ -3008,7 +3018,7 @@ __global__ static void attention_indexed_mixed_kernel( for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d]; const uint32_t mask = 0xffu << (threadIdx.x & 24u); for (uint32_t off = 4u; off > 0u; off >>= 1u) { - dot += __shfl_down_sync(mask, dot, off, 8); + dot += __shfl_down_sync(static_cast(mask), dot, off, 8); } if (qlane == 0) scores[row] = dot * scale; } @@ -3189,7 +3199,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel( const float *score_row = scores + warp * 768u; for (uint32_t i = lane; i < n_score; i += 32u) max_s = fmaxf(max_s, score_row[i]); max_s = warp_max_f32(max_s); - max_s = __shfl_sync(0xffffffffu, max_s, 0); + max_s = __shfl_sync(FULL_WARP_MASK, max_s, 0); } float den = 0.0f; if (valid_head) { @@ -3201,7 +3211,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel( } den = warp_sum_f32(den); den += expf(sinks[head] - max_s); - den = __shfl_sync(0xffffffffu, den, 0); + den = __shfl_sync(FULL_WARP_MASK, den, 0); } float4 o0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -3361,7 +3371,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -3485,7 +3495,7 @@ __global__ static void attention_static_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -3650,7 +3660,7 @@ __global__ static void attention_decode_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -4268,9 +4278,9 @@ __global__ static void router_select_warp_topk_kernel( } #pragma unroll for (uint32_t mask = 16u; mask > 0u; mask >>= 1u) { - const float other_score = __shfl_xor_sync(0xffffffffu, best_score, mask); - const float other_prob = __shfl_xor_sync(0xffffffffu, best_prob, mask); - const uint32_t other_idx = __shfl_xor_sync(0xffffffffu, best_idx, mask); + const float other_score = __shfl_xor_sync(FULL_WARP_MASK, best_score, mask); + const float other_prob = __shfl_xor_sync(FULL_WARP_MASK, best_prob, mask); + const uint32_t other_idx = __shfl_xor_sync(FULL_WARP_MASK, best_idx, mask); if (router_score_better(other_score, other_idx, best_score, best_idx)) { best_score = other_score; best_prob = other_prob; @@ -5198,7 +5208,7 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode out->ptr, CUDA_R_32F, (int)out_dim, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); if (st == CUBLAS_STATUS_SUCCESS) return 1; fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st); @@ -5440,7 +5450,7 @@ extern "C" int ds4_gpu_matmul_f16_tensor(ds4_gpu_tensor *out, const void *model_ out->ptr, CUDA_R_32F, (int)out_dim, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); return cublas_ok(st, "f16 matmul"); } @@ -6733,7 +6743,7 @@ extern "C" int ds4_gpu_attention_output_q8_batch_tensor( (int)rank, (long long)rank * n_tokens, (int)n_groups, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); if (!cublas_ok(st, "attention output a gemm")) return 0; attention_unpack_group_low_kernel<<<(low_tmp_count + 255) / 256, 256>>>( @@ -7395,7 +7405,7 @@ __device__ static void dev_dot_q2_K_q8_K_block8( __device__ static float half_warp_sum_f32(float v, uint32_t lane16) { uint32_t mask = 0xffffu << (threadIdx.x & 16u); for (int offset = 8; offset > 0; offset >>= 1) { - v += __shfl_down_sync(mask, v, offset, 16); + v += __shfl_down_sync(static_cast(mask), v, offset, 16); } (void)lane16; return v; @@ -7404,7 +7414,7 @@ __device__ static float half_warp_sum_f32(float v, uint32_t lane16) { __device__ static float quarter_warp_sum_f32(float v, uint32_t lane8) { uint32_t mask = 0xffu << (threadIdx.x & 24u); for (int offset = 4; offset > 0; offset >>= 1) { - v += __shfl_down_sync(mask, v, offset, 8); + v += __shfl_down_sync(static_cast(mask), v, offset, 8); } (void)lane8; return v; diff --git a/ds4_rocm.h b/ds4_rocm.h new file mode 100644 index 000000000..0400910df --- /dev/null +++ b/ds4_rocm.h @@ -0,0 +1,115 @@ +#pragma once + +#include +#include +#include +#include + +#define cudaError_t hipError_t +#define cudaStream_t hipStream_t +#define cudaEvent_t hipEvent_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaMemLocation hipMemLocation + +#define cudaSuccess hipSuccess +#define cudaErrorNotSupported hipErrorNotSupported +#define cudaErrorInvalidValue hipErrorInvalidValue +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString + +#define cudaGetDevice hipGetDevice +#define cudaSetDevice hipSetDevice +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDevAttrPageableMemoryAccess hipDeviceAttributePageableMemoryAccess +#define cudaMemLocationTypeDevice hipMemLocationTypeDevice + +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaMallocManaged hipMallocManaged +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaMemset hipMemset +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemGetInfo hipMemGetInfo +#define cudaMemsetAsync hipMemsetAsync + +#define cudaHostRegister hipHostRegister +#define cudaHostUnregister hipHostUnregister +#define cudaHostGetDevicePointer hipHostGetDevicePointer +#define cudaHostRegisterMapped hipHostRegisterMapped +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly + +#define cudaMemAdvise(p1, p2, p3, p4) hipMemAdvise(p1, p2, p3, p4.id) +#define cudaMemPrefetchAsync(devPtr, count, location, flags, stream) hipMemPrefetchAsync(devPtr, count, location.id, stream) +#define cudaMemAdviseSetReadMostly hipMemAdviseSetReadMostly +#define cudaMemAdviseSetPreferredLocation hipMemAdviseSetPreferredLocation + +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamNonBlocking hipStreamNonBlocking + +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaEventDisableTiming hipEventDisableTiming + +#define cublasHandle_t hipblasHandle_t +#define cublasStatus_t hipblasStatus_t +#define cublasMath_t hipblasMath_t + +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_DEFAULT_MATH HIPBLAS_DEFAULT_MATH +#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F +#define CUBLAS_TF32_TENSOR_OP_MATH HIPBLAS_TF32_TENSOR_OP_MATH +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F + +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasSetMathMode hipblasSetMathMode +#define cublasSgemm hipblasSgemm +#define cublasSgemmStridedBatched hipblasSgemmStridedBatched +#define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx + +static __device__ __forceinline__ int32_t __vcmpne4(uint32_t a, uint32_t b) { + // For each byte: 0xFF if a != b, 0x00 if a == b + uint32_t diff = a ^ b; + // Spread any set bit in each byte to fill the whole byte + diff |= (diff >> 1); diff |= (diff >> 2); diff |= (diff >> 4); + diff &= 0x01010101u; + diff *= 0xFFu; // 0x01 -> 0xFF per byte + return (int32_t)diff; +} + +static __device__ __forceinline__ int32_t __vsub4(int32_t a, int32_t b) { + // Per-byte subtraction (wrapping, not saturating) + uint32_t ua = (uint32_t)a, ub = (uint32_t)b; + // Trick: subtract bytes in parallel avoiding cross-byte borrows + uint32_t diff = ((ua | 0x80808080u) - (ub & 0x7F7F7F7Fu)) ^ ((ua ^ ~ub) & 0x80808080u); + return (int32_t)diff; +} + +// __dp4a: dot product of 4 signed int8s packed in an int32 +static __device__ __forceinline__ int32_t __dp4a(int32_t a, int32_t b, int32_t c) { + const int8_t *a_bytes = reinterpret_cast(&a); + const int8_t *b_bytes = reinterpret_cast(&b); + return c + (int32_t)a_bytes[0] * b_bytes[0] + + (int32_t)a_bytes[1] * b_bytes[1] + + (int32_t)a_bytes[2] * b_bytes[2] + + (int32_t)a_bytes[3] * b_bytes[3]; +} + diff --git a/ds4_win.h b/ds4_win.h new file mode 100644 index 000000000..bc3009663 --- /dev/null +++ b/ds4_win.h @@ -0,0 +1,232 @@ +/* ds4_win.h — minimal POSIX compatibility layer for native Windows builds. + * + * Provides just the POSIX surface ds4.c relies on that MinGW/UCRT lacks: + * - mmap / munmap / madvise (read-only file mappings) + * - sysconf(_SC_NPROCESSORS_ONLN / _SC_PAGESIZE) + * - flock / fcntl(F_SETFD,FD_CLOEXEC) / pread / ftruncate / dprintf (instance lock) + * - fmemopen (fixed-buffer "wb"/"rb", temp-file backed with copy-back on close) + * + * Header-only, self-contained, no third-party deps. The whole body is guarded by + * _WIN32, so this header is inert on POSIX platforms. ds4.c includes it in place + * of (and the other POSIX-only surface) behind #ifdef _WIN32, so the + * native MinGW-w64 CPU build needs no extra include/search-path flags. MinGW + * already provides pthread, clock_gettime and ftruncate. + */ +#ifndef DS4_WIN_H +#define DS4_WIN_H + +#ifdef _WIN32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ---- mmap ---------------------------------------------------------------- */ +#define PROT_NONE 0x0 +#define PROT_READ 0x1 +#define PROT_WRITE 0x2 +#define PROT_EXEC 0x4 +#define MAP_SHARED 0x01 +#define MAP_PRIVATE 0x02 +#define MAP_FAILED ((void *)-1) +#define POSIX_MADV_NORMAL 0 +#define POSIX_MADV_RANDOM 1 +#define POSIX_MADV_SEQUENTIAL 2 +#define POSIX_MADV_WILLNEED 3 +#define POSIX_MADV_DONTNEED 4 +#define MADV_WILLNEED POSIX_MADV_WILLNEED + +#ifndef _SC_PAGESIZE +#define _SC_PAGESIZE 0x1 +#endif +#ifndef _SC_NPROCESSORS_ONLN +#define _SC_NPROCESSORS_ONLN 0x2 +#endif + +/* ---- file locking / fd flags -------------------------------------------- */ +#ifndef F_SETFD +#define F_SETFD 2 +#endif +#ifndef FD_CLOEXEC +#define FD_CLOEXEC 1 +#endif +#define LOCK_SH 1 +#define LOCK_EX 2 +#define LOCK_NB 4 +#define LOCK_UN 8 + +static inline void *mmap(void *addr, size_t length, int prot, int flags, + int fd, long long offset) +{ + (void)addr; (void)flags; (void)prot; + HANDLE fh = (HANDLE)_get_osfhandle(fd); + if (fh == INVALID_HANDLE_VALUE) { errno = EBADF; return MAP_FAILED; } + HANDLE mh = CreateFileMappingA(fh, NULL, PAGE_READONLY, 0, 0, NULL); + if (mh == NULL) { errno = ENOMEM; return MAP_FAILED; } + DWORD off_hi = (DWORD)((uint64_t)offset >> 32); + DWORD off_lo = (DWORD)((uint64_t)offset & 0xFFFFFFFFu); + void *p = MapViewOfFile(mh, FILE_MAP_READ, off_hi, off_lo, length); + CloseHandle(mh); /* view keeps the section alive */ + if (p == NULL) { errno = ENOMEM; return MAP_FAILED; } + return p; +} + +static inline int munmap(void *addr, size_t length) +{ + (void)length; + return UnmapViewOfFile(addr) ? 0 : -1; +} + +static inline int posix_madvise(void *addr, size_t length, int advice) +{ + (void)addr; (void)length; (void)advice; + return 0; /* advisory only */ +} +static inline int madvise(void *addr, size_t length, int advice) +{ + return posix_madvise(addr, length, advice); +} + +static inline long sysconf(int name) +{ + SYSTEM_INFO si; + GetSystemInfo(&si); + if (name == _SC_NPROCESSORS_ONLN) return (long)si.dwNumberOfProcessors; + if (name == _SC_PAGESIZE) return (long)si.dwPageSize; + errno = EINVAL; + return -1; +} + +static inline int flock(int fd, int op) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; } + OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); + if (op & LOCK_UN) { + return UnlockFileEx(h, 0, MAXDWORD, MAXDWORD, &ov) ? 0 : -1; + } + DWORD f = 0; + if (op & LOCK_EX) f |= LOCKFILE_EXCLUSIVE_LOCK; + if (op & LOCK_NB) f |= LOCKFILE_FAIL_IMMEDIATELY; + if (!LockFileEx(h, f, 0, MAXDWORD, MAXDWORD, &ov)) { + errno = (GetLastError() == ERROR_LOCK_VIOLATION) ? EWOULDBLOCK : EACCES; + return -1; + } + return 0; +} + +static inline int fcntl(int fd, int cmd, ...) +{ + (void)fd; (void)cmd; + return 0; /* F_SETFD/FD_CLOEXEC is a no-op: Windows handles aren't inherited by default */ +} + +static inline long long ds4_pread(int fd, void *buf, size_t count, long long offset) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) { errno = EBADF; return -1; } + OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); + ov.Offset = (DWORD)((uint64_t)offset & 0xFFFFFFFFu); + ov.OffsetHigh = (DWORD)((uint64_t)offset >> 32); + DWORD got = 0; + if (!ReadFile(h, buf, (DWORD)count, &got, &ov)) { + if (GetLastError() == ERROR_HANDLE_EOF) return 0; + errno = EIO; return -1; + } + return (long long)got; +} +#define pread(fd, buf, count, offset) ds4_pread((fd), (buf), (size_t)(count), (long long)(offset)) + +/* ftruncate is already provided by MinGW . */ + +static inline int dprintf(int fd, const char *fmt, ...) +{ + char buf[512]; + va_list ap; va_start(ap, fmt); + int n = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + if (n < 0) return -1; + if (n > (int)sizeof(buf)) n = (int)sizeof(buf); + return _write(fd, buf, (unsigned)n); +} + +/* ---- fmemopen (temp-file backed, fixed buffer) --------------------------- */ +typedef struct { FILE *fp; void *buf; size_t cap; int writeback; } ds4_memstream; +#define DS4_MEMSTREAM_MAX 16 +static ds4_memstream ds4_ms_tab[DS4_MEMSTREAM_MAX]; +static CRITICAL_SECTION ds4_ms_cs; +static volatile LONG ds4_ms_init = 0; + +static inline void ds4_ms_ensure(void) +{ + if (InterlockedCompareExchange(&ds4_ms_init, 1, 0) == 0) + InitializeCriticalSection(&ds4_ms_cs); +} + +static inline FILE *ds4_tmpfile(void) +{ + char dir[MAX_PATH], path[MAX_PATH]; + if (!GetTempPathA(sizeof(dir), dir)) return NULL; + if (!GetTempFileNameA(dir, "ds4", 0, path)) return NULL; + /* open read/write, delete on close */ + return fopen(path, "wb+TD"); /* T=temporary, D=delete-on-close (MSVCRT ext) */ +} + +static inline FILE *fmemopen(void *buf, size_t size, const char *mode) +{ + ds4_ms_ensure(); + int writing = (mode && (strchr(mode, 'w') || strchr(mode, 'a') || strchr(mode, '+'))); + FILE *fp = ds4_tmpfile(); + if (!fp) return NULL; + if (!writing && buf && size) { + if (fwrite(buf, 1, size, fp) != size) { fclose(fp); return NULL; } + rewind(fp); + } + EnterCriticalSection(&ds4_ms_cs); + for (int i = 0; i < DS4_MEMSTREAM_MAX; i++) { + if (ds4_ms_tab[i].fp == NULL) { + ds4_ms_tab[i].fp = fp; ds4_ms_tab[i].buf = buf; + ds4_ms_tab[i].cap = size; ds4_ms_tab[i].writeback = writing ? 1 : 0; + break; + } + } + LeaveCriticalSection(&ds4_ms_cs); + return fp; +} + +static inline int ds4_win_fclose(FILE *fp) +{ + if (fp && ds4_ms_init) { + EnterCriticalSection(&ds4_ms_cs); + for (int i = 0; i < DS4_MEMSTREAM_MAX; i++) { + if (ds4_ms_tab[i].fp == fp) { + if (ds4_ms_tab[i].writeback && ds4_ms_tab[i].buf && ds4_ms_tab[i].cap) { + fflush(fp); rewind(fp); + fread(ds4_ms_tab[i].buf, 1, ds4_ms_tab[i].cap, fp); /* copy back */ + } + ds4_ms_tab[i].fp = NULL; ds4_ms_tab[i].buf = NULL; + ds4_ms_tab[i].cap = 0; ds4_ms_tab[i].writeback = 0; + break; + } + } + LeaveCriticalSection(&ds4_ms_cs); + } + return fclose(fp); /* real fclose — macro defined only after this header */ +} + +#endif /* _WIN32 */ + +/* Redirect fclose AFTER all helpers above so ds4_win_fclose's own call hits the + * real fclose. Source files including this header get the memory-stream-aware one. */ +#ifdef _WIN32 +#define fclose(fp) ds4_win_fclose(fp) +#endif + +#endif /* DS4_WIN_H */ diff --git a/win/README.md b/win/README.md new file mode 100644 index 000000000..12c7f3232 --- /dev/null +++ b/win/README.md @@ -0,0 +1,69 @@ +# Native Windows build (experimental) + +DS4 assumes a POSIX environment. `ds4_win.h` (top level) is a small, +dependency-free compatibility shim that lets the **CPU backend** build with +native MinGW-w64 GCC (no WSL, no Cygwin, no MSVC). It supplies the POSIX surface +MinGW/UCRT lacks: `mmap`/`munmap`/`madvise`, `sysconf`, `flock`/`fcntl`/`pread`/ +`dprintf`, and a temp-file-backed `fmemopen`. MinGW already provides `pthread`, +`clock_gettime`, and `ftruncate`. + +The shim is wired in-tree: `ds4.c` includes `ds4_win.h` in place of +`` (and the other POSIX-only headers) behind `#ifdef _WIN32`. The +header's entire body is guarded by `_WIN32`, so POSIX builds are byte-for-byte +unchanged. No special include/search-path flags are needed. + +## Build `ds4-bench.exe` (CPU) + +### With make (MinGW/MSYS) + +```sh +make windows-cpu +``` + +`uname -s` on MinGW/MSYS reports `MINGW64_NT*` / `MSYS_NT*`; the Makefile detects +this and selects a Windows branch that defaults `CC` to `gcc`. + +### Direct gcc (no make) + +```sh +CF="-O3 -ffast-math -march=native -std=c99 -D_GNU_SOURCE -fno-finite-math-only \ + -DDS4_NO_GPU -D_CRT_SECURE_NO_WARNINGS" +gcc $CF -c ds4.c -o ds4_cpu.o +gcc $CF -c ds4_bench.c -o ds4_bench_cpu.o +gcc $CF -o ds4-bench.exe ds4_bench_cpu.o ds4_cpu.o -lm +``` + +Toolchain used: `x86_64-w64-mingw32` GCC 15.2.0. + +### Verify it runs + +```sh +$ ./ds4-bench.exe +ds4-bench: specify exactly one of --prompt-file or --chat-prompt-file +``` + +(Exit code 2 — argument validation fires, proving the binary executes.) + +## Status + +| Target | Native Windows CPU build | Notes | +|-------------|--------------------------|-------| +| `ds4-bench` | builds & runs | no terminal/socket deps | +| `ds4` (CLI) | not yet | `linenoise.c` uses POSIX `termios`, plus `sigaction` in `ds4_cli.c` (needs Win console raw-mode port) | +| `ds4-server`| not yet | BSD sockets / `poll` / `arpa/inet.h` (needs Winsock port) | + +## Runtime note + +Set `DS4_LOCK_FILE` to a Windows path (the default is `/tmp/ds4.lock`, which does +not exist on Windows) before running, e.g.: + +```sh +export DS4_LOCK_FILE="$TEMP/ds4.lock" +``` + +## Deferred work + +- **CLI (`ds4`)**: port `linenoise.c` to the Windows console (raw mode via + `SetConsoleMode`) and replace the `sigaction`/`SIGINT` handling in `ds4_cli.c`. +- **Server (`ds4-server`)**: port the BSD sockets / `poll(2)` event loop to + Winsock 2 (`WSAStartup`, `WSAPoll`, `closesocket`).