From 21a54de18fa1ad2f77fdb431165e7a7d12a51f3e Mon Sep 17 00:00:00 2001 From: kmc6042 Date: Mon, 1 Jun 2026 02:28:25 +0900 Subject: [PATCH] cuda: add DS4_CUDA_MANAGED env var for full UMA pool access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add DS4_CUDA_MANAGED=1 environment variable that switches ds4_gpu_tensor_alloc from cudaMalloc (VRAM carve-out only) to cudaMallocManaged (full unified memory pool). This is critical for UMA platforms like AMD Strix Halo where the BIOS VRAM carve-out (e.g. 96 GB) is smaller than physical memory (128 GB). Without this, context buffers are limited to the BIOS carve-out, capping usable context at ~870K when the model weights occupy ~81 GB. With managed memory, the full 128 GB pool is available, enabling 1M-token context. The change is opt-in and zero-overhead when the env var is unset. Also fix two rsqrtf() calls to 1.0f/sqrtf() for ROCm compatibility. Tested on Strix Halo (Ryzen AI MAX+ 395, gfx1151): DS4_CUDA_MANAGED=1 ./ds4-server --ctx 1000000 → context buffers 17222.50 MiB, server starts successfully → 1M context chat completion: 8.71 t/s, correct output Co-Authored-By: Claude Opus 4.8 --- ds4_cuda.cu | 23 ++++++++++++++++++----- ds4_rocm.h | 1 + 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 8b6241ca3..8f467b63c 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -1302,9 +1302,22 @@ extern "C" ds4_gpu_tensor *ds4_gpu_tensor_alloc(uint64_t bytes) { if (bytes == 0) bytes = 1; ds4_gpu_tensor *t = (ds4_gpu_tensor *)calloc(1, sizeof(*t)); if (!t) return NULL; - if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) { - free(t); - return NULL; + + if (getenv("DS4_CUDA_MANAGED") != NULL) { + /* Use cudaMallocManaged with cudaMemAttachGlobal so the allocation + * is GPU-accessible across all streams. On UMA platforms (Strix + * Halo, Grace-Hopper) this allocates from the full unified pool, + * bypassing the BIOS VRAM carve-out. */ + if (!cuda_ok(cudaMallocManaged(&t->ptr, (size_t)bytes, cudaMemAttachGlobal), + "managed tensor alloc")) { + free(t); + return NULL; + } + } else { + if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) { + free(t); + return NULL; + } } t->bytes = bytes; t->owner = 1; @@ -6168,7 +6181,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads, if (!tmp) return 0; float *scores = tmp; float *out_tmp = (float *)((char *)tmp + out_offset); - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -6538,7 +6551,7 @@ static int attention_prefill_mixed_launch( n_comp, head_dim); if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0; - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, diff --git a/ds4_rocm.h b/ds4_rocm.h index 0400910df..55907c428 100644 --- a/ds4_rocm.h +++ b/ds4_rocm.h @@ -13,6 +13,7 @@ #define cudaSuccess hipSuccess #define cudaErrorNotSupported hipErrorNotSupported +#define cudaMemAttachGlobal hipMemAttachGlobal #define cudaErrorInvalidValue hipErrorInvalidValue #define cudaGetLastError hipGetLastError #define cudaGetErrorString hipGetErrorString