From 21a54de18fa1ad2f77fdb431165e7a7d12a51f3e Mon Sep 17 00:00:00 2001
From: kmc6042 <kmc6042@g.skku.edu>
Date: Mon, 1 Jun 2026 02:28:25 +0900
Subject: [PATCH] cuda: add DS4_CUDA_MANAGED env var for full UMA pool access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add DS4_CUDA_MANAGED=1 environment variable that switches
ds4_gpu_tensor_alloc from cudaMalloc (VRAM carve-out only) to
cudaMallocManaged (full unified memory pool).

This is critical for UMA platforms like AMD Strix Halo where the
BIOS VRAM carve-out (e.g. 96 GB) is smaller than physical memory
(128 GB). Without this, context buffers are limited to the BIOS
carve-out, capping usable context at ~870K when the model weights
occupy ~81 GB. With managed memory, the full 128 GB pool is
available, enabling 1M-token context.

The change is opt-in and zero-overhead when the env var is unset.

Also fix two rsqrtf() calls to 1.0f/sqrtf() for ROCm
compatibility.

Tested on Strix Halo (Ryzen AI MAX+ 395, gfx1151):
  DS4_CUDA_MANAGED=1 ./ds4-server --ctx 1000000
  → context buffers 17222.50 MiB, server starts successfully
  → 1M context chat completion: 8.71 t/s, correct output

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ds4_cuda.cu | 23 ++++++++++++++++++-----
 ds4_rocm.h  |  1 +
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 8b6241ca3..8f467b63c 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -1302,9 +1302,22 @@ extern "C" ds4_gpu_tensor *ds4_gpu_tensor_alloc(uint64_t bytes) {
     if (bytes == 0) bytes = 1;
     ds4_gpu_tensor *t = (ds4_gpu_tensor *)calloc(1, sizeof(*t));
     if (!t) return NULL;
-    if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) {
-        free(t);
-        return NULL;
+
+    if (getenv("DS4_CUDA_MANAGED") != NULL) {
+        /* Use cudaMallocManaged with cudaMemAttachGlobal so the allocation
+         * is GPU-accessible across all streams.  On UMA platforms (Strix
+         * Halo, Grace-Hopper) this allocates from the full unified pool,
+         * bypassing the BIOS VRAM carve-out. */
+        if (!cuda_ok(cudaMallocManaged(&t->ptr, (size_t)bytes, cudaMemAttachGlobal),
+                     "managed tensor alloc")) {
+            free(t);
+            return NULL;
+        }
+    } else {
+        if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) {
+            free(t);
+            return NULL;
+        }
     }
     t->bytes = bytes;
     t->owner = 1;
@@ -6168,7 +6181,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads,
         if (!tmp) return 0;
         float *scores = tmp;
         float *out_tmp = (float *)((char *)tmp + out_offset);
-        const float alpha = rsqrtf((float)head_dim);
+        const float alpha = 1.0f / sqrtf((float)head_dim);
         const float beta = 0.0f;
         cublasStatus_t st = cublasSgemmStridedBatched(g_cublas,
                                                       CUBLAS_OP_T,
@@ -6538,7 +6551,7 @@ static int attention_prefill_mixed_launch(
                 n_comp,
                 head_dim);
         if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0;
-        const float alpha = rsqrtf((float)head_dim);
+        const float alpha = 1.0f / sqrtf((float)head_dim);
         const float beta = 0.0f;
         cublasStatus_t st = cublasSgemmStridedBatched(g_cublas,
                                                       CUBLAS_OP_T,
diff --git a/ds4_rocm.h b/ds4_rocm.h
index 0400910df..55907c428 100644
--- a/ds4_rocm.h
+++ b/ds4_rocm.h
@@ -13,6 +13,7 @@
 
 #define cudaSuccess hipSuccess
 #define cudaErrorNotSupported hipErrorNotSupported
+#define cudaMemAttachGlobal hipMemAttachGlobal
 #define cudaErrorInvalidValue hipErrorInvalidValue
 #define cudaGetLastError hipGetLastError
 #define cudaGetErrorString hipGetErrorString