From 835acb79943faa702389ad18eb7e4156105d730a Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Wed, 8 Apr 2026 18:34:05 -0400
Subject: [PATCH 1/5] Prevent the q8_1 block sum (d * sum of quants) from
 overflowing the fp16 range

---
 ggml/src/ggml-cpu/arch/x86/quants.c | 14 ++++++++++++--
 ggml/src/ggml-quants.c              |  4 +++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index 74d699f633d3..dc93a20270c6 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -437,7 +437,12 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__AVX2__)
         // Compute the sum of the quants and set y[i].s
-        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+        // clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
+        {
+            float s_val = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
+            s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val));
+            y[i].s = GGML_CPU_FP32_TO_FP16(s_val);
+        }
 
         // Convert int32 to int16
         i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
@@ -465,9 +470,14 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         __m128i ni7 = _mm256_extractf128_si256( i3, 1);
 
         // Compute the sum of the quants and set y[i].s
+        // clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
         const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
         const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
+        {
+            float s_val = d * hsum_i32_4(_mm_add_epi32(s0, s1));
+            s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val));
+            y[i].s = GGML_CPU_FP32_TO_FP16(s_val);
+        }
 
         // Convert int32 to int16
         ni0 = _mm_packs_epi32( ni0, ni1 );
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 48695a61ea33..04a1afb82893 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -253,7 +253,9 @@ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_REST
             sum += y[i].qs[QK8_1/2 + j];
         }
 
-        y[i].s = GGML_FP32_TO_FP16(sum*d);
+        // clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
+        const float s_val = sum*d;
+        y[i].s = GGML_FP32_TO_FP16(fminf(65504.0f, fmaxf(-65504.0f, s_val)));
     }
 }
 

From 48f1d71a536503d6668324eb8c799bb81c0fc128 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Wed, 8 Apr 2026 22:34:32 -0400
Subject: [PATCH 2/5] Add CUDA overflow protection

---
 ggml/src/ggml-cuda/quantize.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 4300ffc148cf..f74f473c6660 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -44,7 +44,8 @@ static __global__ void quantize_q8_1(
         return;
     }
 
-    y[ib].ds = make_half2(d, sum);
+    // clamp sum to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
+    y[ib].ds = make_half2(d, fminf(65504.0f, fmaxf(-65504.0f, sum)));
 }
 
 __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
@@ -264,7 +265,8 @@ static __global__ void quantize_mmq_q8_1(
     const float d = 1.0f / d_inv;
 
     if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
-        y[ib].ds4[iqs/32] = make_half2(d, sum);
+        // clamp sum to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
+        y[ib].ds4[iqs/32] = make_half2(d, fminf(65504.0f, fmaxf(-65504.0f, sum)));
     } else {
         y[ib].d4[iqs/32]  = d;
     }

From a5ca8d6708bfae675a49bc484c8303fd29fd36c3 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Tue, 21 Apr 2026 13:40:03 -0400
Subject: [PATCH 3/5] Clamp d and sum in CUDA q8_1 path to valid f16 range

---
 ggml/src/ggml-cuda/quantize.cu | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index f74f473c6660..3cdb23ce1c6e 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -44,8 +44,10 @@ static __global__ void quantize_q8_1(
         return;
     }
 
-    // clamp sum to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
-    y[ib].ds = make_half2(d, fminf(65504.0f, fmaxf(-65504.0f, sum)));
+    // clamp d and sum to fp16 range to avoid inf from large activations
+    const float d_clamped   = fminf(d, 65504.0f);
+    const float sum_clamped = fminf(fmaxf(sum, -65504.0f), 65504.0f);
+    y[ib].ds = make_half2(d_clamped, sum_clamped);
 }
 
 __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
@@ -265,8 +267,10 @@ static __global__ void quantize_mmq_q8_1(
     const float d = 1.0f / d_inv;
 
     if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
-        // clamp sum to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
-        y[ib].ds4[iqs/32] = make_half2(d, fminf(65504.0f, fmaxf(-65504.0f, sum)));
+        // clamp d and sum to fp16 range to avoid inf from large activations
+        const float d_clamped   = fminf(d, 65504.0f);
+        const float sum_clamped = fminf(fmaxf(sum, -65504.0f), 65504.0f);
+        y[ib].ds4[iqs/32] = make_half2(d_clamped, sum_clamped);
     } else {
         y[ib].d4[iqs/32]  = d;
     }

From d307b8ea3bc1d6ad78d0e7c665cef8078c86807a Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Tue, 21 Apr 2026 14:00:39 -0400
Subject: [PATCH 4/5] Standardize clamp code format for ggml-quants.c

---
 ggml/src/ggml-quants.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index a1c9ef5fe27f..94939c9d207b 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -289,8 +289,11 @@ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_REST
         }
 
         // clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
-        const float s_val = sum*d;
-        y[i].s = GGML_FP32_TO_FP16(fminf(65504.0f, fmaxf(-65504.0f, s_val)));
+        {
+            float s_val = sum * d;
+            s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val))
+            y[i].s = GGML_FP32_TO_FP16(s_val);
+        }
     }
 }
 

From 49fcb4e797e508f85bd38dca0a3cd672bbb6d0bb Mon Sep 17 00:00:00 2001
From: Bartowski <3266127+bartowski1182@users.noreply.github.com>
Date: Thu, 30 Apr 2026 15:44:32 -0400
Subject: [PATCH 5/5] Add missing semicolon in quantize_row_q8_1_ref clamp

---
 ggml/src/ggml-quants.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 94939c9d207b..7f0c71061e7f 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -291,7 +291,7 @@ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_REST
         // clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
         {
             float s_val = sum * d;
-            s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val))
+            s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val));
             y[i].s = GGML_FP32_TO_FP16(s_val);
         }
     }