diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index 0a3e071e57c9..8a39678ba9ba 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -449,7 +449,12 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i #if defined(__AVX2__) // Compute the sum of the quants and set y[i].s - y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + // clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products + { + float s_val = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val)); + y[i].s = GGML_CPU_FP32_TO_FP16(s_val); + } // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 @@ -477,9 +482,14 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i __m128i ni7 = _mm256_extractf128_si256( i3, 1); // Compute the sum of the quants and set y[i].s + // clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); + { + float s_val = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val)); + y[i].s = GGML_CPU_FP32_TO_FP16(s_val); + } // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 4300ffc148cf..3cdb23ce1c6e 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -44,7 +44,10 @@ static __global__ void quantize_q8_1( return; } - y[ib].ds = make_half2(d, sum); + // clamp d and sum to f16 range to avoid inf from large activations + const float d_clamped = fminf(d, 65504.0f); + const float sum_clamped = fminf(fmaxf(sum, -65504.0f), 65504.0f); + y[ib].ds = make_half2(d_clamped, sum_clamped); } __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) { @@ -264,7 +267,10 @@ static __global__ void quantize_mmq_q8_1( const float d = 1.0f / d_inv; if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) { - y[ib].ds4[iqs/32] = make_half2(d, sum); + // clamp d and sum to f16 range to avoid inf from large activations + const float d_clamped = fminf(d, 65504.0f); + const float sum_clamped = fminf(fmaxf(sum, -65504.0f), 65504.0f); + y[ib].ds4[iqs/32] = make_half2(d_clamped, sum_clamped); } else { y[ib].d4[iqs/32] = d; } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 15443aa554a4..7f0c71061e7f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -288,7 +288,12 @@ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_REST sum += y[i].qs[QK8_1/2 + j]; } - y[i].s = GGML_FP32_TO_FP16(sum*d); + // clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products + { + float s_val = sum * d; + s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val)); + y[i].s = GGML_FP32_TO_FP16(s_val); + } } }