diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index 94b19b82bbc2..2bae1ba248bd 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -448,8 +448,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i __m256i i3 = _mm256_cvtps_epi32( v3 ); #if defined(__AVX2__) - // Compute the sum of the quants and set y[i].s - y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + // Store sum as bf16 in the fp16 slot to preserve full fp32 range + // (avoids overflow when used in Q4_1/Q5_1/Q4_K/Q5_K dot products with large activations) + y[i].s = GGML_FP32_TO_BF16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))).bits; // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 @@ -476,10 +477,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i __m128i ni6 = _mm256_castsi256_si128( i3 ); __m128i ni7 = _mm256_extractf128_si256( i3, 1); - // Compute the sum of the quants and set y[i].s + // Store sum as bf16 in the fp16 slot to preserve full fp32 range + // (avoids overflow when used in Q4_1/Q5_1/Q4_K/Q5_K dot products with large activations) const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); + y[i].s = GGML_FP32_TO_BF16(d * hsum_i32_4(_mm_add_epi32(s0, s1))).bits; // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); @@ -883,7 +885,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d); - summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + // y[ib].s holds bf16 bits (see quantize_row_q8_1); decode as bf16 to preserve full fp32 range. + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s }); const __m256 d0v = _mm256_set1_ps( d0 ); const __m256 d1v = _mm256_set1_ps( d1 ); @@ -1108,7 +1111,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi for (; ib < nb; ++ib) { const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); - summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + // y[ib].s holds bf16 bits (see quantize_row_q8_1); decode as bf16 to preserve full fp32 range. + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s }); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -1135,7 +1139,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi for (; ib < nb; ++ib) { const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); - summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + // y[ib].s holds bf16 bits (see quantize_row_q8_1); decode as bf16 to preserve full fp32 range. + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s }); __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); const __m256i bxhi = bytes_from_bits_32(x[ib].qh); diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index e5f9a4083f9c..475d6cc9e21a 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -238,7 +238,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + // y[ib].s holds bf16 bits (see quantize_row_q8_1_ref); decode as bf16 to preserve full fp32 range. + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s }); } *s = sumf; @@ -391,7 +392,8 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + // y[ib].s holds bf16 bits (see quantize_row_q8_1_ref); decode as bf16 to preserve full fp32 range. + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s }); } *s = sumf; diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 10817505d9f0..c07434830830 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -909,6 +909,21 @@ static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 return make_uint2(div_val, mod_val); } +// CUDA-only bf16 sibling of block_q8_1. Identical byte layout, but the (d, s) pair +// is stored as bf16 instead of fp16 so that s = d * sum(qs) cannot overflow the +// 16-bit exponent range when activations contain large outliers (Q4_1, Q5_1, +// Q4_K and Q5_K dot products multiply by s and would otherwise produce NaN). +// CUDA quantizes Q8_1 activations on-device, so this struct never crosses the +// CPU/GPU boundary; the host-side block_q8_1 in ggml-common.h is unaffected. +// (No d/s union view: nv_bfloat16 has a non-trivial constructor, which C++ +// disallows in anonymous structs/unions. All call sites use ds directly.) +struct block_q8_1_bf16 { + nv_bfloat162 ds; + int8_t qs[QK8_1]; +}; + +static_assert(sizeof(block_q8_1_bf16) == sizeof(block_q8_1), "block_q8_1_bf16 must match block_q8_1 byte layout"); + typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v); static __device__ __forceinline__ float get_alibi_slope( diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index edf546d8f1e2..6737eafad799 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -54,9 +54,26 @@ struct block_fp4_mmq { int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte) }; -static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size"); -static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size"); -static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected block_fp4_mmq size"); +// CUDA-only bf16 sibling of block_q8_1_mmq. Identical byte layout; the DS4 +// layout (1 scale + 1 partial sum per 32 values) holds bf16 pairs instead of +// fp16 pairs to keep the partial sum within range for Q4_1/Q5_1/Q4_K/Q5_K +// dot products. The D4 (fp32) and D2S6 (fp16, Q2_K) layouts are unchanged. +// The union is named (.u) because nv_bfloat16 carries a non-trivial constructor +// in cuda_bf16.h, which C++ disallows in anonymous aggregates. +struct block_q8_1_mmq_bf16 { + union { + float d4[4]; + nv_bfloat162 ds4[4]; + half d2s6[8]; + } u; + + int8_t qs[4 * QK8_1]; +}; + +static_assert(sizeof(block_q8_1_mmq) == 4 * QK8_1 + 4 * sizeof(half2), "Unexpected block_q8_1_mmq size"); +static_assert(sizeof(block_q8_1_mmq) == 4 * sizeof(block_q8_1), "Unexpected block_q8_1_mmq size"); +static_assert(sizeof(block_q8_1_mmq_bf16) == sizeof(block_q8_1_mmq), "Unexpected block_q8_1_mmq_bf16 size"); +static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected block_fp4_mmq size"); static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { switch (type_x) { @@ -463,12 +480,12 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y); - const int * x_qs = (const int *) x; - const float * x_df = (const float *) x_qs + txs.qs; - const int * y_qs = (const int *) y + 4; - const half2 * y_ds = (const half2 *) y; + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + txs.qs; + const int * y_qs = (const int *) y + 4; + const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y; -// #pragma unroll + // #pragma unroll for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_0*VDR_Q4_0_Q8_1_MMQ) { const int k0 = k00 + k01; @@ -574,12 +591,12 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y); - const int * x_qs = (const int *) x; - const half2 * x_dm = (const half2 *) x_qs + txs.qs; - const int * y_qs = (const int *) y + 4; - const half2 * y_ds = (const half2 *) y; + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + txs.qs; + const int * y_qs = (const int *) y + 4; + const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y; -// #pragma unroll + // #pragma unroll for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_1*VDR_Q4_1_Q8_1_MMQ) { const int k0 = k00 + k01; @@ -1170,11 +1187,11 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); - const int * x_qs = (const int *) x; - const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K; - const int * y_qs = (const int *) y + 4; - const float * y_df = (const float *) y; - const half2 * y_ds = (const half2 *) y; + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + 2 * MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y; const int i0 = (threadIdx.y / ntx) * rows_per_warp; @@ -1197,7 +1214,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) { dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1]; } else { - dB = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); + dB = ggml_cuda_cast(y_ds[j * MMQ_TILE_Y_K + k01 / QI8_1]).x; } #pragma unroll @@ -1225,11 +1242,11 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); - const int * x_qs = (const int *) x; - const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K; - const int * y_qs = (const int *) y + 4; - const float * y_df = (const float *) y; - const half2 * y_ds = (const half2 *) y; + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + 2 * MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y; tile_A A[ntx][MMQ_TILE_NE_K/QI8_0]; float dA[ntx][tile_C::ne/2][MMQ_TILE_NE_K/QI8_0]; @@ -1272,9 +1289,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int j = j0 + tile_C::get_j(l); if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) { - dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1]; + dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1]; } else { - dB[l] = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); + dB[l] = ggml_cuda_cast(y_ds[j * MMQ_TILE_Y_K + k01 / QI8_1]).x; } } @@ -1301,12 +1318,12 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y); - const int * x_qs = (const int *) x; - const half2 * x_dm = (const half2 *) x_qs + txs.qs; - const int * y_qs = (const int *) y + 4; - const half2 * y_ds = (const half2 *) y; + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + txs.qs; + const int * y_qs = (const int *) y + 4; + const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y; -// #pragma unroll + // #pragma unroll for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += VDR_Q8_0_Q8_1_MMQ) { const int k0 = k00 + k01; @@ -1341,10 +1358,10 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); - const int * x_qs = (const int *) x; - const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K; - const int * y_qs = (const int *) y + 4; - const half2 * y_dm = (const half2 *) y; + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + 2 * MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const nv_bfloat162 * y_dm = (const nv_bfloat162 *) y; const int i0 = (threadIdx.y / ntx) * rows_per_warp; @@ -1363,7 +1380,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); const int j = j0 + tile_C::get_j(0); - const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]); + const float2 dsB = ggml_cuda_cast(y_dm[j * MMQ_TILE_Y_K + k01 / QI8_1]); #pragma unroll for (int n = 0; n < ntx; ++n) { @@ -1391,10 +1408,10 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); - const int * x_qs = (const int *) x; - const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K; - const int * y_qs = (const int *) y + 4; - const half2 * y_dm = (const half2 *) y; + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + 2 * MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const nv_bfloat162 * y_dm = (const nv_bfloat162 *) y; tile_A A[ntx][MMQ_TILE_NE_K/QI8_1]; float2 dmA[ntx][tile_C::ne/2][MMQ_TILE_NE_K/QI8_1]; @@ -1436,7 +1453,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( for (int l = 0; l < tile_C::ne/2; ++l) { const int j = j0 + tile_C::get_j(l); - dsB[l] = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]); + dsB[l] = ggml_cuda_cast(y_dm[j * MMQ_TILE_Y_K + k01 / QI8_1]); } #pragma unroll @@ -2206,13 +2223,13 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y); - const int * x_qs = (const int *) x; - const half2 * x_dm = (const half2 *) x_qs + txs.qs; - const int * x_sc = (const int *) x_dm + txs.dm; - const int * y_qs = (const int *) y + 4; - const half2 * y_ds = (const half2 *) y; + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + txs.qs; + const int * x_sc = (const int *) x_dm + txs.dm; + const int * y_qs = (const int *) y + 4; + const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y; -// #pragma unroll + // #pragma unroll for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_K*VDR_Q4_K_Q8_1_MMQ) { const int k0 = k00 + k01; @@ -2363,11 +2380,11 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y); - const int * x_qs = (const int *) x; - const half2 * x_dm = (const half2 *) x_qs + txs.qs; - const int * x_sc = (const int *) x_dm + txs.dm; - const int * y_qs = (const int *) y + 4; - const half2 * y_ds = (const half2 *) y; + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + txs.qs; + const int * x_sc = (const int *) x_dm + txs.dm; + const int * y_qs = (const int *) y + 4; + const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y; // #pragma unroll for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR5_K*VDR_Q5_K_Q8_1_MMQ) { diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index da48f313a38b..3f72ac766679 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -5,7 +5,10 @@ #include -typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs); +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs); static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) { switch (type) { @@ -480,7 +483,8 @@ static __global__ void mul_mat_vec_q( float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}}; float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}}; - const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; + const block_q8_1_bf16 * y = + ((const block_q8_1_bf16 *) vy) + sample_y * stride_sample_y + channel_y * stride_channel_y; const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { @@ -628,7 +632,8 @@ static __global__ void mul_mat_vec_q_moe( const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride]; const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y); - const block_q8_1 * y = ((const block_q8_1 *) vy) + channel_y*stride_channel_y + token_idx*stride_col_y; + const block_q8_1_bf16 * y = + ((const block_q8_1_bf16 *) vy) + channel_y * stride_channel_y + token_idx * stride_col_y; const int kbx_offset = channel_x*stride_channel_x + row0*stride_row_x; // partial sum for each thread diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 52f664719ae1..b0c838d5b989 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -23,7 +23,7 @@ static __global__ void quantize_q8_1( const int64_t i_cont = ((i3*ne2.z + i2) * ne1 + i1) * ne0 + i0; - block_q8_1 * y = (block_q8_1 *) vy; + block_q8_1_bf16 * y = (block_q8_1_bf16 *) vy; const int64_t ib = i_cont / QK8_1; // block index const int64_t iqs = i_cont % QK8_1; // quant index @@ -44,7 +44,7 @@ static __global__ void quantize_q8_1( return; } - y[ib].ds = make_half2(d, sum); + y[ib].ds = nv_bfloat162{ __float2bfloat16(d), __float2bfloat16(sum) }; } __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) { @@ -294,7 +294,7 @@ static __global__ void quantize_mmq_q8_1( const float4 * x4 = (const float4 *) x; - block_q8_1_mmq * y = (block_q8_1_mmq *) vy; + block_q8_1_mmq_bf16 * y = (block_q8_1_mmq_bf16 *) vy; const int64_t ib0 = blockIdx.z*((int64_t)gridDim.x*gridDim.y*blockDim.x/QK8_1); // first block of channel const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.x; // block index in channel @@ -340,7 +340,7 @@ static __global__ void quantize_mmq_q8_1( return; } - y[ib].d2s6[2 + iqs/16] = sum; + y[ib].u.d2s6[2 + iqs / 16] = sum; if (iqs % 64 != 0) { return; @@ -348,7 +348,7 @@ static __global__ void quantize_mmq_q8_1( const float d = 1.0f / d_inv; - y[ib].d2s6[iqs/64] = d; + y[ib].u.d2s6[iqs / 64] = d; return; } @@ -360,9 +360,9 @@ static __global__ void quantize_mmq_q8_1( const float d = 1.0f / d_inv; if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) { - y[ib].ds4[iqs/32] = make_half2(d, sum); + y[ib].u.ds4[iqs / 32] = nv_bfloat162{ __float2bfloat16(d), __float2bfloat16(sum) }; } else { - y[ib].d4[iqs/32] = d; + y[ib].u.d4[iqs / 32] = d; } } diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh index d1741cc8d7ba..f8b14bd66693 100644 --- a/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ggml/src/ggml-cuda/vecdotq.cuh @@ -1,6 +1,7 @@ #pragma once #include "common.cuh" +#include "convert.cuh" #include @@ -112,9 +113,11 @@ static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) { #define VDR_Q4_0_Q8_1_MMVQ 2 #define VDR_Q4_0_Q8_1_MMQ 4 -template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( - const int * v, const int * u, const float & d4, const half2 & ds8) { - +template +static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(const int * v, + const int * u, + const float & d4, + const nv_bfloat162 & ds8) { int sumi = 0; #pragma unroll @@ -127,7 +130,7 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); } - const float2 ds8f = __half22float2(ds8); + const float2 ds8f = ggml_cuda_cast(ds8); // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); @@ -136,9 +139,11 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp #define VDR_Q4_1_Q8_1_MMVQ 2 #define VDR_Q4_1_Q8_1_MMQ 4 -template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( - const int * v, const int * u, const half2 & dm4, const half2 & ds8) { - +template +static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(const int * v, + const int * u, + const half2 & dm4, + const nv_bfloat162 & ds8) { int sumi = 0; #pragma unroll @@ -151,16 +156,10 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); } -#ifdef FAST_FP16_AVAILABLE - const float2 tmp = __half22float2(__hmul2(dm4, ds8)); - const float d4d8 = tmp.x; - const float m4s8 = tmp.y; -#else const float2 dm4f = __half22float2(dm4); - const float2 ds8f = __half22float2(ds8); + const float2 ds8f = ggml_cuda_cast(ds8); const float d4d8 = dm4f.x * ds8f.x; const float m4s8 = dm4f.y * ds8f.y; -#endif // FAST_FP16_AVAILABLE // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); @@ -169,9 +168,12 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp #define VDR_Q5_0_Q8_1_MMVQ 2 #define VDR_Q5_0_Q8_1_MMQ 4 -template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( - const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { - +template +static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(const int * vl, + const int * vh, + const int * u, + const float & d5, + const nv_bfloat162 & ds8) { int sumi = 0; #pragma unroll @@ -191,7 +193,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values } - const float2 ds8f = __half22float2(ds8); + const float2 ds8f = ggml_cuda_cast(ds8); // second part effectively subtracts 16 from each quant value return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); @@ -200,9 +202,12 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp #define VDR_Q5_1_Q8_1_MMVQ 2 #define VDR_Q5_1_Q8_1_MMQ 4 -template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( - const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { - +template +static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(const int * vl, + const int * vh, + const int * u, + const half2 & dm5, + const nv_bfloat162 & ds8) { int sumi = 0; #pragma unroll @@ -222,16 +227,10 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values } -#ifdef FAST_FP16_AVAILABLE - const float2 tmp = __half22float2(__hmul2(dm5, ds8)); - const float d5d8 = tmp.x; - const float m5s8 = tmp.y; -#else const float2 dm5f = __half22float2(dm5); - const float2 ds8f = __half22float2(ds8); + const float2 ds8f = ggml_cuda_cast(ds8); const float d5d8 = dm5f.x * ds8f.x; const float m5s8 = dm5f.y * ds8f.y; -#endif // FAST_FP16_AVAILABLE // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it return sumi*d5d8 + m5s8 / (QI5_1 / vdr); @@ -254,9 +253,11 @@ template static __device__ __forceinline__ T vec_dot_q8_0_ return d8_0*d8_1 * ((T) sumi); } -template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( - const int * v, const int * u, const half2 & dm8, const half2 & ds8) { - +template +static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(const int * v, + const int * u, + const half2 & dm8, + const nv_bfloat162 & ds8) { int sumi = 0; #pragma unroll @@ -265,16 +266,10 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp sumi = ggml_cuda_dp4a(v[i], u[i], sumi); } -#ifdef FAST_FP16_AVAILABLE - const float2 tmp = __half22float2(__hmul2(dm8, ds8)); - const float d8d8 = tmp.x; - const float m8s8 = tmp.y; -#else const float2 dm8f = __half22float2(dm8); - const float2 ds8f = __half22float2(ds8); + const float2 ds8f = ggml_cuda_cast(ds8); const float d8d8 = dm8f.x * ds8f.x; const float m8s8 = dm8f.y * ds8f.y; -#endif // FAST_FP16_AVAILABLE // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); @@ -304,9 +299,10 @@ template static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_ #define VDR_MXFP4_Q8_1_MMVQ 2 #define VDR_MXFP4_Q8_1_MMQ 4 -static __device__ __forceinline__ float vec_dot_mxfp4_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_mxfp4 * bq4 = (const block_mxfp4 *) vbq + kbx; const int * q8 = (const int *) bq8_1->qs + iqs; @@ -321,19 +317,17 @@ static __device__ __forceinline__ float vec_dot_mxfp4_q8_1( sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi); } - const float d = ggml_cuda_e8m0_to_fp32(bq4->e) * 0.5f * __low2float(bq8_1->ds); + const float d = ggml_cuda_e8m0_to_fp32(bq4->e) * 0.5f * ggml_cuda_cast(bq8_1->ds).x; return d * sumi; } #define VDR_NVFP4_Q8_1_MMVQ 4 #define VDR_NVFP4_Q8_1_MMQ 8 -static __device__ __forceinline__ float vec_dot_nvfp4_q8_1( - const void * __restrict__ vbq, - const block_q8_1 * __restrict__ bq8_1, - const int32_t & kbx, - const int32_t & iqs) { - +static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int32_t & kbx, + const int32_t & iqs) { const block_nvfp4 * bq4 = (const block_nvfp4 *) vbq + kbx; float sum = 0.0f; #pragma unroll @@ -343,7 +337,7 @@ static __device__ __forceinline__ float vec_dot_nvfp4_q8_1( const int32_t is = iqs0 >> 1; const int2 v0 = get_int_from_table_16(get_int_b4(bq4->qs, iqs0), kvalues_mxfp4); const int2 v1 = get_int_from_table_16(get_int_b4(bq4->qs, iqs1), kvalues_mxfp4); - const block_q8_1 * bq8 = bq8_1 + (is >> 1); + const block_q8_1_bf16 * bq8 = bq8_1 + (is >> 1); const int32_t i8 = ((is & 1) << 2); int sumi = ggml_cuda_dp4a(v0.x, get_int_b4(bq8->qs, i8 + 0), 0); @@ -351,7 +345,7 @@ static __device__ __forceinline__ float vec_dot_nvfp4_q8_1( sumi = ggml_cuda_dp4a(v1.x, get_int_b4(bq8->qs, i8 + 1), sumi); sumi = ggml_cuda_dp4a(v1.y, get_int_b4(bq8->qs, i8 + 3), sumi); - const float d = ggml_cuda_ue4m3_to_fp32(bq4->d[is]) * __low2float(bq8->ds); + const float d = ggml_cuda_ue4m3_to_fp32(bq4->d[is]) * ggml_cuda_cast(bq8->ds).x; sum += d * float(sumi); } @@ -527,10 +521,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( } // contiguous v/x + u/y values -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { - +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(const int * __restrict__ v, + const int * __restrict__ u, + const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, + const half2 & dm4, + const nv_bfloat162 * __restrict__ ds8) { float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -543,7 +539,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product } - const float2 ds8f = __half22float2(ds8[i]); + const float2 ds8f = ggml_cuda_cast(ds8[i]); sumf_d += ds8f.x * (sc[i] * sumi_d); sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val @@ -590,10 +586,12 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( } // contiguous v/x + u/y values -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { - +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(const int * __restrict__ v, + const int * __restrict__ u, + const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, + const half2 & dm4, + const nv_bfloat162 * __restrict__ ds8) { float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -606,7 +604,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product } - const float2 ds8f = __half22float2(ds8[i]); + const float2 ds8f = ggml_cuda_cast(ds8[i]); sumf_d += ds8f.x * (sc[i] * sumi_d); sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val @@ -672,6 +670,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( return d6 * sumf_d; } + static __device__ __forceinline__ float vec_dot_q1_0_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { @@ -718,7 +717,7 @@ static __device__ __forceinline__ float vec_dot_q1_0_q8_1( } static __device__ __forceinline__ float vec_dot_q4_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1_bf16 * __restrict__ bq8_1, const int & kbx, const int & iqs) { const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq + kbx; @@ -735,10 +734,10 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); } - -static __device__ __forceinline__ float vec_dot_q4_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq + kbx; int v[VDR_Q4_1_Q8_1_MMVQ]; @@ -754,9 +753,10 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); } -static __device__ __forceinline__ float vec_dot_q5_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq + kbx; int vl[VDR_Q5_0_Q8_1_MMVQ]; @@ -774,9 +774,10 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); } -static __device__ __forceinline__ float vec_dot_q5_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq + kbx; int vl[VDR_Q5_1_Q8_1_MMVQ]; @@ -794,9 +795,10 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); } -static __device__ __forceinline__ float vec_dot_q8_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq + kbx; int v[VDR_Q8_0_Q8_1_MMVQ]; @@ -808,12 +810,13 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( u[i] = get_int_b4(bq8_1->qs, iqs + i); } - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, ggml_cuda_cast(bq8_1->ds).x); } -static __device__ __forceinline__ float vec_dot_q2_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q2_K_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q2_K * bq2_K = (const block_q2_K *) vbq + kbx; const int bq8_offset = QR2_K * (iqs / QI8_1); @@ -828,15 +831,16 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( #pragma unroll for (int i = 0; i < QR2_K; ++ i) { u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + i].ds); + d8[i] = ggml_cuda_cast(bq8_1[bq8_offset + i].ds).x; } return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); } -static __device__ __forceinline__ float vec_dot_q3_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q3_K_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q3_K * bq3_K = (const block_q3_K *) vbq + kbx; const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); @@ -855,15 +859,16 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( #pragma unroll for (int i = 0; i < QR3_K; ++i) { u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + i].ds); + d8[i] = ggml_cuda_cast(bq8_1[bq8_offset + i].ds).x; } return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); } -static __device__ __forceinline__ float vec_dot_q4_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q4_K * bq4_K = (const block_q4_K *) vbq + kbx; int v[2]; @@ -896,8 +901,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const uint8_t * m = sc + 2; for (int i = 0; i < QR4_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = __low2float(bq8i->ds); + const block_q8_1_bf16 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = ggml_cuda_cast(bq8i->ds).x; const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -907,9 +912,10 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); } -static __device__ __forceinline__ float vec_dot_q5_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q5_K_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q5_K * bq5_K = (const block_q5_K *) vbq + kbx; int vl[2]; @@ -942,8 +948,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( #pragma unroll for (int i = 0; i < QR5_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = __low2float(bq8i->ds); + const block_q8_1_bf16 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = ggml_cuda_cast(bq8i->ds).x; const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -953,9 +959,10 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); } -static __device__ __forceinline__ float vec_dot_q6_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_q6_K_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_q6_K * bq6_K = (const block_q6_K *) vbq + kbx; const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); @@ -973,7 +980,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( #pragma unroll for (int i = 0; i < QR6_K; ++i) { u[i] = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds); + d8[i] = ggml_cuda_cast(bq8_1[bq8_offset + 2 * i].ds).x; } return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); @@ -982,9 +989,10 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( #define VDR_IQ2_XXS_Q8_1_MMVQ 2 #define VDR_IQ2_XXS_Q8_1_MMQ 2 -static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx; const int q2 = get_int_b2(bq2->qs, iqs); @@ -1010,16 +1018,17 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( const int ls = aux32 >> 27 | 1; // (scale * 2 + 1) sumi = sumi * ls / 8; // (sumi * scale + sumi / 2) / 4 - const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds); + const float d = __half2float(bq2->d) * ggml_cuda_cast(bq8_1[iqs / 2].ds).x; return d * sumi; } #define VDR_IQ2_XS_Q8_1_MMVQ 2 #define VDR_IQ2_XS_Q8_1_MMQ 2 -static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx; const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1)); @@ -1051,16 +1060,17 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( } } const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4; - const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds); + const float d = __half2float(bq2->d) * ggml_cuda_cast(bq8_1[iqs / 2].ds).x; return d * sumi; } #define VDR_IQ2_S_Q8_1_MMVQ 2 #define VDR_IQ2_S_Q8_1_MMQ 2 -static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx; const int qs_packed = get_int_b2(bq2->qs, iqs/2); @@ -1099,16 +1109,17 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( } const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4; - const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds); + const float d = __half2float(bq2->d) * ggml_cuda_cast(bq8_1[iqs / 2].ds).x; return d * sumi; } #define VDR_IQ3_XXS_Q8_1_MMVQ 2 #define VDR_IQ3_XXS_Q8_1_MMQ 2 -static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx; const int2 q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1)); @@ -1137,7 +1148,7 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( const int ls = aux32 >> 28; sumi = (ls*sumi + sumi/2)/2; - const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds); + const float d = __half2float(bq3->d) * ggml_cuda_cast(bq8_1[iqs / 2].ds).x; return d * sumi; } @@ -1145,9 +1156,10 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( #define VDR_IQ3_S_Q8_1_MMQ 2 // TODO: don't use lookup table for signs -static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx; const int2 qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1)); @@ -1180,15 +1192,17 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( sumi *= 1 + 2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F); - const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds); + const float d = __half2float(bq3->d) * ggml_cuda_cast(bq8_1[iqs / 2].ds).x; return d * sumi; } #define VDR_IQ1_S_Q8_1_MMVQ 1 #define VDR_IQ1_S_Q8_1_MMQ 1 -static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { +static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx; const int qs_packed = get_int_b2(bq1->qs, iqs); @@ -1213,16 +1227,17 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const float d1q = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1); const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000); - const float2 ds = __half22float2(bq8_1[iqs].ds); + const float2 ds = ggml_cuda_cast(bq8_1[iqs].ds); return d1q * (ds.x*sumi + ds.y*delta); } #define VDR_IQ1_M_Q8_1_MMVQ 1 #define VDR_IQ1_M_Q8_1_MMQ 1 -static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx; const int qs_packed = get_int_b4(bq1->qs, iqs); @@ -1256,7 +1271,7 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( iq1m_scale_t scale; scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000); - const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds); + const float d = __half2float(scale.f16) * ggml_cuda_cast(bq8_1[iqs].ds).x; const int tmp = sc[iqs/2] >> (6*(iqs%2)); const int sc0 = 2*((tmp >> 0) & 0x07) + 1; @@ -1267,9 +1282,10 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( #define VDR_IQ4_NL_Q8_1_MMVQ 2 #define VDR_IQ4_NL_Q8_1_MMQ 4 -static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx; const int * q8 = (const int *) bq8_1->qs + iqs; @@ -1284,16 +1300,17 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi); } - const float d = __half2float(bq4->d) * __low2float(bq8_1->ds); + const float d = __half2float(bq4->d) * ggml_cuda_cast(bq8_1->ds).x; return d * sumi; } #define VDR_IQ4_XS_Q8_1_MMVQ 4 #define VDR_IQ4_XS_Q8_1_MMQ 4 -static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - +static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(const void * __restrict__ vbq, + const block_q8_1_bf16 * __restrict__ bq8_1, + const int & kbx, + const int & iqs) { const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx; int sumi = 0; @@ -1312,6 +1329,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4); sumi *= ls - 32; - const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds); + const float d = __half2float(bq4->d) * ggml_cuda_cast(bq8_1[iqs / 4].ds).x; return d * sumi; } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 15443aa554a4..2ab801af4088 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -288,7 +288,9 @@ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_REST sum += y[i].qs[QK8_1/2 + j]; } - y[i].s = GGML_FP32_TO_FP16(sum*d); + // store sum as bf16 in the fp16 slot to preserve full fp32 range + // (avoids overflow when used in Q4_1/Q5_1/Q4_K/Q5_K dot products with large activations) + y[i].s = GGML_FP32_TO_BF16(sum * d).bits; } }