diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index 94b19b82bbc2..2bae1ba248bd 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -448,8 +448,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         __m256i i3 = _mm256_cvtps_epi32( v3 );
 
 #if defined(__AVX2__)
-        // Compute the sum of the quants and set y[i].s
-        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+        // Store sum as bf16 in the fp16 slot to preserve full fp32 range
+        // (avoids overflow when used in Q4_1/Q5_1/Q4_K/Q5_K dot products with large activations)
+        y[i].s = GGML_FP32_TO_BF16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))).bits;
 
         // Convert int32 to int16
         i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
@@ -476,10 +477,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         __m128i ni6 = _mm256_castsi256_si128( i3 );
         __m128i ni7 = _mm256_extractf128_si256( i3, 1);
 
-        // Compute the sum of the quants and set y[i].s
+        // Store sum as bf16 in the fp16 slot to preserve full fp32 range
+        // (avoids overflow when used in Q4_1/Q5_1/Q4_K/Q5_K dot products with large activations)
         const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
         const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
+        y[i].s = GGML_FP32_TO_BF16(d * hsum_i32_4(_mm_add_epi32(s0, s1))).bits;
 
         // Convert int32 to int16
         ni0 = _mm_packs_epi32( ni0, ni1 );
@@ -883,7 +885,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
         const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
 
-        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
+        // y[ib].s holds bf16 bits (see quantize_row_q8_1); decode as bf16 to preserve full fp32 range.
+        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s });
 
         const __m256 d0v = _mm256_set1_ps( d0 );
         const __m256 d1v = _mm256_set1_ps( d1 );
@@ -1108,7 +1111,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     for (; ib < nb; ++ib) {
         const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
 
-        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
+        // y[ib].s holds bf16 bits (see quantize_row_q8_1); decode as bf16 to preserve full fp32 range.
+        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s });
 
         __m256i qx = bytes_from_nibbles_32(x[ib].qs);
         __m256i bxhi = bytes_from_bits_32(x[ib].qh);
@@ -1135,7 +1139,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     for (; ib < nb; ++ib) {
         const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
 
-        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
+        // y[ib].s holds bf16 bits (see quantize_row_q8_1); decode as bf16 to preserve full fp32 range.
+        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s });
 
         __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
         const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
index e5f9a4083f9c..475d6cc9e21a 100644
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@@ -238,7 +238,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
+        // y[ib].s holds bf16 bits (see quantize_row_q8_1_ref); decode as bf16 to preserve full fp32 range.
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s });
     }
 
     *s = sumf;
@@ -391,7 +392,8 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
+        // y[ib].s holds bf16 bits (see quantize_row_q8_1_ref); decode as bf16 to preserve full fp32 range.
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_BF16_TO_FP32((ggml_bf16_t){ .bits = y[ib].s });
     }
 
     *s = sumf;
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 10817505d9f0..c07434830830 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -909,6 +909,21 @@ static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3
     return make_uint2(div_val, mod_val);
 }
 
+// CUDA-only bf16 sibling of block_q8_1. Identical byte layout, but the (d, s) pair
+// is stored as bf16 instead of fp16 so that s = d * sum(qs) cannot overflow the
+// 16-bit exponent range when activations contain large outliers (Q4_1, Q5_1,
+// Q4_K and Q5_K dot products multiply by s and would otherwise produce NaN).
+// CUDA quantizes Q8_1 activations on-device, so this struct never crosses the
+// CPU/GPU boundary; the host-side block_q8_1 in ggml-common.h is unaffected.
+// (No d/s union view: nv_bfloat16 has a non-trivial constructor, which C++
+// disallows in anonymous structs/unions. All call sites use ds directly.)
+struct block_q8_1_bf16 {
+    nv_bfloat162 ds;
+    int8_t       qs[QK8_1];
+};
+
+static_assert(sizeof(block_q8_1_bf16) == sizeof(block_q8_1), "block_q8_1_bf16 must match block_q8_1 byte layout");
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
 
 static __device__ __forceinline__ float get_alibi_slope(
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index edf546d8f1e2..6737eafad799 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -54,9 +54,26 @@ struct block_fp4_mmq {
     int8_t   qs[4 * 32];  // 256 FP4 values packed as 4-bit pairs (2 per byte)
 };
 
-static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
-static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1),      "Unexpected block_q8_1_mmq size");
-static_assert(sizeof(block_fp4_mmq)  == sizeof(block_q8_1_mmq),    "Unexpected block_fp4_mmq size");
+// CUDA-only bf16 sibling of block_q8_1_mmq. Identical byte layout; the DS4
+// layout (1 scale + 1 partial sum per 32 values) holds bf16 pairs instead of
+// fp16 pairs to keep the partial sum within range for Q4_1/Q5_1/Q4_K/Q5_K
+// dot products. The D4 (fp32) and D2S6 (fp16, Q2_K) layouts are unchanged.
+// The union is named (.u) because nv_bfloat16 carries a non-trivial constructor
+// in cuda_bf16.h, which C++ disallows in anonymous aggregates.
+struct block_q8_1_mmq_bf16 {
+    union {
+        float        d4[4];
+        nv_bfloat162 ds4[4];
+        half         d2s6[8];
+    } u;
+
+    int8_t qs[4 * QK8_1];
+};
+
+static_assert(sizeof(block_q8_1_mmq) == 4 * QK8_1 + 4 * sizeof(half2), "Unexpected block_q8_1_mmq size");
+static_assert(sizeof(block_q8_1_mmq) == 4 * sizeof(block_q8_1), "Unexpected block_q8_1_mmq size");
+static_assert(sizeof(block_q8_1_mmq_bf16) == sizeof(block_q8_1_mmq), "Unexpected block_q8_1_mmq_bf16 size");
+static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected block_fp4_mmq size");
 
 static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
     switch (type_x) {
@@ -463,12 +480,12 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
     constexpr int warp_size = ggml_cuda_get_physical_warp_size();
 
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
-    const int   * x_qs = (const int   *) x;
-    const float * x_df = (const float *) x_qs + txs.qs;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
+    const int *            x_qs = (const int *) x;
+    const float *          x_df = (const float *) x_qs + txs.qs;
+    const int *            y_qs = (const int *) y + 4;
+    const nv_bfloat162 *   y_ds = (const nv_bfloat162 *) y;
 
-// #pragma unroll
+    // #pragma unroll
     for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_0*VDR_Q4_0_Q8_1_MMQ) {
         const int k0 = k00 + k01;
 
@@ -574,12 +591,12 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
     constexpr int warp_size = ggml_cuda_get_physical_warp_size();
 
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
+    const int *            x_qs = (const int *) x;
+    const half2 *          x_dm = (const half2 *) x_qs + txs.qs;
+    const int *            y_qs = (const int *) y + 4;
+    const nv_bfloat162 *   y_ds = (const nv_bfloat162 *) y;
 
-// #pragma unroll
+    // #pragma unroll
     for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_1*VDR_Q4_1_Q8_1_MMQ) {
         const int k0 = k00 + k01;
 
@@ -1170,11 +1187,11 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
 
     y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
 
-    const int   * x_qs = (const int   *) x;
-    const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K;
-    const int   * y_qs = (const int   *) y + 4;
-    const float * y_df = (const float *) y;
-    const half2 * y_ds = (const half2 *) y;
+    const int *          x_qs = (const int *) x;
+    const float *        x_df = (const float *) x_qs + 2 * MMQ_TILE_NE_K;
+    const int *          y_qs = (const int *) y + 4;
+    const float *        y_df = (const float *) y;
+    const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y;
 
     const int i0 = (threadIdx.y / ntx) * rows_per_warp;
 
@@ -1197,7 +1214,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
             if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) {
                 dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
             } else {
-                dB = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+                dB = ggml_cuda_cast<float2>(y_ds[j * MMQ_TILE_Y_K + k01 / QI8_1]).x;
             }
 
 #pragma unroll
@@ -1225,11 +1242,11 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
 
     y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
 
-    const int   * x_qs = (const int   *) x;
-    const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K;
-    const int   * y_qs = (const int   *) y + 4;
-    const float * y_df = (const float *) y;
-    const half2 * y_ds = (const half2 *) y;
+    const int *          x_qs = (const int *) x;
+    const float *        x_df = (const float *) x_qs + 2 * MMQ_TILE_NE_K;
+    const int *          y_qs = (const int *) y + 4;
+    const float *        y_df = (const float *) y;
+    const nv_bfloat162 * y_ds = (const nv_bfloat162 *) y;
 
     tile_A A[ntx][MMQ_TILE_NE_K/QI8_0];
     float dA[ntx][tile_C::ne/2][MMQ_TILE_NE_K/QI8_0];
@@ -1272,9 +1289,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
                 const int j = j0 + tile_C::get_j(l);
 
                 if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) {
-                    dB[l] =             y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
+                    dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
                 } else {
-                    dB[l] = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+                    dB[l] = ggml_cuda_cast<float2>(y_ds[j * MMQ_TILE_Y_K + k01 / QI8_1]).x;
                 }
             }
 
@@ -1301,12 +1318,12 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
     constexpr int warp_size = ggml_cuda_get_physical_warp_size();
 
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y);
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
+    const int *            x_qs = (const int *) x;
+    const half2 *          x_dm = (const half2 *) x_qs + txs.qs;
+    const int *            y_qs = (const int *) y + 4;
+    const nv_bfloat162 *   y_ds = (const nv_bfloat162 *) y;
 
-// #pragma unroll
+    // #pragma unroll
     for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += VDR_Q8_0_Q8_1_MMQ) {
         const int k0 = k00 + k01;
 
@@ -1341,10 +1358,10 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
 
     y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
 
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_dm = (const half2 *) y;
+    const int *          x_qs = (const int *) x;
+    const half2 *        x_dm = (const half2 *) x_qs + 2 * MMQ_TILE_NE_K;
+    const int *          y_qs = (const int *) y + 4;
+    const nv_bfloat162 * y_dm = (const nv_bfloat162 *) y;
 
     const int i0 = (threadIdx.y / ntx) * rows_per_warp;
 
@@ -1363,7 +1380,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
             load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
 
             const int j = j0 + tile_C::get_j(0);
-            const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            const float2 dsB = ggml_cuda_cast<float2>(y_dm[j * MMQ_TILE_Y_K + k01 / QI8_1]);
 
 #pragma unroll
             for (int n = 0; n < ntx; ++n) {
@@ -1391,10 +1408,10 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
 
     y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
 
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_dm = (const half2 *) y;
+    const int *          x_qs = (const int *) x;
+    const half2 *        x_dm = (const half2 *) x_qs + 2 * MMQ_TILE_NE_K;
+    const int *          y_qs = (const int *) y + 4;
+    const nv_bfloat162 * y_dm = (const nv_bfloat162 *) y;
 
     tile_A   A[ntx][MMQ_TILE_NE_K/QI8_1];
     float2 dmA[ntx][tile_C::ne/2][MMQ_TILE_NE_K/QI8_1];
@@ -1436,7 +1453,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
             for (int l = 0; l < tile_C::ne/2; ++l) {
                 const int j = j0 + tile_C::get_j(l);
 
-                dsB[l] = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
+                dsB[l] = ggml_cuda_cast<float2>(y_dm[j * MMQ_TILE_Y_K + k01 / QI8_1]);
             }
 
 #pragma unroll
@@ -2206,13 +2223,13 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
     constexpr int warp_size = ggml_cuda_get_physical_warp_size();
 
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
-    const int   * x_sc = (const int   *) x_dm + txs.dm;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
+    const int *            x_qs = (const int *) x;
+    const half2 *          x_dm = (const half2 *) x_qs + txs.qs;
+    const int *            x_sc = (const int *) x_dm + txs.dm;
+    const int *            y_qs = (const int *) y + 4;
+    const nv_bfloat162 *   y_ds = (const nv_bfloat162 *) y;
 
-// #pragma unroll
+    // #pragma unroll
     for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_K*VDR_Q4_K_Q8_1_MMQ) {
         const int k0 = k00 + k01;
 
@@ -2363,11 +2380,11 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
     constexpr int warp_size = ggml_cuda_get_physical_warp_size();
 
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
-    const int   * x_sc = (const int   *) x_dm + txs.dm;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
+    const int *            x_qs = (const int *) x;
+    const half2 *          x_dm = (const half2 *) x_qs + txs.qs;
+    const int *            x_sc = (const int *) x_dm + txs.dm;
+    const int *            y_qs = (const int *) y + 4;
+    const nv_bfloat162 *   y_ds = (const nv_bfloat162 *) y;
 
 // #pragma unroll
     for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR5_K*VDR_Q5_K_Q8_1_MMQ) {
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index da48f313a38b..3f72ac766679 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -5,7 +5,10 @@
 
 #include <cstdint>
 
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq,
+                                  const block_q8_1_bf16 * __restrict__ bq8_1,
+                                  const int & kbx,
+                                  const int & iqs);
 
 static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
     switch (type) {
@@ -480,7 +483,8 @@ static __global__ void mul_mat_vec_q(
     float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}};
     float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};
 
-    const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
+    const block_q8_1_bf16 * y =
+        ((const block_q8_1_bf16 *) vy) + sample_y * stride_sample_y + channel_y * stride_channel_y;
     const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
 
     for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
@@ -628,7 +632,8 @@ static __global__ void mul_mat_vec_q_moe(
     const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride];
     const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y);
 
-    const block_q8_1 * y = ((const block_q8_1 *) vy) + channel_y*stride_channel_y + token_idx*stride_col_y;
+    const block_q8_1_bf16 * y =
+        ((const block_q8_1_bf16 *) vy) + channel_y * stride_channel_y + token_idx * stride_col_y;
     const int kbx_offset  = channel_x*stride_channel_x + row0*stride_row_x;
 
     // partial sum for each thread
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 52f664719ae1..b0c838d5b989 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -23,7 +23,7 @@ static __global__ void quantize_q8_1(
 
     const int64_t i_cont = ((i3*ne2.z + i2) * ne1 + i1) * ne0 + i0;
 
-    block_q8_1 * y = (block_q8_1 *) vy;
+    block_q8_1_bf16 * y = (block_q8_1_bf16 *) vy;
 
     const int64_t ib  = i_cont / QK8_1; // block index
     const int64_t iqs = i_cont % QK8_1; // quant index
@@ -44,7 +44,7 @@ static __global__ void quantize_q8_1(
         return;
     }
 
-    y[ib].ds = make_half2(d, sum);
+    y[ib].ds = nv_bfloat162{ __float2bfloat16(d), __float2bfloat16(sum) };
 }
 
 __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
@@ -294,7 +294,7 @@ static __global__ void quantize_mmq_q8_1(
 
     const float4 * x4 = (const float4 *) x;
 
-    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
+    block_q8_1_mmq_bf16 * y = (block_q8_1_mmq_bf16 *) vy;
 
     const int64_t ib0 = blockIdx.z*((int64_t)gridDim.x*gridDim.y*blockDim.x/QK8_1); // first block of channel
     const int64_t ib  = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.x;                    // block index in channel
@@ -340,7 +340,7 @@ static __global__ void quantize_mmq_q8_1(
             return;
         }
 
-        y[ib].d2s6[2 + iqs/16] = sum;
+        y[ib].u.d2s6[2 + iqs / 16] = sum;
 
         if (iqs % 64 != 0) {
             return;
@@ -348,7 +348,7 @@ static __global__ void quantize_mmq_q8_1(
 
         const float d = 1.0f / d_inv;
 
-        y[ib].d2s6[iqs/64] = d;
+        y[ib].u.d2s6[iqs / 64] = d;
 
         return;
     }
@@ -360,9 +360,9 @@ static __global__ void quantize_mmq_q8_1(
     const float d = 1.0f / d_inv;
 
     if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
-        y[ib].ds4[iqs/32] = make_half2(d, sum);
+        y[ib].u.ds4[iqs / 32] = nv_bfloat162{ __float2bfloat16(d), __float2bfloat16(sum) };
     } else {
-        y[ib].d4[iqs/32]  = d;
+        y[ib].u.d4[iqs / 32] = d;
     }
 }
 
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
index d1741cc8d7ba..f8b14bd66693 100644
--- a/ggml/src/ggml-cuda/vecdotq.cuh
+++ b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "common.cuh"
+#include "convert.cuh"
 
 #include <cstdint>
 
@@ -112,9 +113,11 @@ static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
 #define VDR_Q4_0_Q8_1_MMVQ 2
 #define VDR_Q4_0_Q8_1_MMQ  4
 
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
-    const int * v, const int * u, const float & d4, const half2 & ds8) {
-
+template <int vdr>
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(const int *          v,
+                                                               const int *          u,
+                                                               const float &        d4,
+                                                               const nv_bfloat162 & ds8) {
     int sumi = 0;
 
 #pragma unroll
@@ -127,7 +130,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
     }
 
-    const float2 ds8f = __half22float2(ds8);
+    const float2 ds8f = ggml_cuda_cast<float2>(ds8);
 
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
@@ -136,9 +139,11 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
 #define VDR_Q4_1_Q8_1_MMVQ 2
 #define VDR_Q4_1_Q8_1_MMQ  4
 
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
-
+template <int vdr>
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(const int *          v,
+                                                               const int *          u,
+                                                               const half2 &        dm4,
+                                                               const nv_bfloat162 & ds8) {
     int sumi = 0;
 
 #pragma unroll
@@ -151,16 +156,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
     }
 
-#ifdef FAST_FP16_AVAILABLE
-    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
-    const float d4d8 = tmp.x;
-    const float m4s8 = tmp.y;
-#else
     const float2 dm4f = __half22float2(dm4);
-    const float2 ds8f = __half22float2(ds8);
+    const float2 ds8f = ggml_cuda_cast<float2>(ds8);
     const float d4d8 = dm4f.x * ds8f.x;
     const float m4s8 = dm4f.y * ds8f.y;
-#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
@@ -169,9 +168,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
 #define VDR_Q5_0_Q8_1_MMVQ 2
 #define VDR_Q5_0_Q8_1_MMQ  4
 
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
-
+template <int vdr>
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(const int *          vl,
+                                                               const int *          vh,
+                                                               const int *          u,
+                                                               const float &        d5,
+                                                               const nv_bfloat162 & ds8) {
     int sumi = 0;
 
 #pragma unroll
@@ -191,7 +193,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }
 
-    const float2 ds8f = __half22float2(ds8);
+    const float2 ds8f = ggml_cuda_cast<float2>(ds8);
 
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
@@ -200,9 +202,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
 #define VDR_Q5_1_Q8_1_MMVQ 2
 #define VDR_Q5_1_Q8_1_MMQ  4
 
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
-
+template <int vdr>
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(const int *          vl,
+                                                               const int *          vh,
+                                                               const int *          u,
+                                                               const half2 &        dm5,
+                                                               const nv_bfloat162 & ds8) {
     int sumi = 0;
 
 #pragma unroll
@@ -222,16 +227,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }
 
-#ifdef FAST_FP16_AVAILABLE
-    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
-    const float d5d8 = tmp.x;
-    const float m5s8 = tmp.y;
-#else
     const float2 dm5f = __half22float2(dm5);
-    const float2 ds8f = __half22float2(ds8);
+    const float2 ds8f = ggml_cuda_cast<float2>(ds8);
     const float d5d8 = dm5f.x * ds8f.x;
     const float m5s8 = dm5f.y * ds8f.y;
-#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
@@ -254,9 +253,11 @@ template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_
     return d8_0*d8_1 * ((T) sumi);
 }
 
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
-
+template <int vdr>
+static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(const int *          v,
+                                                               const int *          u,
+                                                               const half2 &        dm8,
+                                                               const nv_bfloat162 & ds8) {
     int sumi = 0;
 
 #pragma unroll
@@ -265,16 +266,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
         sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
     }
 
-#ifdef FAST_FP16_AVAILABLE
-    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
-    const float d8d8 = tmp.x;
-    const float m8s8 = tmp.y;
-#else
     const float2 dm8f = __half22float2(dm8);
-    const float2 ds8f = __half22float2(ds8);
+    const float2 ds8f = ggml_cuda_cast<float2>(ds8);
     const float d8d8 = dm8f.x * ds8f.x;
     const float m8s8 = dm8f.y * ds8f.y;
-#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
@@ -304,9 +299,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_
 #define VDR_MXFP4_Q8_1_MMVQ 2
 #define VDR_MXFP4_Q8_1_MMQ  4
 
-static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(const void * __restrict__ vbq,
+                                                           const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                           const int & kbx,
+                                                           const int & iqs) {
     const block_mxfp4 * bq4 = (const block_mxfp4 *) vbq + kbx;
 
     const int * q8 = (const int *) bq8_1->qs + iqs;
@@ -321,19 +317,17 @@ static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(
         sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
     }
 
-    const float d = ggml_cuda_e8m0_to_fp32(bq4->e) * 0.5f * __low2float(bq8_1->ds);
+    const float d = ggml_cuda_e8m0_to_fp32(bq4->e) * 0.5f * ggml_cuda_cast<float2>(bq8_1->ds).x;
     return d * sumi;
 }
 
 #define VDR_NVFP4_Q8_1_MMVQ 4
 #define VDR_NVFP4_Q8_1_MMQ  8
 
-static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(
-                                        const void * __restrict__ vbq,
-                                        const block_q8_1 * __restrict__ bq8_1,
-                                        const int32_t & kbx,
-                                        const int32_t & iqs) {
-
+static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(const void * __restrict__ vbq,
+                                                           const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                           const int32_t & kbx,
+                                                           const int32_t & iqs) {
     const block_nvfp4 * bq4 = (const block_nvfp4 *) vbq + kbx;
     float sum = 0.0f;
 #pragma unroll
@@ -343,7 +337,7 @@ static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(
         const int32_t is = iqs0 >> 1;
         const int2 v0 = get_int_from_table_16(get_int_b4(bq4->qs, iqs0), kvalues_mxfp4);
         const int2 v1 = get_int_from_table_16(get_int_b4(bq4->qs, iqs1), kvalues_mxfp4);
-        const block_q8_1 * bq8 = bq8_1 + (is >> 1);
+        const block_q8_1_bf16 * bq8  = bq8_1 + (is >> 1);
         const int32_t i8 = ((is & 1) << 2);
 
         int sumi = ggml_cuda_dp4a(v0.x, get_int_b4(bq8->qs, i8 + 0), 0);
@@ -351,7 +345,7 @@ static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(
         sumi = ggml_cuda_dp4a(v1.x, get_int_b4(bq8->qs, i8 + 1), sumi);
         sumi = ggml_cuda_dp4a(v1.y, get_int_b4(bq8->qs, i8 + 3), sumi);
 
-        const float d = ggml_cuda_ue4m3_to_fp32(bq4->d[is]) * __low2float(bq8->ds);
+        const float d = ggml_cuda_ue4m3_to_fp32(bq4->d[is]) * ggml_cuda_cast<float2>(bq8->ds).x;
         sum += d * float(sumi);
     }
 
@@ -527,10 +521,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
 }
 
 // contiguous v/x + u/y values
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(const int * __restrict__ v,
+                                                                   const int * __restrict__ u,
+                                                                   const uint8_t * __restrict__ sc,
+                                                                   const uint8_t * __restrict__ m,
+                                                                   const half2 & dm4,
+                                                                   const nv_bfloat162 * __restrict__ ds8) {
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
@@ -543,7 +539,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
             sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }
 
-        const float2 ds8f = __half22float2(ds8[i]);
+        const float2 ds8f = ggml_cuda_cast<float2>(ds8[i]);
 
         sumf_d += ds8f.x * (sc[i] * sumi_d);
         sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
@@ -590,10 +586,12 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
 }
 
 // contiguous v/x + u/y values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(const int * __restrict__ v,
+                                                                   const int * __restrict__ u,
+                                                                   const uint8_t * __restrict__ sc,
+                                                                   const uint8_t * __restrict__ m,
+                                                                   const half2 & dm4,
+                                                                   const nv_bfloat162 * __restrict__ ds8) {
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
@@ -606,7 +604,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
             sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }
 
-        const float2 ds8f = __half22float2(ds8[i]);
+        const float2 ds8f = ggml_cuda_cast<float2>(ds8[i]);
 
         sumf_d += ds8f.x * (sc[i] * sumi_d);
         sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
@@ -672,6 +670,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 }
 
+
 static __device__ __forceinline__ float vec_dot_q1_0_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
@@ -718,7 +717,7 @@ static __device__ __forceinline__ float vec_dot_q1_0_q8_1(
 }
 
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+    const void * __restrict__ vbq, const block_q8_1_bf16 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq + kbx;
 
@@ -735,10 +734,10 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
     return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 }
 
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq + kbx;
 
     int v[VDR_Q4_1_Q8_1_MMVQ];
@@ -754,9 +753,10 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
     return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 }
 
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq + kbx;
 
     int vl[VDR_Q5_0_Q8_1_MMVQ];
@@ -774,9 +774,10 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
     return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 }
 
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq + kbx;
 
     int vl[VDR_Q5_1_Q8_1_MMVQ];
@@ -794,9 +795,10 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
     return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq + kbx;
 
     int v[VDR_Q8_0_Q8_1_MMVQ];
@@ -808,12 +810,13 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_b4(bq8_1->qs, iqs + i);
     }
 
-    return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
+    return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, ggml_cuda_cast<float2>(bq8_1->ds).x);
 }
 
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q2_K * bq2_K = (const block_q2_K *) vbq + kbx;
 
     const int bq8_offset = QR2_K * (iqs / QI8_1);
@@ -828,15 +831,16 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i]  = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
+        d8[i] = ggml_cuda_cast<float2>(bq8_1[bq8_offset + i].ds).x;
     }
 
     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
 }
 
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q3_K * bq3_K = (const block_q3_K *) vbq + kbx;
 
     const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
@@ -855,15 +859,16 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i]  = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
+        d8[i] = ggml_cuda_cast<float2>(bq8_1[bq8_offset + i].ds).x;
     }
 
     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
 }
 
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q4_K * bq4_K = (const block_q4_K *) vbq + kbx;
 
     int    v[2];
@@ -896,8 +901,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const uint8_t * m  = sc + 2;
 
     for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2float(bq8i->ds);
+        const block_q8_1_bf16 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i]                        = ggml_cuda_cast<float2>(bq8i->ds).x;
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -907,9 +912,10 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
 }
 
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q5_K * bq5_K = (const block_q5_K *) vbq + kbx;
 
     int   vl[2];
@@ -942,8 +948,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2float(bq8i->ds);
+        const block_q8_1_bf16 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i]                        = ggml_cuda_cast<float2>(bq8i->ds).x;
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -953,9 +959,10 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
 }
 
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(const void * __restrict__ vbq,
+                                                          const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                          const int & kbx,
+                                                          const int & iqs) {
     const block_q6_K * bq6_K = (const block_q6_K *) vbq + kbx;
 
     const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
@@ -973,7 +980,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i]  = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
+        d8[i] = ggml_cuda_cast<float2>(bq8_1[bq8_offset + 2 * i].ds).x;
     }
 
     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -982,9 +989,10 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #define VDR_IQ2_XXS_Q8_1_MMVQ 2
 #define VDR_IQ2_XXS_Q8_1_MMQ  2
 
-static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(const void * __restrict__ vbq,
+                                                             const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                             const int & kbx,
+                                                             const int & iqs) {
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;
 
     const int q2 = get_int_b2(bq2->qs, iqs);
@@ -1010,16 +1018,17 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
 
     const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
     sumi = sumi * ls / 8;           // (sumi * scale + sumi / 2) / 4
-    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+    const float d = __half2float(bq2->d) * ggml_cuda_cast<float2>(bq8_1[iqs / 2].ds).x;
     return d * sumi;
 }
 
 #define VDR_IQ2_XS_Q8_1_MMVQ 2
 #define VDR_IQ2_XS_Q8_1_MMQ  2
 
-static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(const void * __restrict__ vbq,
+                                                            const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                            const int & kbx,
+                                                            const int & iqs) {
     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;
 
     const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1));
@@ -1051,16 +1060,17 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
         }
     }
     const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
-    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+    const float d    = __half2float(bq2->d) * ggml_cuda_cast<float2>(bq8_1[iqs / 2].ds).x;
     return d * sumi;
 }
 
 #define VDR_IQ2_S_Q8_1_MMVQ 2
 #define VDR_IQ2_S_Q8_1_MMQ  2
 
-static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(const void * __restrict__ vbq,
+                                                           const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                           const int & kbx,
+                                                           const int & iqs) {
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;
 
     const int       qs_packed = get_int_b2(bq2->qs, iqs/2);
@@ -1099,16 +1109,17 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
     }
     const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
 
-    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+    const float d = __half2float(bq2->d) * ggml_cuda_cast<float2>(bq8_1[iqs / 2].ds).x;
     return d * sumi;
 }
 
 #define VDR_IQ3_XXS_Q8_1_MMVQ 2
 #define VDR_IQ3_XXS_Q8_1_MMQ  2
 
-static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(const void * __restrict__ vbq,
+                                                             const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                             const int & kbx,
+                                                             const int & iqs) {
     const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx;
 
     const int2 q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1));
@@ -1137,7 +1148,7 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 
     const int ls = aux32 >> 28;
     sumi = (ls*sumi + sumi/2)/2;
-    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
+    const float d = __half2float(bq3->d) * ggml_cuda_cast<float2>(bq8_1[iqs / 2].ds).x;
     return d * sumi;
 }
 
@@ -1145,9 +1156,10 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #define VDR_IQ3_S_Q8_1_MMQ  2
 
 // TODO: don't use lookup table for signs
-static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(const void * __restrict__ vbq,
+                                                           const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                           const int & kbx,
+                                                           const int & iqs) {
     const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx;
 
     const int2      qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1));
@@ -1180,15 +1192,17 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
 
     sumi *= 1 + 2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F);
 
-    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
+    const float d = __half2float(bq3->d) * ggml_cuda_cast<float2>(bq8_1[iqs / 2].ds).x;
     return d * sumi;
 }
 
 #define VDR_IQ1_S_Q8_1_MMVQ 1
 #define VDR_IQ1_S_Q8_1_MMQ  1
 
-static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(const void * __restrict__ vbq,
+                                                           const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                           const int & kbx,
+                                                           const int & iqs) {
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;
 
     const int       qs_packed = get_int_b2(bq1->qs, iqs);
@@ -1213,16 +1227,17 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
 
     const float  d1q   = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
     const float  delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
-    const float2 ds    = __half22float2(bq8_1[iqs].ds);
+    const float2 ds    = ggml_cuda_cast<float2>(bq8_1[iqs].ds);
     return d1q * (ds.x*sumi + ds.y*delta);
 }
 
 #define VDR_IQ1_M_Q8_1_MMVQ 1
 #define VDR_IQ1_M_Q8_1_MMQ  1
 
-static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(const void * __restrict__ vbq,
+                                                           const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                           const int & kbx,
+                                                           const int & iqs) {
     const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;
 
     const int       qs_packed = get_int_b4(bq1->qs, iqs);
@@ -1256,7 +1271,7 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
 
     iq1m_scale_t scale;
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
-    const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);
+    const float d = __half2float(scale.f16) * ggml_cuda_cast<float2>(bq8_1[iqs].ds).x;
 
     const int tmp = sc[iqs/2] >> (6*(iqs%2));
     const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
@@ -1267,9 +1282,10 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
 #define VDR_IQ4_NL_Q8_1_MMVQ 2
 #define VDR_IQ4_NL_Q8_1_MMQ  4
 
-static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(const void * __restrict__ vbq,
+                                                            const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                            const int & kbx,
+                                                            const int & iqs) {
     const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx;
 
     const int * q8 = (const int *) bq8_1->qs + iqs;
@@ -1284,16 +1300,17 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
         sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
     }
 
-    const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
+    const float d = __half2float(bq4->d) * ggml_cuda_cast<float2>(bq8_1->ds).x;
     return d * sumi;
 }
 
 #define VDR_IQ4_XS_Q8_1_MMVQ 4
 #define VDR_IQ4_XS_Q8_1_MMQ  4
 
-static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
+static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(const void * __restrict__ vbq,
+                                                            const block_q8_1_bf16 * __restrict__ bq8_1,
+                                                            const int & kbx,
+                                                            const int & iqs) {
     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;
 
     int sumi = 0;
@@ -1312,6 +1329,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
     const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4);
     sumi *= ls - 32;
 
-    const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds);
+    const float d = __half2float(bq4->d) * ggml_cuda_cast<float2>(bq8_1[iqs / 4].ds).x;
     return d * sumi;
 }
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 15443aa554a4..2ab801af4088 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -288,7 +288,9 @@ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_REST
             sum += y[i].qs[QK8_1/2 + j];
         }
 
-        y[i].s = GGML_FP32_TO_FP16(sum*d);
+        // store sum as bf16 in the fp16 slot to preserve full fp32 range
+        // (avoids overflow when used in Q4_1/Q5_1/Q4_K/Q5_K dot products with large activations)
+        y[i].s = GGML_FP32_TO_BF16(sum * d).bits;
     }
 }