diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
index 2666a78a96a..9e8c9966e04 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -742,17 +742,45 @@ static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst,
 // activations : fp32 -> fp16
 static void transfer_activation_chunk_fp32_to_fp16(__fp16 *restrict vtcm_dst, const float *restrict src, int n_rows, int k_block, int k_stride) {
-    for (int r = 0; r < n_rows; r += 2) {
+    const int n_rows_padded = hex_align_up(n_rows, HMX_FP16_TILE_N_ROWS);
+    const int n_rows_tiled = (n_rows / HMX_FP16_TILE_N_ROWS) * HMX_FP16_TILE_N_ROWS;
+
+    int r = 0;
+
+    #pragma unroll(2)
+    for (r = 0; r < n_rows_tiled; r += 2) {
         int r0 = r / HMX_FP16_TILE_N_ROWS; // tile row index
         int r1 = r % HMX_FP16_TILE_N_ROWS; // intra-tile row idx
 
-        const bool next_row_valid = (r + 1) < n_rows;
-
         const HVX_Vector *pv_in0 = (const HVX_Vector *) (src + (r + 0) * k_stride);
         const HVX_Vector *pv_in1 = (const HVX_Vector *) (src + (r + 1) * k_stride);
 
         for (int c = 0; c < k_block; c += 32) {
             HVX_Vector v0 = *pv_in0++;
-            HVX_Vector v1 = next_row_valid ? *pv_in1++ : Q6_V_vzero();
+            HVX_Vector v1 = *pv_in1++;
+
+            HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1);
+
+            // compute output position
+            int c0 = c / HMX_FP16_TILE_N_COLS; // tile column index
+            int tile_idx = r0 * (k_block / HMX_FP16_TILE_N_COLS) + c0;
+
+            HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HMX_FP16_TILE_N_ELMS);
+            tile[r1 / 2] = v_out;
+        }
+    }
+
+    for (; r < n_rows_padded; r += 2) {
+        int r0 = r / HMX_FP16_TILE_N_ROWS; // tile row index
+        int r1 = r % HMX_FP16_TILE_N_ROWS; // intra-tile row idx
+
+        const bool row0_valid = r < n_rows;
+        const bool row1_valid = (r + 1) < n_rows;
+
+        const HVX_Vector *pv_in0 = row0_valid ? (const HVX_Vector *) (src + (r + 0) * k_stride) : NULL;
+        const HVX_Vector *pv_in1 = row1_valid ? (const HVX_Vector *) (src + (r + 1) * k_stride) : NULL;
+        for (int c = 0; c < k_block; c += 32) {
+            HVX_Vector v0 = row0_valid ? *pv_in0++ : Q6_V_vzero();
+            HVX_Vector v1 = row1_valid ? *pv_in1++ : Q6_V_vzero();
 
             HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1);
 
@@ -889,7 +917,9 @@ static __attribute__((noinline)) int mat_mul_qk_0_d16a32_out_stationary(struct h
     // n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
     const size_t m_block_cost = (size_t) n * 3;
     const size_t n_block_cost = (size_t) m * 2;
-    if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn, m, n, m_block_cost, n_block_cost, &M_BLOCK_SIZE,
+    if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn,
+            hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
+            m_block_cost, n_block_cost, &M_BLOCK_SIZE,
             &N_BLOCK_SIZE, &vtcm_used) != 0) {
         FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
         return -1;
@@ -1084,7 +1114,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
     if (m >= 128) {
         size_t mc = 0, nc = 0, used = 0;

-        if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, pipe_per_n, /*per_m=*/vec_dot_size, pipe_per_mn, m, n,
+        if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, pipe_per_n, /*per_m=*/vec_dot_size, pipe_per_mn,
+                hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
                 /*m_block_cost=*/(size_t) n * 3, /*n_block_cost=*/(size_t) m * 2,
                 &mc, &nc, &used) == 0 &&
             hmx_ceil_div((size_t) n, nc) >= 2) {
@@ -1096,7 +1127,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
     }

     if (!use_pipeline) {
-        if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, seq_per_n, /*per_m=*/vec_dot_size, seq_per_mn, m, n,
+        if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, seq_per_n, /*per_m=*/vec_dot_size, seq_per_mn,
+                hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
                 /*m_block_cost=*/(size_t) n * 3, /*n_block_cost=*/(size_t) m * 2,
                 &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
             FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
@@ -1432,7 +1464,8 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
     if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
             /*per_n=*/3 * vec_dot_size,
             /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
-            /*per_mn=*/sizeof(__fp16), params->m, params->n,
+            /*per_mn=*/sizeof(__fp16),
+            hex_align_up(params->m, HMX_FP16_TILE_N_ROWS), params->n,
             /*m_block_cost=*/(size_t) params->n, /*n_block_cost=*/(size_t) params->m,
             &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
         FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
@@ -1612,7 +1645,7 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
             /*per_n=*/3 * vec_dot_size, // W + S0 + S1
             /*per_m=*/vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch
             /*per_mn=*/sizeof(__fp16), // O
-            m, n,
+            hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
             /*m_block_cost=*/(size_t) n, /*n_block_cost=*/(size_t) m,
             &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
         FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index a0c265132c8..2461ae617fa 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -2991,12 +2991,10 @@ int op_matmul(struct htp_ops_context * octx) {
         return op_matmul_hvx(octx);
     }

-    // M alignment: when M > 32 but not 32-aligned, we split into
-    // HMX (first m_hmx = M & ~31 rows) + HVX (remaining m_tail rows).
-    // When M <= 32 and not 32-aligned, fall back entirely to HVX.
+    // M alignment: Use HMX when M >= 32, the last partial tile (m_total % 32 rows)
+    // is handled by HMX itself; when M < 32 fall back to HVX.
const int m_total = (int) src1->ne[1]; - const int m_tail = m_total % 32; - const int m_hmx = m_total - m_tail; + const int m_hmx = m_total & ~31; // 0 when M < 32 if (m_hmx == 0) { return op_matmul_hvx(octx); @@ -3009,7 +3007,6 @@ int op_matmul(struct htp_ops_context * octx) { int k = (int) src0->ne[0]; // inner dimension int n = (int) src0->ne[1]; // weight columns - // --- Phase 1: HMX on the first m_hmx (32-aligned) rows --- int ret = -1; // Row strides in elements. For compact tensors these equal k; for @@ -3027,7 +3024,7 @@ int op_matmul(struct htp_ops_context * octx) { .dst = (float *) dst->data, .activation = (float *) src1->data, .permuted_weight = (const __fp16 *) src0->data, - .m = m_hmx, + .m = m_total, .k = k, .n = n, .act_stride = act_stride, @@ -3048,12 +3045,12 @@ int op_matmul(struct htp_ops_context * octx) { } else { ret = hmx_mat_mul_permuted_w16a32(octx->ctx, (float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data, - m_hmx, k, n, act_stride, wgt_stride); + m_total, k, n, act_stride, wgt_stride); } } else { ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data, - m_hmx, k, n, (int) src0->type); + m_total, k, n, (int) src0->type); } if (ret != 0) { @@ -3061,27 +3058,6 @@ int op_matmul(struct htp_ops_context * octx) { return op_matmul(octx); } - // --- Phase 2: HVX on the remaining m_tail rows --- - if (m_tail > 0) { - // copy of src1 and dst - struct htp_tensor src1_tail = *src1; - struct htp_tensor dst_tail = *dst; - - src1_tail.ne[1] = m_tail; // only tail rows - dst_tail.ne[1] = m_tail; // only tail rows - - // Offset activation and dst pointers past the HMX-processed rows. - // Use nb[1] (row stride in bytes) to compute the byte offset. - src1_tail.data += (uint32_t) m_hmx * src1->nb[1]; - dst_tail.data += (uint32_t) m_hmx * dst->nb[1]; - - octx->src[1] = &src1_tail; - octx->dst = &dst_tail; - - FARF(HIGH, "hmx-matmul: HVX tail m_tail %d src1 %p dst %p", m_tail, (void *) src1_tail.data, (void *) dst_tail.data); - return op_matmul_hvx(octx); - } - return 0; #endif // HTP_HAS_HMX } diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 35d425a431f..0a45a4daa13 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -66,8 +66,6 @@ set(GGML_OPENCL_KERNELS diag div gelu - gemv_noshuffle_general - gemv_noshuffle get_rows glu group_norm @@ -75,7 +73,6 @@ set(GGML_OPENCL_KERNELS im2col_f32 im2col_f16 mean - mul_mat_Ab_Bi_8x4 mul_mv_f16_f16 mul_mv_f16_f32_1row mul_mv_f16_f32_l4 @@ -120,12 +117,15 @@ set(GGML_OPENCL_KERNELS mul_mm_q4_k_f32_l4_lm mul_mm_q5_k_f32_l4_lm mul_mm_q6_k_f32_l4_lm - mul_mm_q8_0_f32_8x4 + gemv_noshuffle_q4_0_f32 + gemv_noshuffle_q4_0_f32_spec + gemm_noshuffle_q4_0_f32 gemv_noshuffle_q4_1_f32 gemm_noshuffle_q4_1_f32 gemv_noshuffle_iq4_nl_f32 gemm_noshuffle_iq4_nl_f32 - gemv_noshuffle_general_q8_0_f32 + gemv_noshuffle_q8_0_f32 + gemm_noshuffle_q8_0_f32 gemv_noshuffle_q4_k_f32 gemm_noshuffle_q4_k_f32 gemv_noshuffle_q6_k_f32 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 74948c27e4e..8c7bf98c16f 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -731,22 +731,16 @@ struct ggml_backend_opencl_context { cl_kernel kernel_transpose_16_4x1; // Gemm and Gemv related programs, kernels, etc - cl_program program_CL_gemm; - cl_program program_CL_gemv_general; - cl_program program_CL_gemv_4096_1_11008; - cl_program 
program_CL_gemv_4096_1_4096; - cl_program program_CL_gemv_11008_1_4096; - cl_program program_CL_gemv_32000_1_4096; - cl_kernel CL_mul_mat_Ab_Bi_8x4; - cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general; - cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008; - cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096; - cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; - cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; + cl_kernel kernel_gemm_noshuffle_q4_0_f32; + cl_kernel kernel_gemv_noshuffle_q4_0_f32; + cl_kernel kernel_gemv_noshuffle_q4_0_f32_4096_1_11008; + cl_kernel kernel_gemv_noshuffle_q4_0_f32_4096_1_4096; + cl_kernel kernel_gemv_noshuffle_q4_0_f32_11008_1_4096; + cl_kernel kernel_gemv_noshuffle_q4_0_f32_32000_1_4096; cl_kernel kernel_gemv_noshuffle_q4_1_f32; cl_kernel kernel_gemm_noshuffle_q4_1_f32; - cl_kernel kernel_mul_mm_q8_0_f32_8x4; - cl_kernel CL_mul_mat_vec_q8_0_f32; + cl_kernel kernel_gemm_noshuffle_q8_0_f32; + cl_kernel kernel_gemv_noshuffle_q8_0_f32; cl_kernel kernel_gemv_noshuffle_q4_k_f32; cl_kernel kernel_gemm_noshuffle_q4_k_f32; cl_kernel kernel_gemv_noshuffle_q6_K_f32; @@ -2578,21 +2572,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); if (backend_ctx->has_vector_subgroup_broadcast) { - CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST "; } #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src_CL_gemv_general { - #include "gemv_noshuffle_general.cl.h" + #include "gemv_noshuffle_q4_0_f32.cl.h" }; #else - const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl"); + const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_q4_0_f32.cl"); #endif - backend_ctx->program_CL_gemv_general = build_program_from_source( + cl_program prog = build_program_from_source( backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts); - CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); } @@ -2606,20 +2601,21 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); if (backend_ctx->has_vector_subgroup_broadcast) { - CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST "; } #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src_CL_gemv { - #include "gemv_noshuffle.cl.h" + #include "gemv_noshuffle_q4_0_f32_spec.cl.h" }; #else - const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl"); + const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle_q4_0_f32_spec.cl"); #endif - backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source( + cl_program prog = build_program_from_source( backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); - CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_0_f32_4096_1_4096 = 
clCreateKernel(prog, "kernel_gemv_noshuffle_q4_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); // Gemv 2048, 16384 @@ -2630,12 +2626,13 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); if (backend_ctx->has_vector_subgroup_broadcast) { - CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST "; } - backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source( + prog = build_program_from_source( backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); - CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_0_f32_4096_1_11008 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); // Gemv 5504, 44032 @@ -2646,12 +2643,13 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); if (backend_ctx->has_vector_subgroup_broadcast) { - CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST "; } - backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source( + prog = build_program_from_source( backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); - CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_0_f32_11008_1_4096 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); // Gemv 16000, 128000 @@ -2663,12 +2661,13 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve std::to_string(backend_ctx->adreno_wave_size); if (backend_ctx->has_vector_subgroup_broadcast) { - CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST "; } - backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source( + prog = build_program_from_source( backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); - CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_0_f32_32000_1_4096 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); } @@ -2676,13 +2675,14 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve { #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src_CL_gemm { - #include "mul_mat_Ab_Bi_8x4.cl.h" + #include "gemm_noshuffle_q4_0_f32.cl.h" }; #else - const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl"); + const std::string kernel_src_CL_gemm = read_file("gemm_noshuffle_q4_0_f32.cl"); #endif - backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts); - CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = 
clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err)); + cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q4_0_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q4_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); } @@ -2767,14 +2767,15 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve // mul_mm_q8_0_f32_8x4 { #ifdef GGML_OPENCL_EMBED_KERNELS - const std::string kernel_src_q8_8x4_gemm { - #include "mul_mm_q8_0_f32_8x4.cl.h" + const std::string kernel_src { + #include "gemm_noshuffle_q8_0_f32.cl.h" }; #else - const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl"); + const std::string kernel_src = read_file("gemm_noshuffle_q8_0_f32.cl"); #endif - backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts); - CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mm_q8_0_f32_8x4", &err), err)); + cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q8_0_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q8_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); } @@ -2790,16 +2791,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src_CL_gemv_general { - #include "gemv_noshuffle_general_q8_0_f32.cl.h" + #include "gemv_noshuffle_q8_0_f32.cl.h" }; #else - const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl"); + const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_q8_0_f32.cl"); #endif cl_program prog = build_program_from_source( backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts); - CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q8_0_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q8_0_f32", &err), err)); CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); } @@ -4937,164 +4938,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, // Only do transpose for large, non batched matrix // TODO: use preallocated images instead of sub-buffer then image if (use_adreno_kernels(backend_ctx, tensor)) { - // <----------------------------------------------------------------------------------> // - // start transpose - // <----------------------------------------------------------------------------------> // - int M = tensor->ne[1]; // ne01 - int K = tensor->ne[0]; // ne00 - - //For matrix-vector multiplication kernel, we assume K is a multiple of 32 - GGML_ASSERT(K % 32 == 0); - //For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4 - GGML_ASSERT(M % 4 == 0); - - // transpose is out of place, so we need to allocate transposed buffers - // <----------------------------------------------------------------------------------> // - // use sub_buffer of max buffer size instead - - size_t q_size_bytes = K * M / 8 * sizeof(float); - backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes); 
- - cl_buffer_region region; - region.origin = 0; - region.size = q_size_bytes; - cl_mem qT_d = clCreateSubBuffer( - backend_ctx->prealloc_quant_trans.buffer, - 0, - CL_BUFFER_CREATE_TYPE_REGION, - ®ion, - &err); - CL_CHECK(err); - - bool K_tile_trans = true; - if ((K / 32) % 4 != 0){ - K_tile_trans =false; - } - - size_t d_size_bytes = M * (K / 32) * 2; - backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes); - - region.origin = 0; - region.size = d_size_bytes; - cl_mem dT_d = clCreateSubBuffer( - backend_ctx->prealloc_scales_trans.buffer, - 0, - CL_BUFFER_CREATE_TYPE_REGION, - ®ion, - &err); - CL_CHECK(err); - - // <----------------------------------------------------------------------------------> // - - - // create images from the buffers - // <----------------------------------------------------------------------------------> // - cl_mem q_d_image1D; - cl_mem d_d_image1D; - cl_mem qT_d_image1D; - cl_mem dT_d_image1D; - - cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; - cl_image_desc img_desc_1d; - - memset(&img_desc_1d, 0, sizeof(img_desc_1d)); - img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - img_desc_1d.image_width = M * K / 4 / 4; - img_desc_1d.buffer = extra->q; - q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); - CL_CHECK(err); - - img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; - memset(&img_desc_1d, 0, sizeof(img_desc_1d)); - img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - img_desc_1d.image_width = M * K / 4 / 4; - img_desc_1d.buffer = qT_d; - qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); - CL_CHECK(err); - - memset(&img_desc_1d, 0, sizeof(img_desc_1d)); - if (K_tile_trans) { - img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; - img_desc_1d.image_width = M * K / 32 / 4; - } else { - img_fmt_1d = { CL_R, CL_HALF_FLOAT }; - img_desc_1d.image_width = M * K / 32; - } - img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - img_desc_1d.buffer = extra->d; - d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); - CL_CHECK(err); - - img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; - memset(&img_desc_1d, 0, sizeof(img_desc_1d)); - img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - img_desc_1d.image_width = M * K / 32 / 4; - img_desc_1d.buffer = dT_d; - dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); - CL_CHECK(err); - // <----------------------------------------------------------------------------------> // - - // set up and call the transpose kernels - // <----------------------------------------------------------------------------------> // - // weights - int height_q = M / 4; - int width_q = K / 4 / 4; - kernel = backend_ctx->kernel_transpose_16; - - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q)); - - size_t local_size_q[3] = {4, 16, 1}; - size_t global_size_q[3] = {static_cast(width_q), static_cast(height_q), 1}; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt)); - CL_CHECK(clWaitForEvents(1, &evt)); - - // scales - int height_s = M / 4; - int width_s = K / 32 / 4; - - kernel = backend_ctx->kernel_transpose_16; - if (!K_tile_trans) { - kernel = backend_ctx->kernel_transpose_16_4x1; - width_s = K / 32; - } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D)); 
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s)); - - size_t local_size_s[3] = {4, 16, 1}; - size_t global_size_s[3] = {static_cast(width_s), static_cast(height_s), 1}; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt)); - CL_CHECK(clWaitForEvents(1, &evt)); - // <----------------------------------------------------------------------------------> // + int M = tensor->ne[1]; + int K = tensor->ne[0]; - // copy transposed buffer contents to original buffers - // <----------------------------------------------------------------------------------> // - // weights - CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt)); - CL_CHECK(clWaitForEvents(1, &evt)); + GGML_ASSERT(K % 32 == 0); - // scales - CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt)); - CL_CHECK(clWaitForEvents(1, &evt)); - // <----------------------------------------------------------------------------------> // - - // deallocate transpose buffers - // <----------------------------------------------------------------------------------> // - CL_CHECK(clReleaseMemObject(qT_d)); - CL_CHECK(clReleaseMemObject(dT_d)); - - // deallocate temporary images - CL_CHECK(clReleaseMemObject(q_d_image1D)); - CL_CHECK(clReleaseMemObject(d_d_image1D)); - CL_CHECK(clReleaseMemObject(qT_d_image1D)); - CL_CHECK(clReleaseMemObject(dT_d_image1D)); - // <----------------------------------------------------------------------------------> // - // end transpose - // <----------------------------------------------------------------------------------> // + // Transpose q as ushort + transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M); + // Transpose d as ushort + transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M); } #endif // GGML_OPENCL_USE_ADRENO_KERNELS @@ -5820,8 +5672,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (use_adreno_kernels(backend_ctx, tensor)) { - cl_int err; - cl_kernel kernel; + ggml_cl_buffer buf_trans_q; + ggml_cl_buffer buf_trans_d; + ggml_cl_buffer buf_unpacked; cl_int M = tensor->ne[1]; // ne01 cl_int K = tensor->ne[0]; // ne00 @@ -5833,46 +5686,12 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t); GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size"); - cl_mem buf_trans_q; - cl_mem buf_trans_d; - - CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE, - size_q, NULL, &err), err)); - CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE, - size_d, NULL, &err), err)); - - kernel = backend_ctx->kernel_transpose_16_buf; - - // transpose q back - cl_int stride_k_q = K/4; - size_t local_size_q[3] = {64, 1, 1}; - size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1}; - - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q)); - - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, - global_size_q, local_size_q, 0, NULL, NULL)); - - // transpose scales back - cl_int stride_k_d = K/32; 
- size_t local_size_d[3] = {64, 1, 1}; - size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1}; - - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d)); - - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, - global_size_d, local_size_d, 0, NULL, NULL)); + buf_trans_q.allocate(backend_ctx->context, size_q); + buf_trans_d.allocate(backend_ctx->context, size_d); + buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor)); - // unpack - cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, - ggml_nbytes(tensor), NULL, &err); - CL_CHECK(err); + transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4); + transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32); cl_uchar mask_0F = 0x0F; cl_uchar mask_F0 = 0xF0; @@ -5880,25 +5699,15 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; size_t local_work_size[] = {1, 1, 1}; - kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device)); + cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d.buffer)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_unpacked.buffer)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0)); - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, - global_work_size, local_work_size, 0, NULL, NULL)); - - // read back to host - CL_CHECK(clEnqueueReadBuffer( - queue, data_device, CL_TRUE, offset, - size, data, 0, NULL, NULL)); - - CL_CHECK(clReleaseMemObject(data_device)); - CL_CHECK(clReleaseMemObject(buf_trans_q)); - CL_CHECK(clReleaseMemObject(buf_trans_d)); - + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL)); return; } #endif @@ -10073,6 +9882,235 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten CL_CHECK(clReleaseMemObject(D_sub_buffer)); } +static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; + + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = 
src0->ne[2]; + + const int ne10 = src1->ne[0]; + const int ne12 = src1->ne[2]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + cl_context context = backend_ctx->context; + cl_kernel kernel; + + cl_int err; + cl_image_format img_fmt; + cl_image_desc img_desc; + cl_buffer_region region; + + int M = ne01; + int N = ne1; + int K = ne00; + + if (ne1 == 1) { + cl_mem q_img = nullptr; + cl_mem b_sub_buf = nullptr; + cl_mem b_img = nullptr; + + // image for q + img_fmt = { CL_R, CL_UNSIGNED_INT32}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = M * K / 2 / 4; + img_desc.buffer = extra0_q4_0->q; + CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + // subbuffer for activations + region.origin = offset1; + region.size = K * N * sizeof(float); + CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for activations + img_fmt = {CL_RGBA, CL_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * N / 4; + img_desc.buffer = b_sub_buf; + CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + kernel = backend_ctx->kernel_gemv_noshuffle_q4_0_f32; + if (M == 4096 && K == 4096) { + kernel = backend_ctx->kernel_gemv_noshuffle_q4_0_f32_4096_1_4096; + } else if (M == 4096 && K == 11008) { + kernel = backend_ctx->kernel_gemv_noshuffle_q4_0_f32_4096_1_11008; + } else if (M == 11008 && K == 4096) { + kernel = backend_ctx->kernel_gemv_noshuffle_q4_0_f32_11008_1_4096; + } else if (M == 32000 && K == 4096) { + kernel = backend_ctx->kernel_gemv_noshuffle_q4_0_f32_32000_1_4096; + } + + int r2 = 1; + int r3 = 1; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); + + size_t local_work_size[3] = {64, 4, 1}; + size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + CL_CHECK(clReleaseMemObject(q_img)); + CL_CHECK(clReleaseMemObject(b_sub_buf)); + CL_CHECK(clReleaseMemObject(b_img)); + } else { + cl_mem b_sub_buf = nullptr; + cl_mem b_sub_buf_trans = nullptr; + cl_mem b_img = nullptr; + cl_mem b_img_trans = nullptr; + cl_mem d_sub_buf = nullptr; + + // subbuffer for activations + region.origin = offset1; + region.size = K * N * sizeof(float); + CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), 
err)); + + // image for activations + img_fmt = {CL_RGBA, CL_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * N / 4; + img_desc.buffer = b_sub_buf; + CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + // pad N to multiple of 8 + int extra_elements = N % 8; + int padding = 0; + if (extra_elements > 0){ + padding = 8 - extra_elements; + } + + // subbuffer for transposed activations + region.origin = 0; + region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for transposed activations + img_fmt = {CL_RGBA, CL_HALF_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * (N + padding) / 4; + img_desc.buffer = b_sub_buf_trans; + CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err)); + + // subbuffer for output + region.origin = extrad->offset; // Specify the starting offset (in bytes) + region.size = M * N * sizeof(float); // Specify the size of the sub-buffer + CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // transpose activations + int height_B = N/4; + if (height_B == 0) { + height_B = 1; + } + int width_B = K/4; + int padded_height_B = (N + padding)/4; + + kernel = backend_ctx->kernel_transpose_32_16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + + size_t local_work_size_t[2] = { 1, 16 }; + size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; + if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { + local_work_size_t[0]=4; + local_work_size_t[1]=8; + } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { + local_work_size_t[0]=2; + local_work_size_t[1]=8; + } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) { + local_work_size_t[0]=1; + local_work_size_t[1]=8; + } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) { + local_work_size_t[0]=2; + local_work_size_t[1]=8; + } + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + + // gemm + kernel = backend_ctx->kernel_gemm_noshuffle_q4_0_f32; + int padded_N = N + padding; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_sub_buf)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &padded_N)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne1)); + + size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; + size_t local_work_size[3] = {1, 128, 1}; + if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { + local_work_size[0] = 1; + local_work_size[1] = 128; + } else if (ne0 == 11008 && 
ne1 == 128 && ne10 == 4096) { + local_work_size[0] = 2; + local_work_size[1] = 64; + } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) { + local_work_size[0] = 2; + local_work_size[1] = 64; + } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) { + local_work_size[0] = 2; + local_work_size[1] = 64; + } + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + CL_CHECK(clReleaseMemObject(b_sub_buf)); + CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); + CL_CHECK(clReleaseMemObject(b_img)); + CL_CHECK(clReleaseMemObject(b_img_trans)); + CL_CHECK(clReleaseMemObject(d_sub_buf)); + } +#else + GGML_UNUSED(backend); + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); +#endif +} + static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); @@ -10495,7 +10533,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t img_desc.buffer = b_sub_buf; CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); - kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32; + kernel = backend_ctx->kernel_gemv_noshuffle_q8_0_f32; int r2 = 1; int r3 = 1; @@ -10585,7 +10623,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm - kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4; + kernel = backend_ctx->kernel_gemm_noshuffle_q8_0_f32; int padded_N = N + padding; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); @@ -11195,8 +11233,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co GGML_ASSERT(dst); GGML_ASSERT(dst->extra); - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type src0t = src0->type; + const enum ggml_type src1t = src1->type; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -11219,28 +11257,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra; #endif - const int ne00 = src0 ? src0->ne[0] : 0; - const int ne01 = src0 ? src0->ne[1] : 0; - const int ne02 = src0 ? src0->ne[2] : 0; - const int ne03 = src0 ? src0->ne[3] : 0; - - const cl_ulong nb00 = src0 ? src0->nb[0] : 0; - const cl_ulong nb01 = src0 ? src0->nb[1] : 0; - const cl_ulong nb02 = src0 ? src0->nb[2] : 0; - const cl_ulong nb03 = src0 ? src0->nb[3] : 0; - - const int ne10 = src1 ? src1->ne[0] : 0; - const int ne11 = src1 ? src1->ne[1] : 0; - const int ne12 = src1 ? src1->ne[2] : 0; - const int ne13 = src1 ? src1->ne[3] : 0; - - const cl_ulong nb10 = src1 ? src1->nb[0] : 0; - const cl_ulong nb11 = src1 ? src1->nb[1] : 0; - const cl_ulong nb12 = src1 ? src1->nb[2] : 0; - const cl_ulong nb13 = src1 ? src1->nb[3] : 0; - - const int ne0 = dst ? dst->ne[0] : 0; - const int ne1 = dst ? 
dst->ne[1] : 0; + GGML_TENSOR_LOCALS(int, ne0, src0, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb); + GGML_TENSOR_LOCALS(int, ne1, src1, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb); + GGML_TENSOR_LOCALS(int, ne, dst, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb); int r2 = ne12/ne02; int r3 = ne13/ne03; @@ -11256,8 +11278,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co cl_kernel kernel; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS - cl_context context = backend_ctx->context; - if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){ if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0 && // dst is wrapped with image1d_buffer, the size limit applies, also src0 @@ -11284,340 +11304,52 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) { + // NOTE: Kernels using image1d_buffer_t (e.g., src0_q) would normally require + // a limit check, but q4_0 / q4_1 tensors are very unlikely to exceed that + // limit, so the check is omitted. - // init CL objects - // <--------------------------------------------> // - cl_int status; - cl_image_format img_fmt_1d; - cl_image_desc img_desc_1d; - cl_buffer_region region; - cl_mem A_image1d = nullptr; - cl_mem B_image1d = nullptr; - cl_mem B_sub_buffer = nullptr; - cl_mem C_d = nullptr; - // for B transpose - cl_mem B_d = nullptr; - cl_mem B_d_input_image = nullptr; - // <--------------------------------------------> // - - // define matrix dimensions - // <--------------------------------------------> // - int M = ne01; - int N = ne1; - int K = ne00; - int padding; - // <--------------------------------------------> // - - // NOTE: Kernels using image1d_buffer_t (e.g., src0_q) would normally require - // a limit check, but q4_0 / q4_1 tensors are very unlikely to exceed that - // limit, so the check is omitted. 
+ // q4_0 x fp32 + if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q4_0_f32_adreno(backend, src0, src1, dst); + return; + } - // q4_1 x fp32 - if (src0t == GGML_TYPE_Q4_1 && src1t == GGML_TYPE_F32) { + // q4_1 x fp32 + if (src0t == GGML_TYPE_Q4_1 && src1t == GGML_TYPE_F32) { ggml_cl_mul_mat_q4_1_f32_adreno(backend, src0, src1, dst); return; - } - - // iq4_nl x fp32 - if (src0t == GGML_TYPE_IQ4_NL && src1t == GGML_TYPE_F32) { - ggml_cl_mul_mat_iq4_nl_f32_adreno(backend, src0, src1, dst); - return; - } + } - // q8_0 x fp32 - if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 && - enable_adreno_trans_weight(backend_ctx, src0)) { - ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst); + // iq4_nl x fp32 + if (src0t == GGML_TYPE_IQ4_NL && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_iq4_nl_f32_adreno(backend, src0, src1, dst); return; - } + } + + // q8_0 x fp32 + if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 && + enable_adreno_trans_weight(backend_ctx, src0)) { + ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst); + return; + } - // q4_k x fp32 - if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) { + // q4_k x fp32 + if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) { ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst); return; - } - - // q6_K x fp32 - if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) { - ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst); - return; - } - - // q5_K x fp32 - if (src0t == GGML_TYPE_Q5_K && src1t == GGML_TYPE_F32) { - ggml_cl_mul_mat_q5_K_f32_adreno(backend, src0, src1, dst); - return; - } - - // q4_0 x fp32 - if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) { - // TODO: remove duplicate definitions of image description + format -- move to top - - // create an image for A - // <--------------------------------------------> // - if (N == 1) { - img_fmt_1d = { CL_R, CL_UNSIGNED_INT32}; - } else { - img_fmt_1d = { CL_R, CL_FLOAT}; - } - memset(&img_desc_1d, 0, sizeof(img_desc_1d)); - img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - img_desc_1d.image_width = M * K / 2 / 4; // Divide by 4 for char -> float - img_desc_1d.buffer = extra0_q4_0->q; - A_image1d = clCreateImage( - context, - CL_MEM_READ_ONLY, - &img_fmt_1d, - &img_desc_1d, - NULL, - &status); - CL_CHECK(status); - // <--------------------------------------------> // - - - // create a sub_buffer for B - // <--------------------------------------------> // - region.origin = (extra1->offset); - region.size = K * N * sizeof(float); - B_sub_buffer = clCreateSubBuffer( - extra1->data_device, - 0, - CL_BUFFER_CREATE_TYPE_REGION, - ®ion, - &status); - CL_CHECK(status); - // <--------------------------------------------> // - - // transpose activation for Skyler's gemm - if (N != 1) { - //how many extra elements beyond multiple of 8 - int extra_elements = N % 8; - - //how much padding to add - padding = 0; - if (extra_elements > 0){ - padding = 8 - extra_elements; - } - - // Specify the starting offset (in bytes) - region.origin = 0; - // Specify the size of the sub-buffer (divide by 2 for FP16) - region.size = K * (N + padding) * sizeof(float)/2; - backend_ctx->prealloc_act_trans.allocate(context, region.size); - - B_d = clCreateSubBuffer( - backend_ctx->prealloc_act_trans.buffer, - 0, - CL_BUFFER_CREATE_TYPE_REGION, - ®ion, - &status); - CL_CHECK(status); - - cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT }; - cl_image_desc image_desc_B_d_input = { - CL_MEM_OBJECT_IMAGE1D_BUFFER, - static_cast(K * N / 4), - 0, 0, 
0, 0, 0, 0, 0, { B_sub_buffer } - }; - B_d_input_image = clCreateImage( - context, - 0, - &image_format_B_d_input, - &image_desc_B_d_input, - NULL, - &status); - CL_CHECK(status); - - cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16) - cl_image_desc image_desc_B_d_output = { - CL_MEM_OBJECT_IMAGE1D_BUFFER, - static_cast(K * (N + padding)/4), - 0, 0, 0, 0, 0, 0, 0, { B_d } - }; - B_image1d = clCreateImage( - context, - 0, - &image_format_B_d_output, - &image_desc_B_d_output, - NULL, - &status); - CL_CHECK(status); - - int height_B = N/4; - if (height_B == 0) { - height_B = 1; - } - int width_B = K/4; - int padded_height_B = (N + padding)/4; - - kernel = backend_ctx->kernel_transpose_32_16; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); - - size_t local_size_t[2] = { 1, 16 }; - //WGS tuning - if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { - local_size_t[0]=4; - local_size_t[1]=8; - } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { - local_size_t[0]=2; - local_size_t[1]=8; - } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) { - local_size_t[0]=1; - local_size_t[1]=8; - } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) { - local_size_t[0]=2; - local_size_t[1]=8; - } - - size_t global_size_t[2] = { - static_cast(width_B), - static_cast(padded_height_B) - }; - - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); - } else { - // no need to transpose B in other cases - // create an image for B from sub_buffer - // <--------------------------------------------> // - img_fmt_1d = {CL_RGBA, CL_FLOAT}; - - memset(&img_desc_1d, 0, sizeof(img_desc_1d)); - img_desc_1d.image_width = K * N / 4; - img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - img_desc_1d.buffer = B_sub_buffer; - B_image1d = clCreateImage( - context, - CL_MEM_READ_ONLY, - &img_fmt_1d, - &img_desc_1d, - NULL, - &status); - CL_CHECK(status); - // <--------------------------------------------> // - } - - // choose gemm or gemv kernel - // <--------------------------------------------> // - if (N == 1) { - kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general; - if (M == 4096 && K == 4096) { - kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096; - } else if (M == 4096 && K == 11008) { - kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008; - } else if (M == 11008 && K == 4096) { - kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; - } else if (M == 32000 && K == 4096) { - kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; - } - } else { - kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4; - } - // <--------------------------------------------> // - - // set kernel args - // <--------------------------------------------> // - cl_uint k_arg = 0; - - if (N == 1) { - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, 
sizeof(cl_ulong), &extrad->offset)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3)); - } else { - region.origin = extrad->offset; // Specify the starting offset (in bytes) - region.size = M * N * sizeof(float); // Specify the size of the sub-buffer - C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); - CL_CHECK(status); - - int padded_N = ne1 + padding; - - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); //A_q_dextra0_q4_0->q - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); //A_s_d - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d)); //B_d - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d)); //C_d - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01)); //M - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &padded_N)); //N with padding - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); //K - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne1)); //N without padding - } - // <--------------------------------------------> // - - // choose workgroup size - // <--------------------------------------------> // - size_t global_work_size[3] = { - 64, static_cast((M+63)/64), static_cast((N+31)/32)}; - size_t local_work_size[3] = {64, 2, 4}; - - global_work_size[0] = (size_t)(ceil((float)ne1/8)); - global_work_size[1] = (size_t)(ne01/4); - global_work_size[2] = (size_t)(1); - - local_work_size[0] = (size_t)(1); //4x32 for FP32 - local_work_size[1] = (size_t)(128); - local_work_size[2] = (size_t)(1); - - //WGS tuning - if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { - local_work_size[0] = 1; - local_work_size[1] = 128; - } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { - local_work_size[0] = 2; - local_work_size[1] = 64; - } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) { - local_work_size[0] = 2; - local_work_size[1] = 64; - } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) { - local_work_size[0] = 2; - local_work_size[1] = 64; } - if (N == 1) { - size_t wavesize = backend_ctx->adreno_wave_size; - local_work_size[0] = wavesize; // localsize - local_work_size[1] = 4; // reduce factor - local_work_size[2] = 1; - - global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize; - global_work_size[1] = 4; // reduce factor - global_work_size[2] = 1; + // q6_K x fp32 + if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst); + return; } - // <--------------------------------------------> // - - // enqueue kernel with profiling - // <--------------------------------------------> // - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); - // <--------------------------------------------> // - - // deallocate sub buffers and images - // <--------------------------------------------> // - CL_CHECK(clReleaseMemObject(A_image1d)); - CL_CHECK(clReleaseMemObject(B_sub_buffer)); - CL_CHECK(clReleaseMemObject(B_image1d)); - if (N != 1) 
{ - CL_CHECK(clReleaseMemObject(B_d)); - CL_CHECK(clReleaseMemObject(B_d_input_image)); - CL_CHECK(clReleaseMemObject(C_d)); + // q5_K x fp32 + if (src0t == GGML_TYPE_Q5_K && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q5_K_f32_adreno(backend, src0, src1, dst); + return; } - // <--------------------------------------------> // - - return; - } } // if (ne01 && ne1) #endif // GGML_OPENCL_USE_ADRENO_KERNELS diff --git a/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl similarity index 99% rename from ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl rename to ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl index ecb577b9933..159378049fb 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +++ b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl @@ -17,7 +17,7 @@ REQD_SUBGROUP_SIZE_128 #endif -kernel void kernel_mul_mat_Ab_Bi_8x4( +kernel void kernel_gemm_noshuffle_q4_0_f32( global const ushort * src0_q, // quantized A global const half * src0_d, // A scales __read_only image1d_buffer_t src1, // B (1d image) diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl similarity index 98% rename from ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl rename to ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl index 51ce2121ce2..7f06a22a2cb 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +++ b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl @@ -11,7 +11,7 @@ REQD_SUBGROUP_SIZE_128 #endif -kernel void kernel_mul_mm_q8_0_f32_8x4( +kernel void kernel_gemm_noshuffle_q8_0_f32( global const uint * src0_q, global const half * src0_d, __read_only image1d_buffer_t src1, diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl similarity index 98% rename from ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl rename to ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl index 469d3edef00..10683206919 100644 --- a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl @@ -191,7 +191,7 @@ #ifdef ADRENO_GPU REQD_SUBGROUP_SIZE_64 #endif -__kernel void kernel_gemv_noshuffle( +__kernel void kernel_gemv_noshuffle_q4_0_f32( __read_only image1d_buffer_t src0_q, // quantized A global half2 * src0_d, // A scales __read_only image1d_buffer_t src1, // B @@ -238,21 +238,21 @@ __kernel void kernel_gemv_noshuffle( regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x; regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; -#ifdef VECTOR_SUB_GROUP_BROADCAT +#ifdef VECTOR_SUB_GROUP_BROADCAST dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB); #else dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB); -#endif // VECTOR_SUB_GROUP_BROADCAT +#endif // VECTOR_SUB_GROUP_BROADCAST regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; -#ifdef VECTOR_SUB_GROUP_BROADCAT +#ifdef VECTOR_SUB_GROUP_BROADCAST dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, 
as_ushort8(regA), regS, regB); #else dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB); -#endif // VECTOR_SUB_GROUP_BROADCAT +#endif // VECTOR_SUB_GROUP_BROADCAST } // reduction in local memory, assumes #wave=4 diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl similarity index 98% rename from ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl rename to ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl index ee5c79f000d..571a375da7f 100644 --- a/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl @@ -191,7 +191,7 @@ #ifdef ADRENO_GPU REQD_SUBGROUP_SIZE_64 #endif -__kernel void kernel_gemv_noshuffle( +__kernel void kernel_gemv_noshuffle_q4_0_f32( __read_only image1d_buffer_t src0_q, // quantized A global half2 * src0_d, // A scales __read_only image1d_buffer_t src1, // B @@ -232,21 +232,21 @@ __kernel void kernel_gemv_noshuffle( regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x; regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; -#ifdef VECTOR_SUB_GROUP_BROADCAT +#ifdef VECTOR_SUB_GROUP_BROADCAST dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB); #else dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB); -#endif // VECTOR_SUB_GROUP_BROADCAT +#endif // VECTOR_SUB_GROUP_BROADCAST regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; -#ifdef VECTOR_SUB_GROUP_BROADCAT +#ifdef VECTOR_SUB_GROUP_BROADCAST dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB); #else dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB); -#endif // VECTOR_SUB_GROUP_BROADCAT +#endif // VECTOR_SUB_GROUP_BROADCAST } // reduction in local memory, assumes #wave=4 diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl similarity index 100% rename from ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl rename to ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl
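
A minimal standalone sketch of the row-padding arithmetic the HMX path above relies on, for reference only. It assumes HMX_FP16_TILE_N_ROWS is 32 (consistent with the m_total & ~31 masking and the "m_total % 32" comment in op_matmul); align_up is a hypothetical stand-in for hex_align_up, whose definition is not part of this diff.

#include <assert.h>
#include <stdio.h>

// Hypothetical stand-in for hex_align_up(): round x up to the next multiple
// of a, where a is a power of two.
static int align_up(int x, int a) {
    return (x + a - 1) & ~(a - 1);
}

int main(void) {
    const int tile_rows = 32; // assumed value of HMX_FP16_TILE_N_ROWS
    const int m_values[] = { 1, 31, 32, 33, 100, 128 };

    for (size_t i = 0; i < sizeof(m_values) / sizeof(m_values[0]); ++i) {
        int m_total = m_values[i];
        int m_hmx   = m_total & ~31;                 // rows HMX handled before (old split point)
        int m_pad   = align_up(m_total, tile_rows);  // rows budgeted in VTCM after padding

        // With the change above, HMX covers all m_total rows; the last partial
        // tile (m_total % 32 rows) is zero-filled up to m_pad inside the
        // activation transfer instead of being handed to a separate HVX tail pass.
        printf("M=%3d  old HMX rows=%3d  padded rows=%3d  tail=%2d\n",
               m_total, m_hmx, m_pad, m_total % tile_rows);
        assert(m_pad >= m_total && m_pad % tile_rows == 0);
    }
    return 0;
}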