diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf8.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf8.cpp
index a95c0346cf..1520f2c591 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf8.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf8.cpp
@@ -4,7 +4,7 @@
 #include "run_gemm_quant_example.inc"
 
 template
-using GemmConfig = GemmConfigQuantPrefill;
+using GemmConfig = GemmConfigQuantDecode;
 
 #define RUN_GEMM_EXAMPLE_PREC_TYPE \
     run_gemm_example_prec_type, \
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf8i4.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf8i4.cpp
index d2b95d3263..a93fe15a1b 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf8i4.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf8i4.cpp
@@ -4,7 +4,7 @@
 #include "run_gemm_quant_example.inc"
 
 template
-using GemmConfig = GemmConfigQuantPrefill;
+using GemmConfig = GemmConfigQuantDecode;
 
 #define RUN_GEMM_EXAMPLE_PREC_TYPE \
     run_gemm_example_prec_type, \
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_fp8.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_fp8.cpp
index a8c13c1b3d..39747ff0bc 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_fp8.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_fp8.cpp
@@ -4,7 +4,7 @@
 #include "run_gemm_quant_example.inc"
 
 template
-using GemmConfig = GemmConfigQuantPrefill;
+using GemmConfig = GemmConfigQuantDecode;
 
 #define RUN_GEMM_EXAMPLE_PREC_TYPE \
     run_gemm_example_prec_type, \
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_fp8i4.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_fp8i4.cpp
index 6576b22c03..ed18cd8890 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_fp8i4.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_fp8i4.cpp
@@ -4,7 +4,7 @@
 #include "run_gemm_quant_example.inc"
 
 template
-using GemmConfig = GemmConfigQuantPrefill;
+using GemmConfig = GemmConfigQuantDecode;
 
 #define RUN_GEMM_EXAMPLE_PREC_TYPE \
     run_gemm_example_prec_type, \
diff --git a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
index 540d5725dd..508f3ac8ec 100644
--- a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
+++ b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
@@ -215,11 +215,8 @@ float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::str
     const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
     const dim3 blocks = Kernel::BlockSize();
 
-    if(args.k_batch != 1)
-    {
-        throw std::runtime_error("split-k is not supported yet!");
-    }
-
+    // Split-K validation is handled by Kernel::IsSupportedArgument
+    // Split-K is only supported for BQuantGrouped without preshuffle
     if(!Kernel::IsSupportedArgument(kargs))
     {
         throw std::runtime_error("Wrong! Arguments not supported! "
                                  "Skipping gemm!\n");
@@ -661,182 +658,6 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
             }
         }
     }
-    else if(init_method == 3)
-    {
-        if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
-        {
-            ck_tile::FillConstant{static_cast(0x38)}(a_m_k);
-            ck_tile::FillConstant{static_cast(0x22)}(b_k_n);
-            ck_tile::FillConstant{static_cast(0.5f)}(*bq_tensor_ptr);
-        }
-        else if constexpr(QuantMode == ck_tile::QuantType::ABQuantGrouped)
-        {
-            ck_tile::FillConstant{static_cast(0x38)}(a_m_k);
-            ck_tile::FillConstant{static_cast(0x22)}(b_k_n);
-            ck_tile::FillConstant{static_cast(0.5f)}(*aq_tensor_ptr);
-            ck_tile::FillConstant{static_cast(0.5f)}(*bq_tensor_ptr);
-        }
-        else
-        {
-            ck_tile::FillConstant{static_cast(0x22)}(a_m_k);
-            ck_tile::FillConstant{static_cast(2.0f)}(*aq_tensor_ptr);
-            ck_tile::FillConstant{static_cast(0x38)}(b_k_n);
-
-            if constexpr(QuantMode == ck_tile::QuantType::RowColQuant)
-            {
-                ck_tile::FillConstant{static_cast(0.5f)}(*bq_tensor_ptr);
-            }
-        }
-    }
-    else if(init_method == 4)
-    {
-        if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
-        {
-            if constexpr(std::is_same_v)
-            {
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-                ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*bq_tensor_ptr);
-            }
-            else if constexpr(std::is_same_v)
-            {
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-                ck_tile::FillUniformDistribution{125.f, 130.f, fill_seed(gen)}(*bq_tensor_ptr);
-            }
-            else
-            {
-                ck_tile::FillUniformDistribution{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
-                ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*bq_tensor_ptr);
-            }
-
-            ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
-        }
-        else if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
-        {
-            if constexpr(std::is_same_v)
-            {
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
-            }
-            else
-            {
-                ck_tile::FillUniformDistribution{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
-            }
-            ck_tile::FillUniformDistribution{2.0f, 2.0f, fill_seed(gen)}(*aq_tensor_ptr);
-            ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-        }
-        else if constexpr(QuantMode == ck_tile::QuantType::ABQuantGrouped)
-        {
-            if constexpr(std::is_same_v ||
-                         std::is_same_v)
-            {
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-            }
-            else
-            {
-                ck_tile::FillUniformDistribution{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
-                ck_tile::FillUniformDistribution{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
-            }
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*aq_tensor_ptr);
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*bq_tensor_ptr);
-        }
-        else
-        {
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(a_m_k);
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(b_k_n);
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*aq_tensor_ptr);
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*bq_tensor_ptr);
-        }
-    }
-    else if(init_method == 5)
-    {
-        if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
-        {
-            if constexpr(std::is_same_v)
-            {
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-                ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*bq_tensor_ptr);
-            }
-            else if constexpr(std::is_same_v)
-            {
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-                ck_tile::FillUniformDistribution{125.f, 130.f, fill_seed(gen)}(*bq_tensor_ptr);
-            }
-            else
-            {
-                ck_tile::FillUniformDistribution{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
-                ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*bq_tensor_ptr);
-            }
-
-            ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
-        }
-        else if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
-        {
-            if constexpr(std::is_same_v)
-            {
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
-            }
-            else
-            {
-                ck_tile::FillUniformDistribution{1.0f, 1.0f, fill_seed(gen)}(a_m_k);
-            }
-            // Fill aquant such that column j has value 2^j (1, 2, 4, 8, ...)
-            for(ck_tile::index_t row = 0;
-                row < static_cast(aq_tensor_ptr->get_length(0));
-                ++row)
-            {
-                for(ck_tile::index_t col = 0;
-                    col < static_cast(aq_tensor_ptr->get_length(1));
-                    ++col)
-                {
-                    (*aq_tensor_ptr)(row, col) = static_cast(col + 1);
-                }
-            }
-            // std::cout << "aq_tensor_ptr: " << *aq_tensor_ptr << std::endl;
-            ck_tile::FillUniformDistribution{1.0f, 1.0f, fill_seed(gen)}(b_k_n);
-        }
-        else if constexpr(QuantMode == ck_tile::QuantType::ABQuantGrouped)
-        {
-            if constexpr(std::is_same_v ||
-                         std::is_same_v)
-            {
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
-                ck_tile::FillUniformDistribution{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-            }
-            else
-            {
-                ck_tile::FillUniformDistribution{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
-                ck_tile::FillUniformDistribution{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
-            }
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*aq_tensor_ptr);
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*bq_tensor_ptr);
-        }
-        else
-        {
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(a_m_k);
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(b_k_n);
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*aq_tensor_ptr);
-            ck_tile::FillUniformDistribution{-2.0f, 2.0f, fill_seed(gen)}(*bq_tensor_ptr);
-        }
-    }
     else
     {
         a_m_k.SetZero();
diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
index 21bd691b49..db86fdbeac 100644
--- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
@@ -380,9 +380,18 @@ struct QuantGemmKernel
         __device__ SplitKBatchOffset(const QuantGemmKernelArgs& kargs,
                                      const std::size_t k_id = blockIdx.z)
         {
-            constexpr auto K1    = GemmPipeline::BlockGemmShape::WarpTile::at(I2);
-            const index_t K_t    = amd_wave_read_first_lane(kargs.k_batch * K1);
-            const index_t KRead  = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
+            constexpr auto K1 =
+                GemmPipeline::BlockGemmShape::WarpTile::at(I2); // smallest unit of K work per block
+            const index_t K_t = amd_wave_read_first_lane(
+                kargs.k_batch * K1); // amount of K elements consumed if every split-K batch
+                                     // performs exactly one "unit" (K1)
+            const index_t KRead = amd_wave_read_first_lane(
+                (kargs.K + K_t - 1) / K_t * K1); // total K elements to be read in this batch
+            // offset not necessarily = KRead, because B can have packed elements (e.g. fp8i4)
+            constexpr index_t BPackedSize =
+                ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+            const index_t b_k_offset_elements =
+                amd_wave_read_first_lane(k_id * KRead / BPackedSize);
 
             if constexpr(std::is_same_v)
             {
@@ -395,11 +404,11 @@ struct QuantGemmKernel
 
             if constexpr(std::is_same_v)
             {
-                b_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_B);
+                b_k_split_offset = amd_wave_read_first_lane(b_k_offset_elements * kargs.stride_B);
             }
             else if constexpr(std::is_same_v)
             {
-                b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+                b_k_split_offset = amd_wave_read_first_lane(b_k_offset_elements);
             }
 
             if(k_id < static_cast(kargs.k_batch - 1))
@@ -410,10 +419,47 @@ struct QuantGemmKernel
             {
                 splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
             }
+
+            // Compute BQ offset for BQuantGrouped mode (non-preshuffle only)
+            // Note: With the alignment validation in IsSupportedArgument, KRead is always
+            // a multiple of BQuantGroupSize::kK, so bq_k_split_offset will be correctly aligned.
+            if constexpr(kQuantType == QuantType::BQuantGrouped && !BPreshuffleQuant)
+            {
+                using BQuantGroupSize = remove_cvref_t;
+                // Compute the K offset for this batch (in terms of K elements)
+                const index_t k_offset = amd_wave_read_first_lane(k_id * KRead);
+                // Convert K offset to BQ group offset (logical offset in K/kK dimension)
+                bq_group_offset = amd_wave_read_first_lane(k_offset / BQuantGroupSize::kK);
+
+                // BQ tensor layout:
+                //   RowMajor:    [K/kK, N/kN] with stride [N/kN, 1]
+                //   ColumnMajor: [N/kN, K/kK] with stride [K/kK, 1]
+                if constexpr(std::is_same_v)
+                {
+                    // For RowMajor BQ, K is the row dimension
+                    // offset = bq_group_offset * stride_BQ
+                    const index_t stride_bq =
+                        amd_wave_read_first_lane(integer_divide_ceil(kargs.N, BQuantGroupSize::kN));
+                    bq_k_split_offset = amd_wave_read_first_lane(bq_group_offset * stride_bq);
+                }
+                else if constexpr(std::is_same_v)
+                {
+                    // For ColumnMajor BQ, K is the column dimension
+                    // offset = bq_group_offset
+                    bq_k_split_offset = amd_wave_read_first_lane(bq_group_offset);
+                }
+            }
+            else
+            {
+                bq_group_offset   = 0;
+                bq_k_split_offset = 0;
+            }
         }
 
         index_t a_k_split_offset;
         index_t b_k_split_offset;
+        index_t bq_group_offset;   // Logical offset in K-groups (K/kK dimension)
+        index_t bq_k_split_offset; // Memory pointer offset (accounting for layout/stride)
         index_t splitted_k;
     };
@@ -805,10 +851,13 @@ struct QuantGemmKernel
     CK_TILE_DEVICE static auto MakeBQBlockWindow(const BQDataType* bq_ptr,
                                                  const QuantGemmKernelArgs& kargs,
+                                                 const index_t bq_group_offset,
                                                  const index_t i_m,
                                                  const index_t i_n)
     {
         // Step 1: Create tensor view for BQ
+        // Note: For split-K, the bq_ptr is already offset by bq_k_split_offset (pointer offset).
+        // The dimension should use the remaining K-groups from this offset position.
         const auto& bq_tensor_view = [&]() {
             if constexpr(kQuantType == QuantType::RowColQuant)
             {
@@ -850,11 +899,12 @@ struct QuantGemmKernel
                               "ABQuantGrouped requires ColumnMajor BQ layout");
             }
 
+            using BQuantGroupSize = remove_cvref_t;
             if constexpr(std::is_same_v)
             {
                 return make_naive_tensor_view(
                     bq_ptr,
-                    make_tuple(integer_divide_ceil(kargs.K, BQuantGroupSize::kK),
+                    make_tuple(kargs.QK_B - bq_group_offset,
                                integer_divide_ceil(kargs.N, BQuantGroupSize::kN)),
                     make_tuple(integer_divide_ceil(kargs.N, BQuantGroupSize::kN), 1),
                     number{},
                     number<1>{});
             }
@@ -865,8 +915,8 @@ struct QuantGemmKernel
                 return make_naive_tensor_view(
                     bq_ptr,
                     make_tuple(integer_divide_ceil(kargs.N, BQuantGroupSize::kN),
-                               integer_divide_ceil(kargs.K, BQuantGroupSize::kK)),
+                               kargs.QK_B - bq_group_offset),
-                    make_tuple(integer_divide_ceil(kargs.K, BQuantGroupSize::kK), 1),
+                    make_tuple(kargs.QK_B, 1),
                     number{},
                     number<1>{});
             }
@@ -1047,13 +1097,61 @@ struct QuantGemmKernel
 
     CK_TILE_HOST static bool IsSupportedArgument(const QuantGemmKernelArgs& kargs)
     {
+        // Split-K is supported for BQuantGrouped mode without preshuffle
         if(kargs.k_batch != 1)
        {
-            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            constexpr bool is_bquant_non_preshuffle =
+                (kQuantType == QuantType::BQuantGrouped) && !BPreshuffleQuant;
+            if constexpr(!is_bquant_non_preshuffle)
             {
-                CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("Conditions not met for Kbatch >1 ! "
+                                  "Split-K only supported for BQuantGrouped without preshuffle.");
+                }
+                return false;
+            }
+            else
+            {
+                using BQuantGroupSize = remove_cvref_t;
+                constexpr auto K1     = GemmPipeline::BlockGemmShape::WarpTile::at(I2);
+                const index_t K_t     = kargs.k_batch * K1;
+                const index_t KRead   = (kargs.K + K_t - 1) / K_t * K1;
+                constexpr index_t BPackedSize =
+                    ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
+                // Constraint 1: KRead must align with B packing requirements.
+                // For packed data types, multiple K elements are stored in each storage unit.
+                // Split-K advances the B pointer by (KRead / BPackedSize) storage units per batch.
+                // If KRead is not divisible by BPackedSize, this division produces a fractional
+                // offset, making it impossible to start reading from a valid storage unit boundary.
+                if(KRead % BPackedSize != 0)
+                {
+                    if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                    {
+                        CK_TILE_ERROR("KRead must be a multiple of B packed size for split-K!");
+                    }
+                    return false;
+                }
+
+                // Constraint 2: KRead must align with quantization group boundaries.
+                // Each split-K batch reads KRead consecutive K elements. If KRead is not
+                // a multiple of BQuantGroupSize::kK, the batch will span partial quantization
+                // groups, requiring split access to a quantization scale. This violates the
+                // atomic processing requirement where each batch must work with complete groups.
+                if(KRead % BQuantGroupSize::kK != 0)
+                {
+                    if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                    {
+                        CK_TILE_ERROR("Split-K batch size must be aligned with quantization group "
+                                      "size! KRead=" + std::to_string(KRead) +
+                                      " is not divisible by BQuantGroupSize::kK=" +
+                                      std::to_string(BQuantGroupSize::kK));
+                    }
+                    return false;
+                }
             }
-            return false;
         }
 
         if constexpr(std::is_same_v)
@@ -1215,7 +1313,10 @@ struct QuantGemmKernel
         const auto& b_block_window =
             MakeBBlockWindow(b_ptr, kargs, splitk_batch_offset.splitted_k, block_idx_n);
         const auto& aq_block_window = MakeAQBlockWindow(aq_ptr, kargs, block_idx_m, block_idx_n);
-        const auto& bq_block_window = MakeBQBlockWindow(bq_ptr, kargs, block_idx_m, block_idx_n);
+        // Note: Pass bq_group_offset so the tensor view dimension reflects
+        // the remaining K-groups from the split-K offset position.
+        const auto& bq_block_window = MakeBQBlockWindow(
+            bq_ptr, kargs, splitk_batch_offset.bq_group_offset, block_idx_m, block_idx_n);
 
         const index_t num_loop =
             amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
@@ -1343,8 +1444,9 @@ struct QuantGemmKernel
         const BDataType* b_ptr =
             static_cast<const BDataType*>(kargs.b_ptr) + splitk_batch_offset.b_k_split_offset;
         const AQDataType* aq_ptr = static_cast<const AQDataType*>(kargs.aq_ptr);
-        const BQDataType* bq_ptr = static_cast<const BQDataType*>(kargs.bq_ptr);
-        CDataType* c_ptr         = static_cast<CDataType*>(kargs.c_ptr);
+        const BQDataType* bq_ptr =
+            static_cast<const BQDataType*>(kargs.bq_ptr) + splitk_batch_offset.bq_k_split_offset;
+        CDataType* c_ptr = static_cast<CDataType*>(kargs.c_ptr);
 
         // allocate LDS
         __shared__ char smem_ptr[GetSmemSize()];
diff --git a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp
index c9e725f5fd..8b77b01e2f 100644
--- a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp
@@ -387,8 +387,8 @@ struct QuantGroupedGemmKernel
             Base::MakeABlockWindow(a_ptr, kargs, splitk_batch_offset.splitted_k, block_idx_m);
         const auto& b_block_window =
             Base::MakeBBlockWindow(b_ptr, kargs, splitk_batch_offset.splitted_k, block_idx_n);
-        const auto& bq_block_window =
-            Base::MakeBQBlockWindow(bq_ptr, kargs, block_idx_m, block_idx_n);
+        const auto& bq_block_window = Base::MakeBQBlockWindow(
+            bq_ptr, kargs, splitk_batch_offset.bq_group_offset, block_idx_m, block_idx_n);
 
         const index_t num_loop = __builtin_amdgcn_readfirstlane(
             TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
@@ -453,8 +453,8 @@ struct QuantGroupedGemmKernel
             Base::MakeBBlockWindow(b_ptr, kargs, splitk_batch_offset.splitted_k, block_idx_n);
         const auto& aq_block_window =
             Base::MakeAQBlockWindow(aq_ptr, kargs, block_idx_m, block_idx_n);
-        const auto& bq_block_window =
-            Base::MakeBQBlockWindow(bq_ptr, kargs, block_idx_m, block_idx_n);
+        const auto& bq_block_window = Base::MakeBQBlockWindow(
+            bq_ptr, kargs, splitk_batch_offset.bq_group_offset, block_idx_m, block_idx_n);
 
         // Get hot-loop and tail configuration
         const index_t num_loop = __builtin_amdgcn_readfirstlane(
diff --git a/test/ck_tile/gemm_block_scale/CMakeLists.txt b/test/ck_tile/gemm_block_scale/CMakeLists.txt
index 8e005d588e..2b19053f41 100644
--- a/test/ck_tile/gemm_block_scale/CMakeLists.txt
+++ b/test/ck_tile/gemm_block_scale/CMakeLists.txt
@@ -128,6 +128,17 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     )
     target_compile_options(test_tile_gemm_quant_bquant_transpose PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
 
+    # BQuant split-K tests (no preshuffle)
+    add_gtest_executable(test_tile_gemm_quant_bquant_splitk_decode
+        test_gemm_quant_bquant_splitk_decode.cpp
+    )
+    target_compile_options(test_tile_gemm_quant_bquant_splitk_decode PRIVATE
+        ${TEST_GEMM_COMPILE_OPTIONS})
+
+    add_gtest_executable(test_tile_gemm_quant_bquant_splitk_prefill
+        test_gemm_quant_bquant_splitk_prefill.cpp
+    )
+    target_compile_options(test_tile_gemm_quant_bquant_splitk_prefill PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
+
     # BQuant tests (with PreshuffleB) - split into 5 files
     add_gtest_executable(test_tile_gemm_quant_bquant_preshuffle_decode_1d
         test_gemm_quant_bquant_preshuffle_decode_1d.cpp
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp
new file mode 100644
index 0000000000..ea1a8a1fbb
--- /dev/null
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp
@@ -0,0 +1,61 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+#include
+#include
+
+#include "test_gemm_quant_fixtures.hpp"
+
+// Type aliases for readability
+using RowMajor      = ck_tile::tensor_layout::gemm::RowMajor;
+using ColumnMajor   = ck_tile::tensor_layout::gemm::ColumnMajor;
+using FP8           = ck_tile::fp8_t;
+using BF8           = ck_tile::bf8_t;
+using Half          = ck_tile::half_t;
+using PkInt4        = ck_tile::pk_int4_t;
+using BQuantGrouped = std::integral_constant;
+using GroupSize128  = ck_tile::QuantGroupShape>;
+
+// Type combinations for BQuant split-K tests - Decode shape, GroupSize 128
+// Tuple format:
+// clang-format off
+using BQuantSplitKDecodeTypes = ::testing::Types<
+    std::tuple,
+    std::tuple,
+    std::tuple,
+    std::tuple
+>;
+// clang-format on
+
+// Test suite for BQuant split-K Decode
+TYPED_TEST_SUITE(TestCkTileGemmBQuant, BQuantSplitKDecodeTypes);
+
+// BQuant split-K tests
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK2Test)
+{
+    // K=1024 for split_k=2: 1024/2=512=4×128 ✓
+    this->run_test_with_validation(32, 128, 1024, 2);
+}
+
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK3Test)
+{
+    // K=3072 for split_k=3: 3072/3=1024=8×128 ✓
+    this->run_test_with_validation(32, 128, 3072, 3);
+}
+
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK4Test)
+{
+    // K=2048 for split_k=4: 2048/4=512=4×128 ✓
+    this->run_test_with_validation(32, 128, 2048, 4);
+}
+
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK5Test)
+{
+    // K=2560 for split_k=5: 2560/5=512=4×128 ✓
+    // Also K must be divisible by K_Tile(256)*split_k(5)=1280
+    this->run_test_with_validation(32, 128, 2560, 5);
+}
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp
new file mode 100644
index 0000000000..f4f93dbbb6
--- /dev/null
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp
@@ -0,0 +1,64 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+#include
+#include
+
+#include "test_gemm_quant_fixtures.hpp"
+
+// Type aliases for readability
+using RowMajor      = ck_tile::tensor_layout::gemm::RowMajor;
+using ColumnMajor   = ck_tile::tensor_layout::gemm::ColumnMajor;
+using FP8           = ck_tile::fp8_t;
+using BF8           = ck_tile::bf8_t;
+using Half          = ck_tile::half_t;
+using PkInt4        = ck_tile::pk_int4_t;
+using BQuantGrouped = std::integral_constant;
+using GroupSize128  = ck_tile::QuantGroupShape>;
+
+// Type combinations for BQuant split-K tests - Prefill shape, GroupSize 128
+// Tuple format:
+// clang-format off
+using BQuantSplitKPrefillTypes = ::testing::Types<
+    std::tuple,
+    std::tuple,
+    std::tuple,
+    std::tuple
+>;
+// clang-format on
+
+// Test suite for BQuant split-K Prefill
+TYPED_TEST_SUITE(TestCkTileGemmBQuant, BQuantSplitKPrefillTypes);
+
+// BQuant split-K tests
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK2Test)
+{
+    // K=1024 for split_k=2: 1024/2=512=4×128 ✓
+    // K must be divisible by K_Tile(128)*split_k(2)=256
+    this->run_test_with_validation(128, 128, 1024, 2);
+}
+
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK3Test)
+{
+    // K=3072 for split_k=3: 3072/3=1024=8×128 ✓
+    // K must be divisible by K_Tile(128)*split_k(3)=384
+    this->run_test_with_validation(128, 128, 3072, 3);
+}
+
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK4Test)
+{
+    // K=2048 for split_k=4: 2048/4=512=4×128 ✓
+    // K must be divisible by K_Tile(128)*split_k(4)=512
+    this->run_test_with_validation(128, 128, 2048, 4);
+}
+
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK5Test)
+{
+    // K=1920 for split_k=5: 1920/5=384=3×128 ✓
+    // K must be divisible by K_Tile(128)*split_k(5)=640
+    this->run_test_with_validation(128, 128, 1920, 5);
+}
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
index 0033bb42a8..ca21bc69b7 100644
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
@@ -655,7 +655,10 @@ class TestCkTileGemmBQuant : public TestCkTileGemmQuantBase
         b_k_n_dev = b_k_n;
@@ -746,12 +752,12 @@ class TestCkTileGemmBQuant : public TestCkTileGemmQuantBase
         this->template calculate_rtol_atol(
-            K, 1, max_accumulated_value);
+            K, k_batch, max_accumulated_value);
 
         // Validate results
         bool pass = ck_tile::check_err(c_m_n_dev_result,
@@ -806,7 +812,7 @@ class TestCkTileGemmBQuant : public TestCkTileGemmQuantBase
                                       {}));
 
         EXPECT_TRUE(pass) << "BQuantGrouped validation failed with M=" << M << ", N=" << N
-                          << ", K=" << K;
+                          << ", K=" << K << ", k_batch=" << k_batch;
 
         if(!pass)
         {