From 53a200559640171b20390ad314568f0a67137ce2 Mon Sep 17 00:00:00 2001 From: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com> Date: Tue, 14 Apr 2026 00:53:11 -0700 Subject: [PATCH 1/2] add try catch to avoid missing kernel interfere with normal process Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com> --- .../trtllmGenKernels/fmha/fmhaKernels.h | 64 +++++++++++-------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index 93e6a2c18e21..be5aca44dcb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -205,38 +205,48 @@ class TllmGenFmhaKernel return std::make_pair(false, "Empty batch or zero sequence length"); } - // The selectKernelParams that might be updated. - SelectKernelParams selectKernelParams{params}; - - int32_t ctaDim = 512; - FmhaOptions options; - FmhaOptionsFromArgs optionsFromArgs; - parseOptionsFromRunnerParams(params, options); - options.mCudaArch = intToCudaArch(mSM); + try + { - FmhaAutoTuner autoTuner(options, optionsFromArgs, params.mMultiProcessorCount); - std::tie(options, optionsFromArgs, ctaDim) = autoTuner.selectKernel(); - // Check if the options are valid or not. - checkFmhaOptions(options, optionsFromArgs); - // Update the options if needed. - updateFmhaOptions(options, optionsFromArgs); - // The number of CtasQ and CtasKv per sequence, Ctas in the Y dimension, and Ctas in the Z - // dimension. - computeNumCtas(options, params.mMultiProcessorCount); + // The selectKernelParams that might be updated. 
+ SelectKernelParams selectKernelParams{params}; + + int32_t ctaDim = 512; + FmhaOptions options; + FmhaOptionsFromArgs optionsFromArgs; + parseOptionsFromRunnerParams(params, options); + options.mCudaArch = intToCudaArch(mSM); + + FmhaAutoTuner autoTuner(options, optionsFromArgs, params.mMultiProcessorCount); + std::tie(options, optionsFromArgs, ctaDim) = autoTuner.selectKernel(); + // Check if the options are valid or not. + checkFmhaOptions(options, optionsFromArgs); + // Update the options if needed. + updateFmhaOptions(options, optionsFromArgs); + // The number of CtasQ and CtasKv per sequence, Ctas in the Y dimension, and Ctas in the Z + // dimension. + computeNumCtas(options, params.mMultiProcessorCount); + + // Check if a precompiled cubin exists for this configuration (same lookup as run()). + // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main. + algoFilterForCubinPath(options); + auto [hashId, info] = hashFromFmhaOptions(options); - // Check if a precompiled cubin exists for this configuration (same lookup as run()). - // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main. 
- algoFilterForCubinPath(options); - auto [hashId, info] = hashFromFmhaOptions(options); + if (mFunctions.find(hashId) == mFunctions.end()) + { + TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info); + return std::make_pair(false, info); + } + TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str()); - if (mFunctions.find(hashId) == mFunctions.end()) + return std::make_pair(true, info); + } + catch (std::exception const& e) { - TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info); - return std::make_pair(false, info); + std::string const errorInfo = std::string("Exception during TrtllmGen kernel existence check"); + TLLM_LOG_WARNING(errorInfo); + return std::make_pair(false, errorInfo); } - TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str()); - - return std::make_pair(true, info); } void algoFilterForCubinPath(FmhaOptions& options) const From 0f391d027ebb7ca1464091a03ba43aeb8137f4d1 Mon Sep 17 00:00:00 2001 From: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com> Date: Thu, 16 Apr 2026 09:20:57 +0000 Subject: [PATCH 2/2] add NVRTC to checkIfKernelExist and extract shouldUseNvrtc utility function Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com> --- .../trtllmGenKernels/fmha/fmhaKernels.h | 55 ++++++++++++------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index be5aca44dcb0..6912cc11b850 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -190,6 +190,16 @@ class TllmGenFmhaKernel } } + static bool shouldUseNvrtc(FmhaOptions const& options) + { + // Check if the NVRTC path should be used for a given FMHA configuration. 
+ bool isLlama70bFp4Tp4 = options.mHeadDimQk == 128 && options.mHeadDimV == 128 + && options.mDtypeKv == tg::Dtype::E4m3 && options.mNumHeadsQ == 16 && options.mNumHeadsQPerKv == 8; + + return options.mFmhaKernelType == FmhaKernelType::SwapsMmaAbForGeneration && !options.mIsMlaGen + && options.mDtypeKv != tg::Dtype::E2m1 && options.mHeadDimQk != 64 && !isLlama70bFp4Tp4; + } + std::pair<bool, std::string> checkIfKernelExist(RunnerParams const& params) const { // Some conditions to check if the kernel is supported. @@ -207,10 +217,6 @@ class TllmGenFmhaKernel try { - - // The selectKernelParams that might be updated. - SelectKernelParams selectKernelParams{params}; - int32_t ctaDim = 512; FmhaOptions options; FmhaOptionsFromArgs optionsFromArgs; @@ -227,22 +233,35 @@ class TllmGenFmhaKernel // dimension. computeNumCtas(options, params.mMultiProcessorCount); - // Check if a precompiled cubin exists for this configuration (same lookup as run()). - // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main. - algoFilterForCubinPath(options); - auto [hashId, info] = hashFromFmhaOptions(options); - - if (mFunctions.find(hashId) == mFunctions.end()) + if (shouldUseNvrtc(options)) { - TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info); - return std::make_pair(false, info); + // For the NVRTC path, we return supported as long as autotuner successfully selected a kernel config. + std::ostringstream sstream; + populateJsonConfig(options, sstream); + std::string info = sstream.str(); + + return std::make_pair(true, info); } - TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str()); + else + { + // Check if a precompiled cubin exists for this configuration (same lookup as run()). + // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main. 
+ algoFilterForCubinPath(options); + auto [hashId, info] = hashFromFmhaOptions(options); + + if (mFunctions.find(hashId) == mFunctions.end()) + { + TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info); + return std::make_pair(false, info); + } + TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str()); - return std::make_pair(true, info); + return std::make_pair(true, info); + } } catch (std::exception const& e) { + // Omitting e.what(), it may contain "Runtime Error" and make scripts believe a fatal error happened. std::string const errorInfo = std::string("Exception during TrtllmGen kernel existence check"); TLLM_LOG_WARNING(errorInfo); return std::make_pair(false, errorInfo); @@ -295,13 +314,7 @@ class TllmGenFmhaKernel FmhaData fmhaData; setFmhaData(params, options, fmhaData); - bool isLlama70bFp4Tp4 = options.mHeadDimQk == 128 && options.mHeadDimV == 128 - && options.mDtypeKv == tg::Dtype::E4m3 && options.mNumHeadsQ == 16 && options.mNumHeadsQPerKv == 8; - - bool shouldUseNvrtc = options.mFmhaKernelType == FmhaKernelType::SwapsMmaAbForGeneration && !options.mIsMlaGen - && options.mDtypeKv != tg::Dtype::E2m1 && options.mHeadDimQk != 64 && !isLlama70bFp4Tp4; - - if (shouldUseNvrtc) + if (shouldUseNvrtc(options)) { // nvrtc path - uses mFmhaInterface member for kernel caching FmhaConfig fmhaConfig;