From 53a200559640171b20390ad314568f0a67137ce2 Mon Sep 17 00:00:00 2001 From: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com> Date: Tue, 14 Apr 2026 00:53:11 -0700 Subject: [PATCH 1/2] add try catch to avoid missing kernel interfere with normal process Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com> --- .../trtllmGenKernels/fmha/fmhaKernels.h | 64 +++++++++++-------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index 93e6a2c18e21..be5aca44dcb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -205,38 +205,48 @@ class TllmGenFmhaKernel return std::make_pair(false, "Empty batch or zero sequence length"); } - // The selectKernelParams that might be updated. - SelectKernelParams selectKernelParams{params}; - - int32_t ctaDim = 512; - FmhaOptions options; - FmhaOptionsFromArgs optionsFromArgs; - parseOptionsFromRunnerParams(params, options); - options.mCudaArch = intToCudaArch(mSM); + try + { - FmhaAutoTuner autoTuner(options, optionsFromArgs, params.mMultiProcessorCount); - std::tie(options, optionsFromArgs, ctaDim) = autoTuner.selectKernel(); - // Check if the options are valid or not. - checkFmhaOptions(options, optionsFromArgs); - // Update the options if needed. - updateFmhaOptions(options, optionsFromArgs); - // The number of CtasQ and CtasKv per sequence, Ctas in the Y dimension, and Ctas in the Z - // dimension. - computeNumCtas(options, params.mMultiProcessorCount); + // The selectKernelParams that might be updated. 
+ SelectKernelParams selectKernelParams{params}; + + int32_t ctaDim = 512; + FmhaOptions options; + FmhaOptionsFromArgs optionsFromArgs; + parseOptionsFromRunnerParams(params, options); + options.mCudaArch = intToCudaArch(mSM); + + FmhaAutoTuner autoTuner(options, optionsFromArgs, params.mMultiProcessorCount); + std::tie(options, optionsFromArgs, ctaDim) = autoTuner.selectKernel(); + // Check if the options are valid or not. + checkFmhaOptions(options, optionsFromArgs); + // Update the options if needed. + updateFmhaOptions(options, optionsFromArgs); + // The number of CtasQ and CtasKv per sequence, Ctas in the Y dimension, and Ctas in the Z + // dimension. + computeNumCtas(options, params.mMultiProcessorCount); + + // Check if a precompiled cubin exists for this configuration (same lookup as run()). + // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main. + algoFilterForCubinPath(options); + auto [hashId, info] = hashFromFmhaOptions(options); - // Check if a precompiled cubin exists for this configuration (same lookup as run()). - // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main. 
- algoFilterForCubinPath(options); - auto [hashId, info] = hashFromFmhaOptions(options); + if (mFunctions.find(hashId) == mFunctions.end()) + { + TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info); + return std::make_pair(false, info); + } + TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str()); - if (mFunctions.find(hashId) == mFunctions.end()) + return std::make_pair(true, info); + } + catch (std::exception const& e) { - TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info); - return std::make_pair(false, info); + std::string const errorInfo = std::string("Exception during TrtllmGen kernel existence check"); + TLLM_LOG_WARNING(errorInfo); + return std::make_pair(false, errorInfo); } - TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str()); - - return std::make_pair(true, info); } void algoFilterForCubinPath(FmhaOptions& options) const From 0f391d027ebb7ca1464091a03ba43aeb8137f4d1 Mon Sep 17 00:00:00 2001 From: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com> Date: Thu, 16 Apr 2026 09:20:57 +0000 Subject: [PATCH 2/2] add NVRTC to checkIfKernelExist and extract shouldUseNvrtc utility function Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com> --- .../trtllmGenKernels/fmha/fmhaKernels.h | 55 ++++++++++++------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index be5aca44dcb0..6912cc11b850 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -190,6 +190,16 @@ class TllmGenFmhaKernel } } + static bool shouldUseNvrtc(FmhaOptions const& options) + { + // Check if the NVRTC path should be used for a given FMHA configuration. 
+ bool isLlama70bFp4Tp4 = options.mHeadDimQk == 128 && options.mHeadDimV == 128 + && options.mDtypeKv == tg::Dtype::E4m3 && options.mNumHeadsQ == 16 && options.mNumHeadsQPerKv == 8; + + return options.mFmhaKernelType == FmhaKernelType::SwapsMmaAbForGeneration && !options.mIsMlaGen + && options.mDtypeKv != tg::Dtype::E2m1 && options.mHeadDimQk != 64 && !isLlama70bFp4Tp4; + } + std::pair<bool, std::string> checkIfKernelExist(RunnerParams const& params) const { // Some conditions to check if the kernel is supported. @@ -207,10 +217,6 @@ class TllmGenFmhaKernel try { - - // The selectKernelParams that might be updated. - SelectKernelParams selectKernelParams{params}; - int32_t ctaDim = 512; FmhaOptions options; FmhaOptionsFromArgs optionsFromArgs; @@ -227,22 +233,35 @@ class TllmGenFmhaKernel // dimension. computeNumCtas(options, params.mMultiProcessorCount); - // Check if a precompiled cubin exists for this configuration (same lookup as run()). - // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main. - algoFilterForCubinPath(options); - auto [hashId, info] = hashFromFmhaOptions(options); - - if (mFunctions.find(hashId) == mFunctions.end()) + if (shouldUseNvrtc(options)) { - TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info); - return std::make_pair(false, info); + // For the NVRTC path, we return supported as long as autotuner successfully selected a kernel config. + std::ostringstream sstream; + populateJsonConfig(options, sstream); + std::string info = sstream.str(); + + return std::make_pair(true, info); } - TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str()); + else + { + // Check if a precompiled cubin exists for this configuration (same lookup as run()). + // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main. 
+ algoFilterForCubinPath(options); + auto [hashId, info] = hashFromFmhaOptions(options); + + if (mFunctions.find(hashId) == mFunctions.end()) + { + TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info); + return std::make_pair(false, info); + } + TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str()); - return std::make_pair(true, info); + return std::make_pair(true, info); + } } catch (std::exception const& e) { + // Omitting e.what(), it may contain "Runtime Error" and make scripts believe a fatal error happened. std::string const errorInfo = std::string("Exception during TrtllmGen kernel existence check"); TLLM_LOG_WARNING(errorInfo); return std::make_pair(false, errorInfo); @@ -295,13 +314,7 @@ class TllmGenFmhaKernel FmhaData fmhaData; setFmhaData(params, options, fmhaData); - bool isLlama70bFp4Tp4 = options.mHeadDimQk == 128 && options.mHeadDimV == 128 - && options.mDtypeKv == tg::Dtype::E4m3 && options.mNumHeadsQ == 16 && options.mNumHeadsQPerKv == 8; - - bool shouldUseNvrtc = options.mFmhaKernelType == FmhaKernelType::SwapsMmaAbForGeneration && !options.mIsMlaGen - && options.mDtypeKv != tg::Dtype::E2m1 && options.mHeadDimQk != 64 && !isLlama70bFp4Tp4; - - if (shouldUseNvrtc) + if (shouldUseNvrtc(options)) { // nvrtc path - uses mFmhaInterface member for kernel caching FmhaConfig fmhaConfig;