add NVRTC to checkIfKernelExist and extract shouldUseNvrtc utility function

pengbowang-nv · pengbowang-nv · commit 1d060ec24c12 · 2026-04-16T09:20:57.000Z
Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h
@@ -190,6 +190,16 @@ class TllmGenFmhaKernel
         }
     }
 
+    static bool shouldUseNvrtc(FmhaOptions const& options)
+    {
+        // Returns true when the given FMHA configuration should take the NVRTC path instead of a precompiled cubin.
+        bool isLlama70bFp4Tp4 = options.mHeadDimQk == 128 && options.mHeadDimV == 128
+            && options.mDtypeKv == tg::Dtype::E4m3 && options.mNumHeadsQ == 16 && options.mNumHeadsQPerKv == 8;
+
+        return options.mFmhaKernelType == FmhaKernelType::SwapsMmaAbForGeneration && !options.mIsMlaGen
+            && options.mDtypeKv != tg::Dtype::E2m1 && options.mHeadDimQk != 64 && !isLlama70bFp4Tp4;
+    }
+
     std::pair<bool, std::string> checkIfKernelExist(RunnerParams const& params) const
     {
         // Some conditions to check if the kernel is supported.
@@ -207,10 +217,6 @@ class TllmGenFmhaKernel
 
         try
         {
-
-            // The selectKernelParams that might be updated.
-            SelectKernelParams selectKernelParams{params};
-
             int32_t ctaDim = 512;
             FmhaOptions options;
             FmhaOptionsFromArgs optionsFromArgs;
@@ -227,22 +233,35 @@ class TllmGenFmhaKernel
             // dimension.
             computeNumCtas(options, params.mMultiProcessorCount);
 
-            // Check if a precompiled cubin exists for this configuration (same lookup as run()).
-            // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main.
-            algoFilterForCubinPath(options);
-            auto [hashId, info] = hashFromFmhaOptions(options);
-
-            if (mFunctions.find(hashId) == mFunctions.end())
+            if (shouldUseNvrtc(options))
             {
-                TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info);
-                return std::make_pair(false, info);
+                // NVRTC path: report the kernel as supported as long as the autotuner selected a kernel config.
+                std::ostringstream sstream;
+                populateJsonConfig(options, sstream);
+                std::string info = sstream.str();
+
+                return std::make_pair(true, info);
             }
-            TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str());
+            else
+            {
+                // Check if a precompiled cubin exists for this configuration (same lookup as run()).
+                // If not, return (false, info) so the dispatcher can fall back to unfused MHA like on main.
+                algoFilterForCubinPath(options);
+                auto [hashId, info] = hashFromFmhaOptions(options);
+
+                if (mFunctions.find(hashId) == mFunctions.end())
+                {
+                    TLLM_LOG_WARNING("Trtllm-gen kernels not found: " + info);
+                    return std::make_pair(false, info);
+                }
+                TLLM_LOG_DEBUG("TRTLLM-Gen kernel traits: %s", info.c_str());
 
-            return std::make_pair(true, info);
+                return std::make_pair(true, info);
+            }
         }
         catch (std::exception const& e)
         {
+            // Omit e.what(): it may contain "Runtime Error" and make scripts believe a fatal error happened.
             std::string const errorInfo = std::string("Exception during TrtllmGen kernel existence check");
             TLLM_LOG_WARNING(errorInfo);
             return std::make_pair(false, errorInfo);
@@ -295,13 +314,7 @@ class TllmGenFmhaKernel
 
         FmhaData fmhaData;
         setFmhaData(params, options, fmhaData);
-        bool isLlama70bFp4Tp4 = options.mHeadDimQk == 128 && options.mHeadDimV == 128
-            && options.mDtypeKv == tg::Dtype::E4m3 && options.mNumHeadsQ == 16 && options.mNumHeadsQPerKv == 8;
-
-        bool shouldUseNvrtc = options.mFmhaKernelType == FmhaKernelType::SwapsMmaAbForGeneration && !options.mIsMlaGen
-            && options.mDtypeKv != tg::Dtype::E2m1 && options.mHeadDimQk != 64 && !isLlama70bFp4Tp4;
-
-        if (shouldUseNvrtc)
+        if (shouldUseNvrtc(options))
         {
             // nvrtc path - uses mFmhaInterface member for kernel caching
             FmhaConfig fmhaConfig;