From a9720be05fffeb819f5236a206c1b720d53d6adf Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Mon, 13 Apr 2026 22:59:46 -0700
Subject: [PATCH 1/2] Guard cross-method constant sharing behind
 share_kv_cache_across_methods compile spec

The cross-method constant sharing code in CudaBackend::init() was running
unconditionally for all multi-method models, which corrupted weights for
models like Parakeet, where different methods have different sub-models
(encoder, decoder, joint) that should NOT share constants.

This change:
- Adds a new `share_kv_cache_across_methods` compile spec that must be
  explicitly set to enable cross-method constant sharing
- Guards the sharing logic behind this compile spec (previously ran for
  all models with the required AOTI APIs)
- Makes sharing failures return Error::Internal instead of just logging
- Adds generate_share_kv_cache_compile_spec() to the AotiBackend Python API
- Updates the Qwen3.5 MoE export to opt in to sharing for prefill/decode

Without this spec set, each method gets its own independent constants,
fixing the Parakeet CUDA CI regression.
---
 backends/aoti/aoti_backend.py          | 11 +++++++++++
 backends/cuda/runtime/cuda_backend.cpp | 26 +++++++++++++++++++++++---
 examples/models/qwen3_5_moe/export.py  | 10 ++++++++--
 3 files changed, 42 insertions(+), 5 deletions(-)
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index f9b4b947506..0eb775e3459 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -25,6 +25,7 @@
 
 class COMPILE_SPEC_KEYS(Enum):
     METHOD_NAME = "method_name"
+    SHARE_KV_CACHE_ACROSS_METHODS = "share_kv_cache_across_methods"
 
 
 @experimental(
@@ -286,3 +287,13 @@ def method_name_from_compile_specs(
         raise RuntimeError(
             f"Could not find method name in compile specs: {compile_specs}"
         )
+
+    @classmethod
+    def generate_share_kv_cache_compile_spec(cls) -> CompileSpec:
+        """
+        Generate a CompileSpec to enable cross-method KV cache sharing.
+        """
+        return CompileSpec(
+            COMPILE_SPEC_KEYS.SHARE_KV_CACHE_ACROSS_METHODS.value,
+            bytes([1]),
+        )
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 8a5ad285599..af50d062eac 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -80,6 +80,7 @@ namespace {
 constexpr char kSkipCopyOutputToCpuForMethod[] =
     "skip_copy_output_to_cpu_for_method";
 constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream";
+constexpr char kShareKvCacheAcrossMethods[] = "share_kv_cache_across_methods";
 } // anonymous namespace
 
 class ET_EXPERIMENTAL CudaBackend final
@@ -287,12 +288,17 @@ class ET_EXPERIMENTAL CudaBackend final
       ArrayRef<CompileSpec> compile_specs // This will be my empty list
   ) const override {
     std::string method_name;
+    bool share_kv_cache = false;
     for (const CompileSpec& spec : compile_specs) {
       if (std::strcmp(spec.key, "method_name") == 0) {
         method_name.assign(
             static_cast<const char*>(spec.value.buffer),
             spec.value.nbytes); // no nullptr guarantee, so pass size
-        break;
+      } else if (std::strcmp(spec.key, kShareKvCacheAcrossMethods) == 0) {
+        if (spec.value.nbytes >= 1) {
+          share_kv_cache =
+              static_cast<const uint8_t*>(spec.value.buffer)[0] != 0;
+        }
       }
     }
 
@@ -416,13 +422,15 @@ class ET_EXPERIMENTAL CudaBackend final
     // ---------------------------------------------------------------
     // Cross-method constant sharing (e.g., KV cache between prefill/decode).
     //
+    // Only enabled when share_kv_cache_across_methods compile spec is set.
     // The first container to initialize extracts its constants (keyed by
     // original FQN) and stores the AtenTensorHandle's. Subsequent containers
     // with matching FQNs are updated to point to the same GPU tensors via
     // UpdateUserManagedConstantBufferPairs (user_managed = true → no copy,
     // the source container retains ownership).
     // ---------------------------------------------------------------
-    if (handle->get_num_constants && handle->get_constant_name &&
+    if (share_kv_cache &&
+        handle->get_num_constants && handle->get_constant_name &&
         handle->get_constant_original_fqn && handle->extract_constants_map &&
         handle->update_user_managed_constant_buffer_pairs) {
       size_t num_constants = 0;
@@ -469,6 +477,8 @@ class ET_EXPERIMENTAL CudaBackend final
                 Error,
                 "Failed to extract constants from '%s'",
                 method_name.c_str());
+            delete handle;
+            return Error::Internal;
           }
         } else {
           // Subsequent container: share matching constants from the first.
@@ -501,14 +511,24 @@ class ET_EXPERIMENTAL CudaBackend final
                   Error,
                   "Failed to share constants into '%s'",
                   method_name.c_str());
+              delete handle;
+              return Error::Internal;
             }
           }
         }
       }
+    } else if (share_kv_cache) {
+      ET_LOG(
+          Error,
+          "share_kv_cache_across_methods requested but constant sharing APIs "
+          "not available for method '%s'",
+          method_name.c_str());
+      delete handle;
+      return Error::Internal;
     } else {
       ET_LOG(
           Info,
-          "Constant sharing APIs not available for method '%s'",
+          "Constant sharing not requested for method '%s'",
           method_name.c_str());
     }
 
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
index 19a720a2e79..ee74f2f6880 100644
--- a/examples/models/qwen3_5_moe/export.py
+++ b/examples/models/qwen3_5_moe/export.py
@@ -458,12 +458,18 @@ def export_and_lower(model, config, args):
         partitioner={
             "decode": [
                 CudaPartitioner(
-                    [CudaBackend.generate_method_name_compile_spec("decode")]
+                    [
+                        CudaBackend.generate_method_name_compile_spec("decode"),
+                        CudaBackend.generate_share_kv_cache_compile_spec(),
+                    ]
                 )
             ],
             "prefill": [
                 CudaPartitioner(
-                    [CudaBackend.generate_method_name_compile_spec("prefill")]
+                    [
+                        CudaBackend.generate_method_name_compile_spec("prefill"),
+                        CudaBackend.generate_share_kv_cache_compile_spec(),
+                    ]
                 )
             ],
         },

From 083120f23f1fe477f4e1d2e5cef7d202bfb38035 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Mon, 13 Apr 2026 23:01:09 -0700
Subject: [PATCH 2/2] lint: reflow constant-sharing condition in cuda_backend.cpp

---
 backends/cuda/runtime/cuda_backend.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index af50d062eac..eb0a07b8d8f 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -429,9 +429,9 @@ class ET_EXPERIMENTAL CudaBackend final
     // UpdateUserManagedConstantBufferPairs (user_managed = true → no copy,
     // the source container retains ownership).
     // ---------------------------------------------------------------
-    if (share_kv_cache &&
-        handle->get_num_constants && handle->get_constant_name &&
-        handle->get_constant_original_fqn && handle->extract_constants_map &&
+    if (share_kv_cache && handle->get_num_constants &&
+        handle->get_constant_name && handle->get_constant_original_fqn &&
+        handle->extract_constants_map &&
         handle->update_user_managed_constant_buffer_pairs) {
       size_t num_constants = 0;
       handle->get_num_constants(handle->container_handle, &num_constants);