From a9720be05fffeb819f5236a206c1b720d53d6adf Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 13 Apr 2026 22:59:46 -0700 Subject: [PATCH 1/2] Guard cross-method constant sharing behind share_kv_cache_across_methods compile spec The cross-method constant sharing code in CudaBackend::init() was running unconditionally for all multi-method models, which corrupts weights for models like Parakeet where different methods have different sub-models (encoder, decoder, joint) that should NOT share constants. This change: - Adds a new `share_kv_cache_across_methods` compile spec that must be explicitly set to enable cross-method constant sharing - Guards the sharing logic behind this compile spec (previously ran for all models with the required AOTI APIs) - Makes sharing failures return Error::Internal instead of just logging - Adds generate_share_kv_cache_compile_spec() to AotiBackend Python API - Updates Qwen3.5 MoE export to opt-in to sharing for prefill/decode Without this spec set, each method gets its own independent constants, fixing the Parakeet CUDA CI regression. --- backends/aoti/aoti_backend.py | 11 +++++++++++ backends/cuda/runtime/cuda_backend.cpp | 26 +++++++++++++++++++++++--- examples/models/qwen3_5_moe/export.py | 10 ++++++++-- 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index f9b4b947506..0eb775e3459 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -25,6 +25,7 @@ class COMPILE_SPEC_KEYS(Enum): METHOD_NAME = "method_name" + SHARE_KV_CACHE_ACROSS_METHODS = "share_kv_cache_across_methods" @experimental( @@ -286,3 +287,13 @@ def method_name_from_compile_specs( raise RuntimeError( f"Could not find method name in compile specs: {compile_specs}" ) + + @classmethod + def generate_share_kv_cache_compile_spec(cls) -> CompileSpec: + """ + Generate a CompileSpec to enable cross-method KV cache sharing. + """ + return CompileSpec( + COMPILE_SPEC_KEYS.SHARE_KV_CACHE_ACROSS_METHODS.value, + bytes([1]), + ) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 8a5ad285599..af50d062eac 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -80,6 +80,7 @@ namespace { constexpr char kSkipCopyOutputToCpuForMethod[] = "skip_copy_output_to_cpu_for_method"; constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream"; +constexpr char kShareKvCacheAcrossMethods[] = "share_kv_cache_across_methods"; } // anonymous namespace class ET_EXPERIMENTAL CudaBackend final @@ -287,12 +288,17 @@ class ET_EXPERIMENTAL CudaBackend final ArrayRef compile_specs // This will be my empty list ) const override { std::string method_name; + bool share_kv_cache = false; for (const CompileSpec& spec : compile_specs) { if (std::strcmp(spec.key, "method_name") == 0) { method_name.assign( static_cast(spec.value.buffer), spec.value.nbytes); // no nullptr guarantee, so pass size - break; + } else if (std::strcmp(spec.key, kShareKvCacheAcrossMethods) == 0) { + if (spec.value.nbytes >= 1) { + share_kv_cache = + static_cast(spec.value.buffer)[0] != 0; + } } } @@ -416,13 +422,15 @@ class ET_EXPERIMENTAL CudaBackend final // --------------------------------------------------------------- // Cross-method constant sharing (e.g., KV cache between prefill/decode). // + // Only enabled when share_kv_cache_across_methods compile spec is set. // The first container to initialize extracts its constants (keyed by // original FQN) and stores the AtenTensorHandle's. Subsequent containers // with matching FQNs are updated to point to the same GPU tensors via // UpdateUserManagedConstantBufferPairs (user_managed = true → no copy, // the source container retains ownership). // --------------------------------------------------------------- - if (handle->get_num_constants && handle->get_constant_name && + if (share_kv_cache && + handle->get_num_constants && handle->get_constant_name && handle->get_constant_original_fqn && handle->extract_constants_map && handle->update_user_managed_constant_buffer_pairs) { size_t num_constants = 0; @@ -469,6 +477,8 @@ class ET_EXPERIMENTAL CudaBackend final Error, "Failed to extract constants from '%s'", method_name.c_str()); + delete handle; + return Error::Internal; } } else { // Subsequent container: share matching constants from the first. @@ -501,14 +511,24 @@ class ET_EXPERIMENTAL CudaBackend final Error, "Failed to share constants into '%s'", method_name.c_str()); + delete handle; + return Error::Internal; } } } } + } else if (share_kv_cache) { + ET_LOG( + Error, + "share_kv_cache_across_methods requested but constant sharing APIs " + "not available for method '%s'", + method_name.c_str()); + delete handle; + return Error::Internal; } else { ET_LOG( Info, - "Constant sharing APIs not available for method '%s'", + "Constant sharing not requested for method '%s'", method_name.c_str()); } diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py index 19a720a2e79..ee74f2f6880 100644 --- a/examples/models/qwen3_5_moe/export.py +++ b/examples/models/qwen3_5_moe/export.py @@ -458,12 +458,18 @@ def export_and_lower(model, config, args): partitioner={ "decode": [ CudaPartitioner( - [CudaBackend.generate_method_name_compile_spec("decode")] + [ + CudaBackend.generate_method_name_compile_spec("decode"), + CudaBackend.generate_share_kv_cache_compile_spec(), + ] ) ], "prefill": [ CudaPartitioner( - [CudaBackend.generate_method_name_compile_spec("prefill")] + [ + CudaBackend.generate_method_name_compile_spec("prefill"), + CudaBackend.generate_share_kv_cache_compile_spec(), + ] ) ], }, From 083120f23f1fe477f4e1d2e5cef7d202bfb38035 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 13 Apr 2026 23:01:09 -0700 Subject: [PATCH 2/2] lint --- backends/cuda/runtime/cuda_backend.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index af50d062eac..eb0a07b8d8f 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -429,9 +429,9 @@ class ET_EXPERIMENTAL CudaBackend final // UpdateUserManagedConstantBufferPairs (user_managed = true → no copy, // the source container retains ownership). // --------------------------------------------------------------- - if (share_kv_cache && - handle->get_num_constants && handle->get_constant_name && - handle->get_constant_original_fqn && handle->extract_constants_map && + if (share_kv_cache && handle->get_num_constants && + handle->get_constant_name && handle->get_constant_original_fqn && + handle->extract_constants_map && handle->update_user_managed_constant_buffer_pairs) { size_t num_constants = 0; handle->get_num_constants(handle->container_handle, &num_constants);