Skip to content

Commit c4093ba

Browse files
committed
[None][fix] Fix review findings: docstring, encoder token guard, share-last-block logic
- Update addSequenceBatch docstring to reflect support for both block-reuse and non-reuse paths via buildClaimResultMetadata. - Guard encoder unique token access in claimMatchingBlocks and onboardAndAllocateBlocks with hasUniqueTokens check, matching buildClaimResultMetadata and WindowBlockManager::addSequence (PR #10437) for cross-KV requests without encoder tokens (e.g., Whisper). - Align shareLastContextBlockAmongBeams in claimMatchingBlocks with the unified formula from loadOrAllocateBlocks (PR #10437): isShareLastContextBlock = kCROSS || inputLength % tokensPerBlock == 0. Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
1 parent 847c922 commit c4093ba

3 files changed

Lines changed: 60 additions & 35 deletions

File tree

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1888,11 +1888,13 @@ class BaseKVCacheManager
18881888
OptionalRef<LlmRequest> llmRequest = std::nullopt)
18891889
= 0;
18901890

1891-
//! \brief Batch add sequences with two-phase claim-then-onboard to prevent host offloading eviction.
1892-
//! \details For each attention window, Phase 1 claims all matching blocks across all requests
1893-
//! (protecting them from eviction), then Phase 2 onboards host blocks and allocates
1894-
//! non-matching blocks. Supports variable sliding window attention (VSWA) by iterating
1895-
//! over all window sizes. Requires block reuse to be enabled.
1891+
//! \brief Batch add sequences with two-phase claim-then-onboard strategy.
1892+
//! \details For each attention window, when block reuse is enabled, Phase 1 claims all matching
1893+
//! blocks across all requests (protecting them from eviction via PartialClaimTracker),
1894+
//! then Phase 2 onboards host blocks and allocates non-matching blocks. When block reuse
1895+
//! is disabled, buildClaimResultMetadata() prepares ClaimResult metadata without radix
1896+
//! tree traversal, and Phase 2 performs fresh allocation only. Supports variable sliding
1897+
//! window attention (VSWA) by iterating over all window sizes.
18961898
virtual void addSequenceBatch(
18971899
std::vector<std::tuple<LlmRequest::RequestIdType, SizeType32, SizeType32>> const& requestInfos,
18981900
std::vector<std::reference_wrapper<LlmRequest>> const& llmRequests)

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 51 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1409,19 +1409,26 @@ WindowBlockManager::ClaimResult WindowBlockManager::claimMatchingBlocks(Generati
14091409
auto const [seqIt, emplaceDone] = mAllocatedBlocksPerSeq.emplace(requestId, std::vector<BlockPtr>{});
14101410
TLLM_CHECK(emplaceDone);
14111411

1412-
// Prepare block keys — same logic as WindowBlockManager::addSequence lines 1437-1465
1412+
// Prepare block keys — guard for cross-KV without encoder tokens (e.g., Whisper).
14131413
auto constexpr beamIdx = 0;
1414-
auto const& uniqueTokens = (mCacheType == CacheType::kSELF || mCacheType == CacheType::kSELFKONLY)
1415-
? llmRequest.getUniqueTokens(beamIdx)
1416-
: *(llmRequest.getEncoderUniqueTokens().value());
1414+
bool const isSelfCache = mCacheType == CacheType::kSELF || mCacheType == CacheType::kSELFKONLY;
1415+
bool const hasUniqueTokens = isSelfCache
1416+
|| (llmRequest.getEncoderUniqueTokens().has_value() && llmRequest.getEncoderUniqueTokens().value());
14171417

1418-
auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, inputLength - 1, mTokensPerBlock, true);
1419-
if (inputLength % mTokensPerBlock == 1)
1418+
if (hasUniqueTokens)
14201419
{
1421-
blockedUniqueTokens.emplace_back();
1422-
}
1420+
auto const& uniqueTokens
1421+
= isSelfCache ? llmRequest.getUniqueTokens(beamIdx) : *(llmRequest.getEncoderUniqueTokens().value());
14231422

1424-
result.blockKeys = buildBlockKeys(blockedUniqueTokens, llmRequest);
1423+
auto blockedUniqueTokens
1424+
= chopVectorIntoBlocks<UniqueToken>(uniqueTokens, inputLength - 1, mTokensPerBlock, true);
1425+
if (inputLength % mTokensPerBlock == 1)
1426+
{
1427+
blockedUniqueTokens.emplace_back();
1428+
}
1429+
1430+
result.blockKeys = buildBlockKeys(blockedUniqueTokens, llmRequest);
1431+
}
14251432

14261433
auto config = llmRequest.getKvCacheRetentionConfig();
14271434
result.perBlockRetentions = config.value_or(executor::KvCacheRetentionConfig())
@@ -1441,14 +1448,12 @@ WindowBlockManager::ClaimResult WindowBlockManager::claimMatchingBlocks(Generati
14411448
// Phase 1: Walk radix tree, claim matching blocks — no onboard, no getFreeBlock
14421449
// NOTE: Caller must hold mCachedBlocksRootMutex.
14431450

1444-
// Compute shareLastContextBlockAmongBeams — same logic as WindowBlockManager::addSequence
1445-
result.shareLastContextBlockAmongBeams = sequence.getBeamWidth() == 1;
1446-
if (isRecurrentState())
1447-
{
1448-
result.shareLastContextBlockAmongBeams |= inputLength % mTokensPerBlock == 0;
1449-
}
1450-
1451-
result.numSharedContextBlocks = result.shareLastContextBlockAmongBeams ? numContextBlocks : numContextBlocks - 1;
1451+
// Compute shareLastContextBlockAmongBeams — aligned with loadOrAllocateBlocks (PR #10437).
1452+
auto const beamWidth = sequence.getBeamWidth();
1453+
bool const isShareLastContextBlock = mCacheType == CacheType::kCROSS || inputLength % mTokensPerBlock == 0;
1454+
result.numSharedContextBlocks
1455+
= (beamWidth > 1 && !isShareLastContextBlock) ? numContextBlocks - 1 : numContextBlocks;
1456+
result.shareLastContextBlockAmongBeams = result.numSharedContextBlocks == numContextBlocks;
14521457
auto searchRoot = mCachedBlocksRoot;
14531458
auto blockItr = result.blockKeys.begin();
14541459

@@ -1507,19 +1512,28 @@ WindowBlockManager::ClaimResult WindowBlockManager::claimMatchingBlocks(Generati
15071512
auto tIt = tracker.map.find(blockId);
15081513
if (tIt != tracker.map.end())
15091514
{
1510-
// Previous copier no longer responsible for release.
1511-
claimResults[tIt->second.requestIdx]
1512-
.claimedBlocks[tIt->second.claimedIdx]
1513-
.shouldReleaseCopySource
1514-
= false;
1515+
if (tIt->second.fullyMatched)
1516+
{
1517+
// A full match holds this block — do not release.
1518+
claimed.shouldReleaseCopySource = false;
1519+
}
1520+
else
1521+
{
1522+
// Previous copier no longer responsible for release.
1523+
claimResults[tIt->second.requestIdx]
1524+
.claimedBlocks[tIt->second.claimedIdx]
1525+
.shouldReleaseCopySource
1526+
= false;
1527+
claimed.shouldReleaseCopySource = true;
1528+
}
15151529
tIt->second.requestIdx = requestIdx;
15161530
tIt->second.claimedIdx = result.claimedBlocks.size();
15171531
}
15181532
else
15191533
{
15201534
tracker.map[blockId] = {requestIdx, result.claimedBlocks.size(), /*fullyMatched=*/false};
1535+
claimed.shouldReleaseCopySource = true;
15211536
}
1522-
claimed.shouldReleaseCopySource = true;
15231537
}
15241538
}
15251539
else
@@ -1760,11 +1774,20 @@ SizeType32 WindowBlockManager::onboardAndAllocateBlocks(
17601774

17611775
// Update stats and return prepopulated length
17621776
mReusedTokens += static_cast<double>(numMatchedTokens);
1763-
auto constexpr beamIdx = 0;
1764-
auto const& uniqueTokens = (mCacheType == CacheType::kSELF || mCacheType == CacheType::kSELFKONLY)
1765-
? llmRequest.getUniqueTokens(beamIdx)
1766-
: *(llmRequest.getEncoderUniqueTokens().value());
1767-
mTotalInputTokens += static_cast<double>(uniqueTokens.size());
1777+
bool const isSelfCache = mCacheType == CacheType::kSELF || mCacheType == CacheType::kSELFKONLY;
1778+
bool const hasUniqueTokens = isSelfCache
1779+
|| (llmRequest.getEncoderUniqueTokens().has_value() && llmRequest.getEncoderUniqueTokens().value());
1780+
if (hasUniqueTokens)
1781+
{
1782+
auto constexpr beamIdx = 0;
1783+
auto const& uniqueTokens
1784+
= isSelfCache ? llmRequest.getUniqueTokens(beamIdx) : *(llmRequest.getEncoderUniqueTokens().value());
1785+
mTotalInputTokens += static_cast<double>(uniqueTokens.size());
1786+
}
1787+
else
1788+
{
1789+
mTotalInputTokens += static_cast<double>(claimResult.numContextBlocks * mTokensPerBlock);
1790+
}
17681791

17691792
SizeType32 numConnectorMatchedTokens = 0;
17701793
if (mKvCacheConnectorManager && !llmRequest.isDummyRequest())

cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -68,7 +68,7 @@ std::optional<tensorrt_llm::runtime::ITensor::UniquePtr> from_torch(std::optiona
6868
class PyKvCacheManager : public tbk::BaseKVCacheManager
6969
{
7070
public:
71-
NB_TRAMPOLINE(tbk::BaseKVCacheManager, 37);
71+
NB_TRAMPOLINE(tbk::BaseKVCacheManager, 39);
7272

7373
// using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors
7474
void allocatePools(bool useUvm = false) override

Comments (0)