triton-inference-server · whoisj · Mar 19, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 18, 2026
diff --git a/src/infer_request.cc b/src/infer_request.cc
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -418,7 +418,7 @@ InferRequest::Exec(const bool is_decoupled)
   py::gil_scoped_release release;
 
   // BLS should not be used in "initialize" or "finalize" function.
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   if (!stub->IsInitialized() || stub->IsFinalizing()) {
     throw PythonBackendException(
         "BLS is only supported during the 'execute' function.");

diff --git a/src/metric.cc b/src/metric.cc
@@ -1,4 +1,4 @@
-// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -147,7 +147,7 @@ void
 Metric::SendCreateMetricRequest()
 {
   // Send the request to create the Metric to the parent process
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   SaveToSharedMemory(stub->ShmPool());
   CustomMetricsMessage* custom_metrics_msg = nullptr;
   AllocatedSharedMemory<CustomMetricsMessage> custom_metrics_shm;
@@ -170,7 +170,7 @@ Metric::SendIncrementRequest(const double& value)
   py::gil_scoped_release release;
   try {
     CheckIfCleared();
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     operation_value_ = value;
     SaveToSharedMemory(stub->ShmPool());
     AllocatedSharedMemory<CustomMetricsMessage> custom_metrics_shm;
@@ -189,7 +189,7 @@ Metric::SendSetValueRequest(const double& value)
 {
   try {
     CheckIfCleared();
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     operation_value_ = value;
     SaveToSharedMemory(stub->ShmPool());
     AllocatedSharedMemory<CustomMetricsMessage> custom_metrics_shm;
@@ -208,7 +208,7 @@ Metric::SendObserveRequest(const double& value)
   py::gil_scoped_release release;
   try {
     CheckIfCleared();
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     operation_value_ = value;
     SaveToSharedMemory(stub->ShmPool());
     AllocatedSharedMemory<CustomMetricsMessage> custom_metrics_shm;
@@ -228,7 +228,7 @@ Metric::SendGetValueRequest()
   AllocatedSharedMemory<CustomMetricsMessage> custom_metrics_shm;
   try {
     CheckIfCleared();
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     SaveToSharedMemory(stub->ShmPool());
     stub->SendMessage<CustomMetricsMessage>(
         custom_metrics_shm, PYTHONSTUB_MetricRequestValue, shm_handle_);
@@ -251,7 +251,7 @@ Metric::Clear()
   // scope/being deleted.
   if (!is_cleared_) {
     is_cleared_ = true;
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     SaveToSharedMemory(stub->ShmPool());
     AllocatedSharedMemory<CustomMetricsMessage> custom_metrics_shm;
     try {

diff --git a/src/metric_family.cc b/src/metric_family.cc
@@ -1,4 +1,4 @@
-// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -55,7 +55,7 @@ MetricFamily::~MetricFamily()
   }
 
   // Send the request to delete the MetricFamily to the parent process
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   SaveToSharedMemory(stub->ShmPool());
   AllocatedSharedMemory<CustomMetricsMessage> custom_metrics_shm;
   try {
@@ -147,7 +147,7 @@ MetricFamily::CreateMetricFamily(
 void
 MetricFamily::SendCreateMetricFamilyRequest()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   SaveToSharedMemory(stub->ShmPool());
   CustomMetricsMessage* custom_metrics_msg = nullptr;
   AllocatedSharedMemory<CustomMetricsMessage> custom_metrics_shm;

diff --git a/src/model_loader.cc b/src/model_loader.cc
@@ -1,4 +1,4 @@
-// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -101,7 +101,7 @@ ModelLoader::ModelLoader(
 void
 ModelLoader::SendLoadModelRequest()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   SaveToSharedMemory(stub->ShmPool());
   AllocatedSharedMemory<ModelLoaderMessage> model_loader_msg_shm;
 
@@ -118,7 +118,7 @@ ModelLoader::SendLoadModelRequest()
 void
 ModelLoader::SendUnloadModelRequest()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   SaveToSharedMemory(stub->ShmPool());
   AllocatedSharedMemory<ModelLoaderMessage> model_loader_msg_shm;
   try {
@@ -134,7 +134,7 @@ ModelLoader::SendUnloadModelRequest()
 bool
 ModelLoader::SendModelReadinessRequest()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   SaveToSharedMemory(stub->ShmPool());
   ModelLoaderMessage* model_loader_msg = nullptr;
   AllocatedSharedMemory<ModelLoaderMessage> model_loader_msg_shm;

diff --git a/src/pb_bls_cancel.cc b/src/pb_bls_cancel.cc
@@ -1,4 +1,4 @@
-// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -67,7 +67,7 @@ PbBLSCancel::Cancel()
     return;
   }
   if (!updating_) {
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     if (!stub->StubToParentServiceActive()) {
       LOG_ERROR << "Cannot communicate with parent service";
       return;

diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc
@@ -1,4 +1,4 @@
-// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -68,7 +68,7 @@ PbCancel::IsCancelled()
     return is_cancelled_;
   }
   if (!updating_) {
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     if (!stub->StubToParentServiceActive()) {
       LOG_ERROR << "Cannot communicate with parent service";
       return false;

diff --git a/src/pb_response_iterator.cc b/src/pb_response_iterator.cc
@@ -1,4 +1,4 @@
-// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -133,7 +133,7 @@ ResponseIterator::Id()
 void
 ResponseIterator::Clear()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   stub->EnqueueCleanupId(id_, PYTHONSTUB_BLSDecoupledInferPayloadCleanup);
   {
     std::lock_guard<std::mutex> lock{mu_};

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
@@ -130,7 +130,7 @@ PyParametersToJSON(const py::dict& parameters)
 void
 AsyncEventFutureDoneCallback(const py::object& py_future)
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   stub->BackgroundFutureDone(py_future);
 }
 
@@ -514,7 +514,7 @@ Stub::AutoCompleteModelConfig(
   python_backend_utils.def(
       "get_model_dir",
       []() {
-        std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+        auto stub = Stub::GetOrCreateInstance();
         return stub->GetModelDir();
       },
       py::return_value_policy::reference);
@@ -568,7 +568,7 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle)
   python_backend_utils.def(
       "get_model_dir",
       []() {
-        std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+        auto stub = Stub::GetOrCreateInstance();
         return stub->GetModelDir();
       },
       py::return_value_policy::reference);
@@ -1073,16 +1073,22 @@ Stub::~Stub()
   memory_manager_message_queue_.reset();
 }
 
-std::unique_ptr<Stub> Stub::stub_instance_;
+static std::shared_ptr<triton::backend::python::Stub> stub_instance{nullptr};
 
-std::unique_ptr<Stub>&
+std::shared_ptr<triton::backend::python::Stub>
 Stub::GetOrCreateInstance()
 {
-  if (Stub::stub_instance_.get() == nullptr) {
-    Stub::stub_instance_ = std::make_unique<Stub>();
+  if (!stub_instance) {
+    stub_instance = std::make_shared<triton::backend::python::Stub>();
   }
 
-  return Stub::stub_instance_;
+  return stub_instance;
+}
+
+void
+Stub::DestroyInstance()
+{
+  stub_instance.reset();
 }
 
 void
@@ -1822,7 +1828,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
           "exec",
           [](std::shared_ptr<InferRequest>& infer_request,
              const bool decoupled) {
-            std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+            auto stub = Stub::GetOrCreateInstance();
             std::shared_ptr<InferResponse> response =
                 infer_request->Exec(decoupled);
             py::object response_object;
@@ -1840,7 +1846,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
           "async_exec",
           [](std::shared_ptr<InferRequest>& infer_request,
              const bool decoupled) {
-            std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+            auto stub = Stub::GetOrCreateInstance();
             py::object loop =
                 py::module_::import("asyncio").attr("get_running_loop")();
-            py::cpp_function callback = [&stub, infer_request, decoupled]() {
+            py::cpp_function callback = [stub, infer_request, decoupled]() {
@@ -2125,7 +2131,7 @@ main(int argc, char** argv)
   std::string name = argv[8];
   std::string runtime_modeldir = argv[9];
 
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   try {
     stub->Instantiate(
         shm_growth_size, shm_default_size, shm_region_name, model_path,
@@ -2135,7 +2141,7 @@ main(int argc, char** argv)
   catch (const PythonBackendException& pb_exception) {
     LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what();
     logger.reset();
-    stub.reset();
+    Stub::DestroyInstance();
     exit(1);
   }
 
@@ -2148,7 +2154,7 @@ main(int argc, char** argv)
 #endif
   std::atomic<bool> background_thread_running = {true};
   std::thread background_thread =
-      std::thread([&parent_pid, &background_thread_running, &stub, &logger] {
+      std::thread([stub, &parent_pid, &background_thread_running, &logger] {
         // Send a dummy message after the stub process is launched to notify the
         // parent process that the health thread has started.
         std::unique_ptr<IPCMessage> ipc_message = IPCMessage::Create(
@@ -2180,7 +2186,7 @@ main(int argc, char** argv)
 
             // Destroy stub and exit.
             logger.reset();
-            stub.reset();
+            Stub::DestroyInstance();
             exit(1);
           }
         }
@@ -2213,7 +2219,10 @@ main(int argc, char** argv)
   // this process will no longer hold the GIL lock and destruction of the stub
   // will result in segfault.
   logger.reset();
   stub.reset();
+  // 'stub' is only this function's reference; the singleton's own reference
+  // must be dropped too so main() keeps no owner of the Stub past this point.
+  Stub::DestroyInstance();
 
   return 0;
 }

diff --git a/src/pb_stub.h b/src/pb_stub.h
@@ -96,7 +96,16 @@ struct UtilsMessagePayload {
 class Stub {
  public:
   Stub() : stub_to_parent_thread_(false), parent_to_stub_thread_(false){};
-  static std::unique_ptr<Stub>& GetOrCreateInstance();
+  /// Return the process-wide Stub, creating it on the first call. The
+  /// returned pointer is a shared owner: the Stub stays alive for as long as
+  /// the caller keeps it, even after DestroyInstance() has been called.
+  /// No locking is performed when creating the instance.
+  static std::shared_ptr<Stub> GetOrCreateInstance();
+
+  /// Drop the process-wide reference to the Stub. The Stub is destroyed only
+  /// once every pointer previously returned by GetOrCreateInstance() has been
+  /// released as well. A later GetOrCreateInstance() creates a new Stub.
+  static void DestroyInstance();
 
   /// Instantiate a new Python backend Stub.
   void Instantiate(
@@ -296,7 +305,6 @@ class Stub {
   std::unique_ptr<MessageQueue<uint64_t>> memory_manager_message_queue_;
   bool initialized_;
   bool finalizing_;
-  static std::unique_ptr<Stub> stub_instance_;
   std::vector<std::shared_ptr<PbTensor>> gpu_tensors_;
   std::queue<std::unique_ptr<UtilsMessagePayload>> stub_to_parent_buffer_;
   std::thread stub_to_parent_queue_monitor_;

diff --git a/src/pb_stub_log.cc b/src/pb_stub_log.cc
@@ -1,4 +1,4 @@
-// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -51,7 +51,7 @@ Logger::GetOrCreateInstance()
 void
 Logger::Log(const std::string& message, LogLevel level)
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
   py::object frame = py::module_::import("inspect").attr("currentframe");
   py::object caller_frame = frame();
   py::object info = py::module_::import("inspect").attr("getframeinfo");
@@ -110,7 +110,7 @@ Logger::Log(
 #endif
   } else {
     // Ensure we do not create a stub instance before it has initialized
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     std::unique_ptr<PbLog> log_msg(new PbLog(filename, lineno, message, level));
     stub->EnqueueLogRequest(log_msg);
   }

diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -419,7 +419,7 @@ PbTensor::FromDLPack(const std::string& name, const py::object& tensor)
 #ifdef TRITON_ENABLE_GPU
     int current_device;
     cudaError_t err = cudaGetDevice(&current_device);
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     if (err != cudaSuccess) {
       throw PythonBackendException("Failed to get current CUDA device id.");
     }

diff --git a/src/response_sender.cc b/src/response_sender.cc
@@ -1,4 +1,4 @@
-// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -129,7 +129,7 @@ ResponseSender::Send(
     infer_response->PruneOutputTensors(requested_output_names_);
   }
 
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  auto stub = Stub::GetOrCreateInstance();
 
   AllocatedSharedMemory<ResponseSendMessage> response_send_message =
       shm_pool_->Construct<ResponseSendMessage>(
@@ -279,7 +279,7 @@ ResponseSender::DeleteResponseFactory()
 {
   bool already_deleted = response_factory_deleted_.exchange(true);
   if (!already_deleted) {
-    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    auto stub = Stub::GetOrCreateInstance();
     stub->EnqueueCleanupId(
         reinterpret_cast<void*>(response_factory_address_),
         PYTHONSTUB_DecoupledResponseFactoryCleanup);