triton-inference-server · pskiran1 · Apr 2, 2026 · Mar 21, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/protobuf/model_config.proto b/protobuf/model_config.proto
@@ -1,4 +1,4 @@
-// Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2018-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1662,11 +1662,13 @@ message ModelEnsembling
 
   //@@  .. cpp:var:: uint32 max_inflight_requests
   //@@
-  //@@     The maximum number of concurrent inflight requests allowed at each
-  //@@     ensemble step per inference request. This limit prevents unbounded
-  //@@     memory growth when ensemble steps produce responses faster than
-  //@@     downstream steps can consume, e.g. decoupled models.
-  //@@     Default value is 0, which indicates that no limit is enforced.
+  //@@     BETA (subject to change).
+  //@@     The maximum number of concurrent in-flight requests allowed at each
+  //@@     ensemble step across all ongoing ensemble requests for this model
+  //@@     instance. This per-step limit prevents unbounded memory growth when
+  //@@     ensemble steps produce responses faster than downstream steps can
+  //@@     consume them (for example, in decoupled models).
+  //@@     The default value is 0, which indicates that no limit is enforced.
   //@@
   //@@     Note: Applying this limit may block upstream steps while they wait
   //@@     for downstream capacity. This blocking does not cancel or internally