@@ -7693,40 +7693,8 @@ private CosmosEndToEndOperationLatencyPolicyConfig evaluatePpafEnforcedE2eLatenc
76937693
76947694 /**
76957695 * Evaluates whether a PPAF-enforced E2E latency policy should be applied for write operations.
7696- *
7697- * <h3>Timeout Design Rationale (DIRECT mode)</h3>
7698- * <p>With default {@code tcpNetworkRequestTimeout} of 5s:
7699- * <ul>
7700- * <li><b>Overall E2E timeout = 4x networkRequestTimeout (20s)</b>:
7701- * When a 410 (Gone) is returned, {@code GoneAndRetryWithRetryPolicy} retries with
7702- * exponential backoff (typically 3-4 retries over ~6-8s). The primary Mono consumes
7703- * this budget. The hedge fires at 1x timeout (5s), leaving 15s for the hedged request
7704- * to complete — ample margin for address resolution + transport + response.</li>
7705- * <li><b>Threshold = 1x networkRequestTimeout (5s)</b>:
7706- * Gives the primary write region a full network-timeout window before hedging.
7707- * This avoids premature hedging on transiently slow writes (which are common under
7708- * high throughput) while still firing the hedge well before the E2E timeout.</li>
7709- * <li><b>ThresholdStep = min(threshold/2, 1s)</b>:
7710- * Standard step between consecutive hedge regions (matching read pattern).</li>
7711- * </ul>
7712- *
7713- * <h3>Timeout Design Rationale (GATEWAY mode)</h3>
7714- * <p>With default {@code httpNetworkRequestTimeout} of 60s:
7715- * <ul>
7716- * <li><b>Overall E2E timeout = 1x httpNetworkRequestTimeout (60s)</b>:
7717- * Gateway mode already bounds requests at the HTTP layer. Using the full timeout
7718- * ensures the E2E timeout doesn't prematurely kill writes that are proceeding
7719- * normally but slowly.</li>
7720- * <li><b>Threshold = min(timeout/2, 2s) = 2s</b>:
7721- * Aggressively hedges after 2s — justified because gateway mode has higher inherent
7722- * latency and the 60s overall budget provides ample room.</li>
7723- * </ul>
7724- *
7725- * <h3>Why separate from reads</h3>
7726- * <p>Writes are inherently more expensive (quorum replication, conflict resolution).
7727- * The cost of a redundant hedged write is higher than a redundant read. The more generous
7728- * threshold (1x vs 0.5x for reads) ensures hedging only activates when the primary
7729- * region is genuinely unresponsive, not merely slow under load.</p>
7696+ * Uses the same timeout/threshold values as the read policy — the availability strategy
7697+ * hedging behavior should be symmetric for reads and writes under PPAF.
77307698 */
77317699 private CosmosEndToEndOperationLatencyPolicyConfig evaluatePpafEnforcedE2eLatencyPolicyCfgForWrites (
77327700 GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover globalPartitionEndpointManagerForPerPartitionAutomaticFailover ,
@@ -7745,14 +7713,9 @@ private CosmosEndToEndOperationLatencyPolicyConfig evaluatePpafEnforcedE2eLatenc
77457713
77467714 checkNotNull (networkRequestTimeout , "Argument 'networkRequestTimeout' cannot be null!" );
77477715
7748- // For writes, use 4x network timeout as the overall E2E timeout.
7749- // This needs to be generous because GoneAndRetryWithRetryPolicy (for 410 errors)
7750- // retries with backoff, consuming budget. The hedge fires at 1x network timeout,
7751- // leaving 3x for the hedged request to complete after retry overhead.
7752- Duration overallE2eLatencyTimeout = networkRequestTimeout .multipliedBy (4 );
7753- // Threshold = full network timeout — give primary region a fair chance
7754- Duration threshold = networkRequestTimeout ;
7755- Duration thresholdStep = Utils .min (threshold .dividedBy (2 ), Utils .ONE_SECOND );
7716+ Duration overallE2eLatencyTimeout = networkRequestTimeout .plus (Utils .ONE_SECOND );
7717+ Duration threshold = Utils .min (networkRequestTimeout .dividedBy (2 ), Utils .ONE_SECOND );
7718+ Duration thresholdStep = Utils .min (threshold .dividedBy (2 ), Utils .HALF_SECOND );
77567719
77577720 return new CosmosEndToEndOperationLatencyPolicyConfigBuilder (overallE2eLatencyTimeout )
77587721 .availabilityStrategy (new ThresholdBasedAvailabilityStrategy (threshold , thresholdStep ))
@@ -7763,11 +7726,10 @@ private CosmosEndToEndOperationLatencyPolicyConfig evaluatePpafEnforcedE2eLatenc
77637726
77647727 checkNotNull (httpNetworkRequestTimeout , "Argument 'httpNetworkRequestTimeout' cannot be null!" );
77657728
7766- // For gateway writes, use the full HTTP request timeout as the E2E timeout
7767- Duration overallE2eLatencyTimeout = httpNetworkRequestTimeout ;
7729+ Duration overallE2eLatencyTimeout = Utils .min (Utils .SIX_SECONDS , httpNetworkRequestTimeout );
77687730
7769- Duration threshold = Utils .min (overallE2eLatencyTimeout .dividedBy (2 ), Duration . ofSeconds ( 2 ) );
7770- Duration thresholdStep = Utils .min (threshold .dividedBy (2 ), Utils .ONE_SECOND );
7731+ Duration threshold = Utils .min (overallE2eLatencyTimeout .dividedBy (2 ), Utils . ONE_SECOND );
7732+ Duration thresholdStep = Utils .min (threshold .dividedBy (2 ), Utils .HALF_SECOND );
77717733
77727734 return new CosmosEndToEndOperationLatencyPolicyConfigBuilder (overallE2eLatencyTimeout )
77737735 .availabilityStrategy (new ThresholdBasedAvailabilityStrategy (threshold , thresholdStep ))
0 commit comments