From 973c476477248a25547a26751ae98a648d6b671f Mon Sep 17 00:00:00 2001 From: Albert Wu Date: Fri, 26 Jun 2026 19:14:52 -0700 Subject: [PATCH] fix(orchestrator): scope batch claim-error metric to process op MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the batch controller, every infrastructure-error counter is emitted under the `process` operation subscope via metrics.NamedCounter (e.g. deserialize_errors, storage_errors, counter_errors, batch_store_errors, conflict_analyzer_errors, batch_dependent_store_errors, publish_errors), landing at batch_controller.process.*_errors. The request-claim failure path was the lone exception: it incremented c.metricsScope.Counter("request_claim_errors") directly, emitting at batch_controller.request_claim_errors — one scope level above its siblings. The deferred op.Complete(err) still bumps process.failed on that path, so when an operator attributes a process.failed spike by summing process.*_errors, the total is silently short by the request_claim_errors count. Routing it through metrics.NamedCounter restores the per-category breakdown so it reconciles with process.failed. The ack-path outcome counters (skipped_halted, request_claim_lost_race) intentionally stay at controller scope, matching the skipped_* convention in the score and speculate controllers; only error counters belong under the operation subscope. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- submitqueue/orchestrator/controller/batch/batch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submitqueue/orchestrator/controller/batch/batch.go b/submitqueue/orchestrator/controller/batch/batch.go index 0af0d68c..28cdf5eb 100644 --- a/submitqueue/orchestrator/controller/batch/batch.go +++ b/submitqueue/orchestrator/controller/batch/batch.go @@ -270,7 +270,7 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r ) return nil } - c.metricsScope.Counter("request_claim_errors").Inc(1) + metrics.NamedCounter(c.metricsScope, opName, "request_claim_errors", 1) return fmt.Errorf("failed to claim request %s for batch %s: %w", request.ID, batch.ID, err) } request.Version = newRequestVersion