From 25a5aa21a36d8e1801b003224f7f7541c264aa46 Mon Sep 17 00:00:00 2001 From: sergeyb Date: Fri, 26 Jun 2026 22:44:31 +0000 Subject: [PATCH] fix(orchestrator): stop force-marking publish failures retryable The validate, buildsignal, and merge controllers wrapped every queue publish failure with errs.NewRetryableError, unconditionally retrying regardless of the actual cause. That defeats the classifier framework: only known-transient infra errors should retry, otherwise genuinely non-retryable failures replay forever instead of dead-lettering. Return the raw error from these publish paths and let the consumer's ErrorProcessor classify it. Drop the now-unused errs imports (gazelle removes the corresponding BUILD deps). Controller tests no longer assert IsRetryable on the publish path, since classification is the consumer's contract, not the controller's. Docs: clarify in CLAUDE.md and platform/errs/README.md that classifiers own retryability, controllers override only with knowledge a classifier lacks (never just because replaying is convenient), and IsRetryable is detected only after the ErrorProcessor pass. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 5 ++--- platform/errs/README.md | 6 ++++++ submitqueue/orchestrator/controller/buildsignal/BUILD.bazel | 1 - .../orchestrator/controller/buildsignal/buildsignal.go | 6 +----- .../orchestrator/controller/buildsignal/buildsignal_test.go | 6 ++---- submitqueue/orchestrator/controller/merge/BUILD.bazel | 1 - submitqueue/orchestrator/controller/merge/merge.go | 4 +--- submitqueue/orchestrator/controller/merge/merge_test.go | 3 +-- submitqueue/orchestrator/controller/validate/validate.go | 3 +-- .../orchestrator/controller/validate/validate_test.go | 3 --- 10 files changed, 14 insertions(+), 24 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 9b836119..6fb4ef6e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -331,9 +331,8 @@ CI runs on every PR and enforces all checks via a `required-checks` gate. **Befo Errors are classified by origin (user vs infra) and retryability. The framework lives in `platform/errs/`. See [platform/errs/README.md](platform/errs/README.md) for full details. **Key rules:** -1. **Non-retryable by default** — a plain `fmt.Errorf(...)` is non-retryable. Wrap with `errs.NewRetryableError(...)` to opt in to retry. +1. **Non-retryable by default** — a plain `fmt.Errorf(...)` is non-retryable. Retryability is opted into explicitly, but that decision is almost always made by a classifier, not a controller (see rule 4). 2. **Infra by default** — any error not wrapped with `NewUserError` is infra. There is no `NewInfraError`. 3. **Extensions return plain errors** — extension interfaces (`MergeChecker`, `Storage`, `Publisher`) return standard `error` values with their own domain sentinels (e.g. `storage.ErrNotFound`). They do NOT classify errors as user or infra. -4. **Controllers classify errors** — the service controller that calls an extension decides whether the failure is user-caused or infrastructure-caused. The same extension error may be classified differently depending on context. +4. **Classifiers do the bulk of classification; controllers override only with knowledge a classifier lacks** — primary pipeline consumers compose per-backend classifiers into `errs.NewClassifierProcessor(...)`; the processor runs once per chain in the consumer and decides retryability from the raw error. So the common case is a controller returning the raw error (`fmt.Errorf("...: %w", err)`) and letting the classifier verdict stand. Reserve an explicit `errs.New*Error` wrap for the rare case where the controller knows something the classifier cannot infer from the error value alone (e.g. `storage.ErrNotFound` meaning "user asked for a missing resource" *in this call site*). Do **not** wrap a failure as retryable just because replaying it is convenient (e.g. a failed queue publish) — that turns permanent failures into infinite retries instead of dead-lettering. DLQ reconciliation consumers use `errs.AlwaysRetryableProcessor` instead. See [platform/errs/README.md](platform/errs/README.md). 5. **Error chain works end-to-end** — extensions wrap custom errors, controllers wrap with `errs.New*Error`, and `errors.Is`/`errors.As` walks the full chain. -6. **Default classifiers** — primary pipeline consumers compose one or more classifiers (each owning a focused concern such as transport-level signals or a specific driver/library's errors) into `errs.NewClassifierProcessor(...)`. Pick classifiers that match the failure surfaces the consumer actually touches; add a new classifier package when a backend introduces error shapes that no existing one understands, rather than teaching an unrelated classifier about them. DLQ reconciliation consumers use `errs.AlwaysRetryableProcessor` instead so any failure is redelivered rather than dropped. diff --git a/platform/errs/README.md b/platform/errs/README.md index 7b09a1ff..188406aa 100644 --- a/platform/errs/README.md +++ b/platform/errs/README.md @@ -129,6 +129,12 @@ Two practical rules fall out of the short-circuit semantics: - **Wrap with a framework constructor as soon as the controller knows the right verdict.** Any wrap added later in the chain still wins, but wrapping early keeps the intent close to the decision. - **A wrap anywhere in the chain blocks all classifiers — including for nodes deeper than the wrap.** If you want a classifier to still get a look at the cause, do not wrap above it. (In practice this is rare: controllers wrap because they have the final answer.) +### When *not* to classify in a controller + +The controller-override path is for the rare case where the controller has certain knowledge a classifier cannot derive from the error value alone — typically a sentinel (`storage.ErrNotFound`) that means "the user asked for something missing" *in this specific call site*. The default and overwhelmingly common case is the opposite: the controller returns the raw error (`return fmt.Errorf("...: %w", err)`) and lets the consumer's `ErrorProcessor` classify it. + +In particular, **do not reach for `NewRetryableError` just because replaying the message would be convenient.** A failed queue publish, a failed enqueue, a "the hand-off that keeps this alive" step — these are *not* a license to mark the error retryable. Whether such a failure is transient is exactly what a classifier exists to decide: a transport-level classifier wraps genuine connection/timeout blips as retryable, while a malformed-request or permission failure stays non-retryable and dead-letters instead of replaying forever. Blanket `NewRetryableError` on a publish path defeats that and turns every permanent failure into an infinite retry loop. + ## Extensions Return Plain Go Errors Extension interfaces (`MergeChecker`, `Storage`, `Publisher`) return standard `error` values. They may define their own domain-specific sentinel errors (e.g. `storage.ErrNotFound`, `storage.ErrVersionMismatch`) but they do **not** classify errors as user or infra — that is the controller's (and the consumer's `ErrorProcessor`'s) job. diff --git a/submitqueue/orchestrator/controller/buildsignal/BUILD.bazel b/submitqueue/orchestrator/controller/buildsignal/BUILD.bazel index 6d421038..81d631fe 100644 --- a/submitqueue/orchestrator/controller/buildsignal/BUILD.bazel +++ b/submitqueue/orchestrator/controller/buildsignal/BUILD.bazel @@ -8,7 +8,6 @@ go_library( deps = [ "//platform/base/messagequeue", "//platform/consumer", - "//platform/errs", "//platform/metrics", "//submitqueue/core/topickey", "//submitqueue/entity", diff --git a/submitqueue/orchestrator/controller/buildsignal/buildsignal.go b/submitqueue/orchestrator/controller/buildsignal/buildsignal.go index e96a897c..9be2d484 100644 --- a/submitqueue/orchestrator/controller/buildsignal/buildsignal.go +++ b/submitqueue/orchestrator/controller/buildsignal/buildsignal.go @@ -29,7 +29,6 @@ import ( "github.com/uber-go/tally" entityqueue "github.com/uber/submitqueue/platform/base/messagequeue" "github.com/uber/submitqueue/platform/consumer" - "github.com/uber/submitqueue/platform/errs" "github.com/uber/submitqueue/platform/metrics" "github.com/uber/submitqueue/submitqueue/core/topickey" "github.com/uber/submitqueue/submitqueue/entity" @@ -194,10 +193,7 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r metrics.NamedCounter(c.metricsScope, opName, "rescheduled", 1, metrics.NewTag("status", string(status))) if err := c.publishBuild(ctx, c.topicKey, build, delayMs); err != nil { metrics.NamedCounter(c.metricsScope, opName, "publish_errors", 1) - // Retryable: this is the poll loop's heartbeat. A transient enqueue - // failure should nack and replay rather than DLQ the only message - // keeping this build's status loop alive. - return errs.NewRetryableError(fmt.Errorf("failed to re-publish to buildsignal: %w", err)) + return fmt.Errorf("failed to re-publish to buildsignal: %w", err) } c.logger.Debugw("rescheduled build status poll", diff --git a/submitqueue/orchestrator/controller/buildsignal/buildsignal_test.go b/submitqueue/orchestrator/controller/buildsignal/buildsignal_test.go index 4daa13cf..affd0cbf 100644 --- a/submitqueue/orchestrator/controller/buildsignal/buildsignal_test.go +++ b/submitqueue/orchestrator/controller/buildsignal/buildsignal_test.go @@ -233,9 +233,8 @@ func TestController_Process_UpdateStatusError(t *testing.T) { } // TestController_Process_RepublishError verifies that a failure to re-publish -// the delayed poll message is retryable: the re-schedule is the loop's -// heartbeat, so it should nack and replay rather than reject straight to DLQ. -// The preceding status/persist/speculate steps all succeed. +// the delayed poll message surfaces an error. The preceding +// status/persist/speculate steps all succeed. func TestController_Process_RepublishError(t *testing.T) { ctrl := gomock.NewController(t) h := newTestHarness(t, ctrl) @@ -253,7 +252,6 @@ func TestController_Process_RepublishError(t *testing.T) { err := h.controller.Process(context.Background(), buildDelivery(t, ctrl, build)) require.Error(t, err) - assert.True(t, errs.IsRetryable(err)) } // TestController_Process_GetError verifies that a failure to load the Build diff --git a/submitqueue/orchestrator/controller/merge/BUILD.bazel b/submitqueue/orchestrator/controller/merge/BUILD.bazel index b31f408c..f4925283 100644 --- a/submitqueue/orchestrator/controller/merge/BUILD.bazel +++ b/submitqueue/orchestrator/controller/merge/BUILD.bazel @@ -12,7 +12,6 @@ go_library( "//platform/base/mergestrategy", "//platform/base/messagequeue", "//platform/consumer", - "//platform/errs", "//platform/metrics", "//submitqueue/entity", "//submitqueue/extension/storage", diff --git a/submitqueue/orchestrator/controller/merge/merge.go b/submitqueue/orchestrator/controller/merge/merge.go index 9bdbb5fc..d1d886ce 100644 --- a/submitqueue/orchestrator/controller/merge/merge.go +++ b/submitqueue/orchestrator/controller/merge/merge.go @@ -34,7 +34,6 @@ import ( "github.com/uber/submitqueue/platform/base/mergestrategy" entityqueue "github.com/uber/submitqueue/platform/base/messagequeue" "github.com/uber/submitqueue/platform/consumer" - "github.com/uber/submitqueue/platform/errs" "github.com/uber/submitqueue/platform/metrics" "github.com/uber/submitqueue/submitqueue/entity" "github.com/uber/submitqueue/submitqueue/extension/storage" @@ -145,8 +144,7 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r if err := c.publish(ctx, c.runwayTopicKey, req, batch.Queue); err != nil { metrics.NamedCounter(c.metricsScope, opName, "publish_errors", 1) - // Retryable: the hand-off to runway is what keeps this merge alive. - return errs.NewRetryableError(fmt.Errorf("failed to publish to runway merge: %w", err)) + return fmt.Errorf("failed to publish to runway merge: %w", err) } c.logger.Infow("published merge to runway", diff --git a/submitqueue/orchestrator/controller/merge/merge_test.go b/submitqueue/orchestrator/controller/merge/merge_test.go index 3476f6fb..1b8eab35 100644 --- a/submitqueue/orchestrator/controller/merge/merge_test.go +++ b/submitqueue/orchestrator/controller/merge/merge_test.go @@ -190,7 +190,7 @@ func TestProcess_HaltedBatchSkips(t *testing.T) { } } -func TestProcess_PublishFailureIsRetryable(t *testing.T) { +func TestProcess_PublishFailureReturnsError(t *testing.T) { ctrl := gomock.NewController(t) const batchID = "test-queue/batch/2" @@ -218,7 +218,6 @@ func TestProcess_PublishFailureIsRetryable(t *testing.T) { c := newController(t, store, registry) err = c.Process(context.Background(), newDelivery(t, ctrl, batchID, batch.Queue)) require.Error(t, err) - assert.True(t, errs.IsRetryable(err)) } func TestProcess_BatchStoreGetFailureNotRetryable(t *testing.T) { diff --git a/submitqueue/orchestrator/controller/validate/validate.go b/submitqueue/orchestrator/controller/validate/validate.go index d030a86f..222cdac3 100644 --- a/submitqueue/orchestrator/controller/validate/validate.go +++ b/submitqueue/orchestrator/controller/validate/validate.go @@ -187,8 +187,7 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r } if err := c.publishMergeCheck(ctx, req); err != nil { coremetrics.NamedCounter(c.metricsScope, "process", "publish_errors", 1) - // Retryable: the hand-off to runway is what keeps this check alive. - return errs.NewRetryableError(fmt.Errorf("failed to publish to runway merge-conflict-check: %w", err)) + return fmt.Errorf("failed to publish to runway merge-conflict-check: %w", err) } c.logger.Infow("published merge conflict check to runway", diff --git a/submitqueue/orchestrator/controller/validate/validate_test.go b/submitqueue/orchestrator/controller/validate/validate_test.go index 0a009485..8f0c6fbc 100644 --- a/submitqueue/orchestrator/controller/validate/validate_test.go +++ b/submitqueue/orchestrator/controller/validate/validate_test.go @@ -310,11 +310,8 @@ func TestController_Process_PublishFailure(t *testing.T) { delivery.EXPECT().Message().Return(msg).AnyTimes() delivery.EXPECT().Attempt().Return(1).AnyTimes() - // The hand-off to runway is retryable: a transient enqueue blip should replay - // rather than strand the request. err := controller.Process(context.Background(), delivery) require.Error(t, err) - assert.True(t, errs.IsRetryable(err)) } func TestController_InterfaceImplementation(t *testing.T) {