diff --git a/internal/operator-controller/controllers/clusterobjectset_controller.go b/internal/operator-controller/controllers/clusterobjectset_controller.go index 2d81b71c9..bc081b676 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller.go @@ -14,6 +14,7 @@ import ( "strings" "time" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -23,6 +24,7 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" "pkg.package-operator.run/boxcutter" "pkg.package-operator.run/boxcutter/machinery" @@ -32,8 +34,8 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -83,29 +85,6 @@ func (c *ClusterObjectSetReconciler) Reconcile(ctx context.Context, req ctrl.Req reconciledRev := existingRev.DeepCopy() res, reconcileErr := c.reconcile(ctx, reconciledRev) - if pd := existingRev.Spec.ProgressDeadlineMinutes; pd > 0 { - cnd := meta.FindStatusCondition(reconciledRev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) - isStillProgressing := cnd != nil && cnd.Status == metav1.ConditionTrue && cnd.Reason != ocv1.ReasonSucceeded - succeeded := meta.IsStatusConditionTrue(reconciledRev.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded) - // check if we reached the progress deadline only if the revision is still progressing and has not succeeded yet - if isStillProgressing && !succeeded { - timeout := time.Duration(pd) * time.Minute - if c.Clock.Since(existingRev.CreationTimestamp.Time) > timeout { - // progress deadline reached, reset any errors and stop reconciling this revision - markAsNotProgressing(reconciledRev, ocv1.ReasonProgressDeadlineExceeded, fmt.Sprintf("Revision has not rolled out for %d minute(s).", pd)) - reconcileErr = nil - res = ctrl.Result{} - } else if reconcileErr == nil { - // We want to requeue so far in the future that the next reconciliation - // can detect if the revision did not progress within the given timeout. - // Thus, we plan the next reconcile slightly after (+2secs) the timeout is passed. - drift := 2 * time.Second - requeueAfter := existingRev.CreationTimestamp.Time.Add(timeout).Add(drift).Sub(c.Clock.Now()).Round(time.Second) - l.Info(fmt.Sprintf("ProgressDeadline not exceeded, requeue after ~%v to check again.", requeueAfter)) - res = ctrl.Result{RequeueAfter: requeueAfter} - } - } - } // Do checks before any Update()s, as Update() may modify the resource structure! updateStatus := !equality.Semantic.DeepEqual(existingRev.Status, reconciledRev.Status) @@ -146,6 +125,10 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl return c.delete(ctx, cos) } + remaining, hasDeadline := durationUntilDeadline(c.Clock, cos) + isDeadlineExceeded := hasDeadline && remaining <= 0 + + // Blocked takes precedence over ProgressDeadlineExceeded: it is more actionable for the user. if err := c.verifyReferencedSecretsImmutable(ctx, cos); err != nil { l.Error(err, "referenced Secret verification failed, blocking reconciliation") markAsNotProgressing(cos, ocv1.ClusterObjectSetReasonBlocked, err.Error()) @@ -154,7 +137,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl phases, currentPhases, opts, err := c.buildBoxcutterPhases(ctx, cos) if err != nil { - setRetryingConditions(cos, err.Error()) + setRetryingConditions(l, cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("converting to boxcutter revision: %v", err) } @@ -168,7 +151,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl revisionEngine, err := c.RevisionEngineFactory.CreateRevisionEngine(ctx, cos) if err != nil { - setRetryingConditions(cos, err.Error()) + setRetryingConditions(l, cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("failed to create revision engine: %v", err) } @@ -194,7 +177,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl if err := c.establishWatch(ctx, cos, revision); err != nil { werr := fmt.Errorf("establish watch: %v", err) - setRetryingConditions(cos, werr.Error()) + setRetryingConditions(l, cos, werr.Error(), isDeadlineExceeded) return ctrl.Result{}, werr } @@ -204,7 +187,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl // Log detailed reconcile reports only in debug mode (V(1)) to reduce verbosity. l.V(1).Info("reconcile report", "report", rres.String()) } - setRetryingConditions(cos, err.Error()) + setRetryingConditions(l, cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("revision reconcile: %v", err) } @@ -212,14 +195,14 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl // TODO: report status, backoff? if verr := rres.GetValidationError(); verr != nil { l.Error(fmt.Errorf("%w", verr), "preflight validation failed, retrying after 10s") - setRetryingConditions(cos, fmt.Sprintf("revision validation error: %s", verr)) + setRetryingConditions(l, cos, fmt.Sprintf("revision validation error: %s", verr), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } for i, pres := range rres.GetPhases() { if verr := pres.GetValidationError(); verr != nil { l.Error(fmt.Errorf("%w", verr), "phase preflight validation failed, retrying after 10s", "phase", i) - setRetryingConditions(cos, fmt.Sprintf("phase %d validation error: %s", i, verr)) + setRetryingConditions(l, cos, fmt.Sprintf("phase %d validation error: %s", i, verr), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } @@ -232,14 +215,14 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl if len(collidingObjs) > 0 { l.Error(fmt.Errorf("object collision detected"), "object collision, retrying after 10s", "phase", i, "collisions", collidingObjs) - setRetryingConditions(cos, fmt.Sprintf("revision object collisions in phase %d\n%s", i, strings.Join(collidingObjs, "\n\n"))) + setRetryingConditions(l, cos, fmt.Sprintf("revision object collisions in phase %d\n%s", i, strings.Join(collidingObjs, "\n\n")), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } } revVersion := cos.GetAnnotations()[labels.BundleVersionKey] if rres.InTransition() { - markAsProgressing(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion)) + markAsProgressing(l, cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion), isDeadlineExceeded) } //nolint:nestif @@ -259,7 +242,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl } } - markAsProgressing(cos, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion)) + markAsProgressing(l, cos, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion), isDeadlineExceeded) markAsAvailable(cos, ocv1.ClusterObjectSetReasonProbesSucceeded, "Objects are available and pass all probes.") // We'll probably only want to remove this once we are done updating the ClusterExtension conditions @@ -304,8 +287,9 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl } else { markAsUnavailable(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion)) } - if meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) == nil { - markAsProgressing(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion)) + markAsProgressing(l, cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion), isDeadlineExceeded) + if hasDeadline && !isDeadlineExceeded { + return ctrl.Result{RequeueAfter: remaining}, nil } } @@ -324,14 +308,15 @@ func (c *ClusterObjectSetReconciler) delete(ctx context.Context, cos *ocv1.Clust } func (c *ClusterObjectSetReconciler) archive(ctx context.Context, revisionEngine RevisionEngine, cos *ocv1.ClusterObjectSet, revision boxcutter.RevisionBuilder) (ctrl.Result, error) { + l := log.FromContext(ctx) tdres, err := revisionEngine.Teardown(ctx, revision) if err != nil { err = fmt.Errorf("error archiving revision: %v", err) - setRetryingConditions(cos, err.Error()) + setRetryingConditions(l, cos, err.Error(), false) return ctrl.Result{}, err } if tdres != nil && !tdres.IsComplete() { - setRetryingConditions(cos, "removing revision resources that are not owned by another revision") + setRetryingConditions(l, cos, "removing revision resources that are not owned by another revision", false) return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } // Ensure conditions are set before removing the finalizer when archiving @@ -349,29 +334,19 @@ type Sourcoser interface { } func (c *ClusterObjectSetReconciler) SetupWithManager(mgr ctrl.Manager) error { - skipProgressDeadlineExceededPredicate := predicate.Funcs{ - UpdateFunc: func(e event.UpdateEvent) bool { - rev, ok := e.ObjectNew.(*ocv1.ClusterObjectSet) - if !ok { - return true - } - // allow deletions to happen - if !rev.DeletionTimestamp.IsZero() { - return true - } - if cnd := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil && cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded { - return false - } - return true - }, - } c.Clock = clock.RealClock{} return ctrl.NewControllerManagedBy(mgr). + WithOptions(controller.Options{ + RateLimiter: newDeadlineAwareRateLimiter( + workqueue.DefaultTypedControllerRateLimiter[ctrl.Request](), + mgr.GetClient(), + c.Clock, + ), + }). For( &ocv1.ClusterObjectSet{}, builder.WithPredicates( predicate.ResourceVersionChangedPredicate{}, - skipProgressDeadlineExceededPredicate, ), ). WatchesRawSource( @@ -659,14 +634,32 @@ func buildProgressionProbes(progressionProbes []ocv1.ProgressionProbe) (probing. return userProbes, nil } -func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string) { - markAsProgressing(cos, ocv1.ClusterObjectSetReasonRetrying, message) +func setRetryingConditions(l logr.Logger, cos *ocv1.ClusterObjectSet, message string, isDeadlineExceeded bool) { + markAsProgressing(l, cos, ocv1.ClusterObjectSetReasonRetrying, message, isDeadlineExceeded) if meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable) != nil { markAsAvailableUnknown(cos, ocv1.ClusterObjectSetReasonReconciling, message) } } -func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string) { +var nonTerminalProgressingReasons = map[string]struct{}{ + ocv1.ReasonRollingOut: {}, + ocv1.ClusterObjectSetReasonRetrying: {}, +} + +func markAsProgressing(l logr.Logger, cos *ocv1.ClusterObjectSet, reason, message string, isDeadlineExceeded bool) { + switch reason { + case ocv1.ReasonSucceeded: + // Terminal — always apply. + default: + if _, known := nonTerminalProgressingReasons[reason]; !known { + l.Error(fmt.Errorf("unregistered progressing reason: %q", reason), "treating as non-terminal for deadline enforcement") + } + if isDeadlineExceeded { + markAsNotProgressing(cos, ocv1.ReasonProgressDeadlineExceeded, + fmt.Sprintf("Revision has not rolled out for %d minute(s). Last status: %s", cos.Spec.ProgressDeadlineMinutes, message)) + return + } + } meta.SetStatusCondition(&cos.Status.Conditions, metav1.Condition{ Type: ocv1.ClusterObjectSetTypeProgressing, Status: metav1.ConditionTrue, diff --git a/internal/operator-controller/controllers/clusterobjectset_controller_test.go b/internal/operator-controller/controllers/clusterobjectset_controller_test.go index b3b10f575..75e7a2f8e 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller_test.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller_test.go @@ -988,7 +988,7 @@ func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadline(t *testing.T) { revisionResult: &mockRevisionResult{ inTransition: true, }, - reconcileResult: ctrl.Result{RequeueAfter: 62 * time.Second}, + reconcileResult: ctrl.Result{RequeueAfter: 60 * time.Second}, validate: func(t *testing.T, c client.Client) { rev := &ocv1.ClusterObjectSet{} err := c.Get(t.Context(), client.ObjectKey{ @@ -1000,6 +1000,39 @@ func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadline(t *testing.T) { require.Equal(t, ocv1.ReasonRollingOut, cnd.Reason) }, }, + { + name: "recovery from ProgressDeadlineExceeded to Succeeded when revision completes", + existingObjs: func() []client.Object { + ext := newTestClusterExtension() + rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme) + rev1.Spec.ProgressDeadlineMinutes = 1 + rev1.CreationTimestamp = metav1.NewTime(time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC)) + meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{ + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + Message: "Revision has not rolled out for 1 minute(s). Last status: Revision 1.0.0 is rolling out.", + ObservedGeneration: rev1.Generation, + }) + return []client.Object{rev1, ext} + }, + clock: clocktesting.NewFakeClock(time.Date(2022, 1, 1, 0, 5, 0, 0, time.UTC)), + revisionResult: &mockRevisionResult{ + isComplete: true, + }, + validate: func(t *testing.T, c client.Client) { + rev := &ocv1.ClusterObjectSet{} + err := c.Get(t.Context(), client.ObjectKey{ + Name: clusterObjectSetName, + }, rev) + require.NoError(t, err) + cnd := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) + require.NotNil(t, cnd) + require.Equal(t, metav1.ConditionTrue, cnd.Status) + require.Equal(t, ocv1.ReasonSucceeded, cnd.Reason) + require.Equal(t, "Revision 1.0.0 has rolled out.", cnd.Message) + }, + }, { name: "no progression deadline checks on revision recovery", existingObjs: func() []client.Object { diff --git a/internal/operator-controller/controllers/progress_deadline.go b/internal/operator-controller/controllers/progress_deadline.go new file mode 100644 index 000000000..5bce12307 --- /dev/null +++ b/internal/operator-controller/controllers/progress_deadline.go @@ -0,0 +1,110 @@ +//go:build !standard + +package controllers + +import ( + "context" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/client-go/util/workqueue" + "k8s.io/utils/clock" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + ocv1 "github.com/operator-framework/operator-controller/api/v1" +) + +// deadlineAwareRateLimiter wraps a delegate rate limiter and caps the backoff +// duration to the time remaining until the COS progress deadline, ensuring +// that ProgressDeadlineExceeded is set promptly even during exponential backoff. +// +// After the deadline passes, it allows one immediate requeue (returning 0) so +// the reconciler can set the ProgressDeadlineExceeded condition, then falls +// back to the delegate's normal backoff. This avoids both tight-looping and +// coupling to the COS's status conditions. +type deadlineAwareRateLimiter struct { + delegate workqueue.TypedRateLimiter[ctrl.Request] + client client.Reader + clock clock.Clock + pastDeadline sync.Map +} + +func newDeadlineAwareRateLimiter( + delegate workqueue.TypedRateLimiter[ctrl.Request], + c client.Reader, + clk clock.Clock, +) *deadlineAwareRateLimiter { + return &deadlineAwareRateLimiter{delegate: delegate, client: c, clock: clk} +} + +func (r *deadlineAwareRateLimiter) When(item ctrl.Request) time.Duration { + backoff := r.delegate.When(item) + + cos := &ocv1.ClusterObjectSet{} + if err := r.client.Get(context.Background(), item.NamespacedName, cos); err != nil { + return backoff + } + + remaining, hasDeadline := durationUntilDeadline(r.clock, cos) + if !hasDeadline { + return backoff + } + if remaining > 0 { + if remaining < backoff { + return remaining + } + return backoff + } + + // Deadline has passed — allow one immediate requeue, then delegate. + if _, already := r.pastDeadline.LoadOrStore(item, struct{}{}); !already { + return 0 + } + return backoff +} + +func (r *deadlineAwareRateLimiter) Forget(item ctrl.Request) { + r.delegate.Forget(item) + r.pastDeadline.Delete(item) +} + +func (r *deadlineAwareRateLimiter) NumRequeues(item ctrl.Request) int { + return r.delegate.NumRequeues(item) +} + +// durationUntilDeadline returns how much time remains before the progress deadline +// expires. A negative duration means the deadline has already passed. +// +// It derives the deadline from spec and metadata only, with one exception: +// it checks the Succeeded status condition so that a revision recovering +// from drift is not penalised by the original deadline. +// +// Succeeded is a latch: there is no way to deduce from current cluster state +// alone that a COS succeeded in the past. If Succeeded is removed or set to +// False, this function will return a deadline and the reconciler will set +// ProgressDeadlineExceeded even though the revision previously succeeded. +// +// Returns (0, false) when there is no active deadline: +// - progressDeadlineMinutes is 0 +// - the revision has already succeeded +// - the revision is archived (deadline is irrelevant) +// - the revision is being deleted +func durationUntilDeadline(clk clock.Clock, cos *ocv1.ClusterObjectSet) (time.Duration, bool) { + pd := cos.Spec.ProgressDeadlineMinutes + if pd <= 0 { + return 0, false + } + if meta.IsStatusConditionTrue(cos.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded) { + return 0, false + } + if cos.Spec.LifecycleState == ocv1.ClusterObjectSetLifecycleStateArchived { + return 0, false + } + if !cos.DeletionTimestamp.IsZero() { + return 0, false + } + deadline := cos.CreationTimestamp.Add(time.Duration(pd) * time.Minute) + return deadline.Sub(clk.Now()), true +} diff --git a/internal/operator-controller/controllers/progress_deadline_test.go b/internal/operator-controller/controllers/progress_deadline_test.go new file mode 100644 index 000000000..73102e9ab --- /dev/null +++ b/internal/operator-controller/controllers/progress_deadline_test.go @@ -0,0 +1,266 @@ +//go:build !standard + +package controllers + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + clocktesting "k8s.io/utils/clock/testing" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + ocv1 "github.com/operator-framework/operator-controller/api/v1" +) + +func TestDurationUntilDeadline(t *testing.T) { + creation := time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC) + now := creation.Add(30 * time.Second) + clk := clocktesting.NewFakeClock(now) + + for _, tc := range []struct { + name string + cos ocv1.ClusterObjectSet + expectDuration time.Duration + expectHasDeadline bool + }{ + { + name: "progressDeadlineMinutes is 0 — no deadline", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation)}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 0, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + }, + expectDuration: 0, + expectHasDeadline: false, + }, + { + name: "Succeeded is true — no deadline", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation)}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + Status: ocv1.ClusterObjectSetStatus{ + Conditions: []metav1.Condition{{ + Type: ocv1.ClusterObjectSetTypeSucceeded, + Status: metav1.ConditionTrue, + }}, + }, + }, + expectDuration: 0, + expectHasDeadline: false, + }, + { + name: "lifecycleState is Archived — no deadline", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation)}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateArchived}, + }, + expectDuration: 0, + expectHasDeadline: false, + }, + { + name: "DeletionTimestamp is set — no deadline", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + CreationTimestamp: metav1.NewTime(creation), + DeletionTimestamp: &metav1.Time{Time: now}, + }, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + }, + expectDuration: 0, + expectHasDeadline: false, + }, + { + name: "deadline not yet exceeded — returns positive remaining", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation)}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + }, + expectDuration: 30 * time.Second, + expectHasDeadline: true, + }, + { + name: "deadline already exceeded — returns negative remaining", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation.Add(-2 * time.Minute))}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + }, + expectDuration: -90 * time.Second, + expectHasDeadline: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + duration, hasDeadline := durationUntilDeadline(clk, &tc.cos) + require.Equal(t, tc.expectHasDeadline, hasDeadline) + require.Equal(t, tc.expectDuration, duration) + }) + } +} + +type fixedRateLimiter struct { + duration time.Duration +} + +func (f *fixedRateLimiter) When(_ ctrl.Request) time.Duration { return f.duration } +func (f *fixedRateLimiter) Forget(_ ctrl.Request) {} +func (f *fixedRateLimiter) NumRequeues(_ ctrl.Request) int { return 0 } + +func TestDeadlineAwareRateLimiter(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, ocv1.AddToScheme(scheme)) + + creation := time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC) + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "test-cos"}} + + for _, tc := range []struct { + name string + backoff time.Duration + cos *ocv1.ClusterObjectSet + clockTime time.Time + expectDuration time.Duration + }{ + { + name: "no deadline configured — uses delegate backoff", + backoff: 30 * time.Second, + clockTime: creation, + cos: &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + }, + }, + expectDuration: 30 * time.Second, + }, + { + name: "deadline not exceeded and backoff is shorter — uses delegate backoff", + backoff: 5 * time.Second, + clockTime: creation, + cos: &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + }, + expectDuration: 5 * time.Second, + }, + { + name: "deadline not exceeded and backoff is longer — caps at deadline", + backoff: 5 * time.Minute, + clockTime: creation, + cos: &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + }, + expectDuration: 60 * time.Second, + }, + { + name: "deadline exceeded — first call returns immediate requeue", + backoff: 30 * time.Second, + clockTime: creation.Add(61 * time.Second), + cos: &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + }, + expectDuration: 0, + }, + { + name: "COS not found — uses delegate backoff", + backoff: 30 * time.Second, + clockTime: creation, + cos: nil, + expectDuration: 30 * time.Second, + }, + } { + t.Run(tc.name, func(t *testing.T) { + builder := fake.NewClientBuilder().WithScheme(scheme) + if tc.cos != nil { + builder = builder.WithObjects(tc.cos) + } + + limiter := newDeadlineAwareRateLimiter( + &fixedRateLimiter{duration: tc.backoff}, + builder.Build(), + clocktesting.NewFakeClock(tc.clockTime), + ) + + testReq := req + if tc.cos == nil { + testReq.Name = "nonexistent" + } + + result := limiter.When(testReq) + require.Equal(t, tc.expectDuration, result) + }) + } + + t.Run("deadline exceeded — second call uses delegate backoff (one-shot)", func(t *testing.T) { + cos := &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + } + limiter := newDeadlineAwareRateLimiter( + &fixedRateLimiter{duration: 30 * time.Second}, + fake.NewClientBuilder().WithScheme(scheme).WithObjects(cos).Build(), + clocktesting.NewFakeClock(creation.Add(61*time.Second)), + ) + + first := limiter.When(req) + require.Equal(t, time.Duration(0), first) + + second := limiter.When(req) + require.Equal(t, 30*time.Second, second) + }) + + t.Run("Forget resets the one-shot flag", func(t *testing.T) { + cos := &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + } + limiter := newDeadlineAwareRateLimiter( + &fixedRateLimiter{duration: 30 * time.Second}, + fake.NewClientBuilder().WithScheme(scheme).WithObjects(cos).Build(), + clocktesting.NewFakeClock(creation.Add(61*time.Second)), + ) + + require.Equal(t, time.Duration(0), limiter.When(req)) + require.Equal(t, 30*time.Second, limiter.When(req)) + + limiter.Forget(req) + + require.Equal(t, time.Duration(0), limiter.When(req)) + }) +} diff --git a/test/e2e/features/install.feature b/test/e2e/features/install.feature index ceb5e32c2..6265a8eff 100644 --- a/test/e2e/features/install.feature +++ b/test/e2e/features/install.feature @@ -435,7 +435,7 @@ Feature: Install ClusterExtension Then ClusterObjectSet "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message: """ - Revision has not rolled out for 1 minute(s). + Revision has not rolled out for 1 minute(s). Last status: Revision 1.0.2 is rolling out. """ And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation @@ -471,7 +471,7 @@ Feature: Install ClusterExtension Then ClusterObjectSet "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message: """ - Revision has not rolled out for 1 minute(s). + Revision has not rolled out for 1 minute(s). Last status: Revision 1.0.3 is rolling out. """ And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation diff --git a/test/e2e/features/revision.feature b/test/e2e/features/revision.feature index 38ed16027..866e8195f 100644 --- a/test/e2e/features/revision.feature +++ b/test/e2e/features/revision.feature @@ -601,3 +601,141 @@ Feature: Install ClusterObjectSet """ And ClusterObjectSet "${COS_NAME}" reconciliation is triggered Then ClusterObjectSet "${COS_NAME}" reports Progressing as True with Reason Succeeded + + + @ProgressDeadline + Scenario: Archiving a COS with ProgressDeadlineExceeded cleans up its resources + Given min value for ClusterObjectSet .spec.progressDeadlineMinutes is set to 1 + And ServiceAccount "olm-sa" with needed permissions is available in test namespace + When ClusterObjectSet is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterObjectSet + metadata: + annotations: + olm.operatorframework.io/service-account-name: olm-sa + olm.operatorframework.io/service-account-namespace: ${TEST_NAMESPACE} + name: ${COS_NAME} + spec: + lifecycleState: Active + collisionProtection: Prevent + progressDeadlineMinutes: 1 + progressionProbes: + - selector: + type: GroupKind + groupKind: + group: apps + kind: Deployment + assertions: + - type: ConditionEqual + conditionEqual: + type: Available + status: "True" + phases: + - name: resources + objects: + - object: + apiVersion: v1 + kind: ConfigMap + metadata: + name: test-configmap + namespace: ${TEST_NAMESPACE} + data: + foo: bar + - object: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: test-deployment + namespace: ${TEST_NAMESPACE} + spec: + replicas: 1 + selector: + matchLabels: + app: never-ready + template: + metadata: + labels: + app: never-ready + spec: + containers: + - name: never-ready + image: does-not-exist:latest + revision: 1 + """ + Then resource "configmap/test-configmap" is installed + And resource "deployment/test-deployment" is installed + And ClusterObjectSet "${COS_NAME}" reports Progressing as False with Reason ProgressDeadlineExceeded + When ClusterObjectSet "${COS_NAME}" lifecycle is set to "Archived" + Then ClusterObjectSet "${COS_NAME}" is archived + And resource "configmap/test-configmap" is eventually not found + And resource "deployment/test-deployment" is eventually not found + + @ProgressDeadline + Scenario: COS recovers from ProgressDeadlineExceeded to Succeeded when probes pass + Given min value for ClusterObjectSet .spec.progressDeadlineMinutes is set to 1 + And ServiceAccount "olm-sa" with needed permissions is available in test namespace + When ClusterObjectSet is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterObjectSet + metadata: + annotations: + olm.operatorframework.io/service-account-name: olm-sa + olm.operatorframework.io/service-account-namespace: ${TEST_NAMESPACE} + name: ${COS_NAME} + spec: + lifecycleState: Active + collisionProtection: Prevent + progressDeadlineMinutes: 1 + progressionProbes: + - selector: + type: GroupKind + groupKind: + group: apps + kind: Deployment + assertions: + - type: ConditionEqual + conditionEqual: + type: Available + status: "True" + phases: + - name: resources + objects: + - object: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: test-deployment + namespace: ${TEST_NAMESPACE} + spec: + replicas: 1 + selector: + matchLabels: + app: delayed-ready + template: + metadata: + labels: + app: delayed-ready + spec: + containers: + - name: delayed-ready + image: busybox:1.36 + command: ["sleep", "1000"] + readinessProbe: + exec: + command: ["true"] + initialDelaySeconds: 65 + securityContext: + runAsNonRoot: true + runAsUser: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + revision: 1 + """ + Then ClusterObjectSet "${COS_NAME}" reports Progressing as False with Reason ProgressDeadlineExceeded + And ClusterObjectSet "${COS_NAME}" reports Progressing as True with Reason Succeeded diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go index 5466e2fc0..a91bfbacb 100644 --- a/test/e2e/steps/steps.go +++ b/test/e2e/steps/steps.go @@ -99,6 +99,7 @@ func RegisterSteps(sc *godog.ScenarioContext) { sc.Step(`^(?i)ClusterExtension is applied(?:\s+.*)?$`, ResourceIsApplied) sc.Step(`^(?i)ClusterExtension is updated to version "([^"]+)"$`, ClusterExtensionVersionUpdate) sc.Step(`^(?i)ClusterExtension is updated(?:\s+.*)?$`, ResourceIsApplied) + sc.Step(`^(?i)ClusterObjectSet "([^"]+)" lifecycle is set to "([^"]+)"$`, ClusterObjectSetLifecycleUpdate) sc.Step(`^(?i)ClusterExtension is available$`, ClusterExtensionIsAvailable) sc.Step(`^(?i)ClusterExtension is rolled out$`, ClusterExtensionIsRolledOut) sc.Step(`^(?i)ClusterExtension resources are created and labeled$`, ClusterExtensionResourcesCreatedAndAreLabeled) @@ -377,6 +378,23 @@ func ClusterExtensionVersionUpdate(ctx context.Context, version string) error { return err } +// ClusterObjectSetLifecycleUpdate patches the ClusterObjectSet's lifecycleState to the specified value. +func ClusterObjectSetLifecycleUpdate(ctx context.Context, cosName, lifecycle string) error { + sc := scenarioCtx(ctx) + cosName = substituteScenarioVars(cosName, sc) + patch := map[string]any{ + "spec": map[string]any{ + "lifecycleState": lifecycle, + }, + } + pb, err := json.Marshal(patch) + if err != nil { + return err + } + _, err = k8sClient("patch", "clusterobjectset", cosName, "--type", "merge", "-p", string(pb)) + return err +} + // ResourceIsApplied applies the provided YAML resource to the cluster and in case of ClusterExtension or ClusterObjectSet it captures // its name in the test context so that it can be referred to in later steps with ${NAME} or ${COS_NAME}, respectively func ResourceIsApplied(ctx context.Context, yamlTemplate *godog.DocString) error {