19 changes: 15 additions & 4 deletions pkg/apps/deployment/status.go
@@ -16,6 +16,20 @@ func PodContainersStatus(deployment *appsv1.Deployment, podClient corelistersv1.
if err != nil {
return nil, err
}
return ContainerMessagesForPods(deployment, deploymentPods), nil
}

// ContainerMessagesForPods returns human-readable container status messages for the given pods.
// If pods is empty, a single message is included describing that no pods matched (using the deployment template labels).
func ContainerMessagesForPods(deployment *appsv1.Deployment, pods []*corev1.Pod) []string {
containerStates := containerStatusMessagesForPods(pods)
if len(pods) == 0 {
containerStates = append(containerStates, fmt.Sprintf("no pods found with labels %q", labels.SelectorFromSet(deployment.Spec.Template.Labels).String()))
}
return containerStates
}

func containerStatusMessagesForPods(deploymentPods []*corev1.Pod) []string {
containerStates := []string{}

for i := range deploymentPods {
@@ -66,10 +80,7 @@ func PodContainersStatus(deployment *appsv1.Deployment, podClient corelistersv1.
}
}

if len(deploymentPods) == 0 {
containerStates = append(containerStates, fmt.Sprintf("no pods found with labels %q", labels.SelectorFromSet(deployment.Spec.Template.Labels).String()))
}
return containerStates, nil
return containerStates
}

func containerPlural(c int, crashloop bool) string {
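For reference, a minimal sketch of how the new exported helper behaves when no pods match; the deployment fixture and its labels are hypothetical, and the import alias matches the one used in the controller diff below:

```go
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"

	dpm "github.com/openshift/library-go/pkg/apps/deployment"
)

func main() {
	// Hypothetical deployment whose template labels drive the "no pods" message.
	deployment := &appsv1.Deployment{}
	deployment.Spec.Template.Labels = map[string]string{"app": "example"}

	// With an empty pod list, a single message describes the missing pods.
	msgs := dpm.ContainerMessagesForPods(deployment, []*corev1.Pod{})
	for _, m := range msgs {
		fmt.Println(m) // no pods found with labels "app=example"
	}
}
```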
@@ -61,7 +61,7 @@ import (
//
// <name>Available: indicates that the CSI Controller Service was successfully deployed and at least one Deployment replica is available.
// <name>Progressing: indicates that the CSI Controller Service is being deployed.
// <name>Degraded: produced when the sync() method returns an error.
// <name>Degraded: true when the deployment has timed out progressing, when failing pods reduce availability (while not mid-rollout), or when sync returns an error.

func NewCSIDriverControllerServiceController(
name string,
228 changes: 184 additions & 44 deletions pkg/operator/deploymentcontroller/deployment_controller.go
@@ -9,6 +9,7 @@ import (

opv1 "github.com/openshift/api/operator/v1"
applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1"
dpm "github.com/openshift/library-go/pkg/apps/deployment"
"github.com/openshift/library-go/pkg/controller/factory"
"github.com/openshift/library-go/pkg/operator/events"
"github.com/openshift/library-go/pkg/operator/management"
@@ -42,7 +43,7 @@ type ManifestHookFunc func(*opv1.OperatorSpec, []byte) ([]byte, error)
// This controller optionally produces the following conditions:
// <name>Available: indicates that the deployment controller was successfully deployed and at least one Deployment replica is available.
// <name>Progressing: indicates that the Deployment is in progress.
// <name>Degraded: produced when the sync() method returns an error.
// <name>Degraded: true when the deployment has timed out progressing, when failing pods reduce availability (while not mid-rollout), or when sync returns an error.
type DeploymentController struct {
// instanceName is the name to identify which instance this belongs to: FooDriver, for instance
instanceName string
@@ -193,7 +194,9 @@ func (c *DeploymentController) ToController() (factory.Controller, error) {
).ResyncEvery(
time.Minute,
)
if slices.Contains(c.conditions, opv1.OperatorStatusTypeDegraded) {
// When this controller owns <name>Degraded in status, do not use WithSyncDegradedOnError: reconcile would set
// Degraded=False on every successful sync and clear deployment operand degradation (see openshift/library-go#2128).
if !slices.Contains(c.conditions, opv1.OperatorStatusTypeDegraded) {
controller = controller.WithSyncDegradedOnError(c.operatorClient)
}
return controller.ToController(
@@ -207,7 +210,18 @@ func (c *DeploymentController) Name() string {
return c.instanceName
}

func (c *DeploymentController) sync(ctx context.Context, syncContext factory.SyncContext) error {
func (c *DeploymentController) sync(ctx context.Context, syncContext factory.SyncContext) (err error) {
if slices.Contains(c.conditions, opv1.OperatorStatusTypeDegraded) {
defer func() {
if err != nil {
applyErr := c.applySyncErrorDegraded(ctx, err)
if applyErr != nil {
klog.V(2).Infof("failed to apply sync error degraded status: %v", applyErr)
}
}
}()
}

opSpec, opStatus, _, err := c.operatorClient.GetOperatorState()
if apierrors.IsNotFound(err) && management.IsOperatorRemovable() {
return nil
@@ -217,7 +231,7 @@ func (c *DeploymentController) sync(ctx context.Context, syncContext factory.Syn
}

if opSpec.ManagementState == opv1.Removed && management.IsOperatorRemovable() {
return c.syncDeleting(ctx, opSpec, opStatus, syncContext)
return c.syncDeleting(ctx, opSpec, syncContext)
}

if opSpec.ManagementState != opv1.Managed {
@@ -229,11 +243,21 @@ func (c *DeploymentController) sync(ctx context.Context, syncContext factory.Syn
return err
}
if management.IsOperatorRemovable() && meta.DeletionTimestamp != nil {
return c.syncDeleting(ctx, opSpec, opStatus, syncContext)
return c.syncDeleting(ctx, opSpec, syncContext)
}
return c.syncManaged(ctx, opSpec, opStatus, syncContext)
}

func (c *DeploymentController) applySyncErrorDegraded(ctx context.Context, syncErr error) error {
degraded := applyoperatorv1.OperatorCondition().
WithType(c.instanceName + opv1.OperatorStatusTypeDegraded).
WithStatus(opv1.ConditionTrue).
WithReason("SyncError").
WithMessage(syncErr.Error())
status := applyoperatorv1.OperatorStatus().WithConditions(degraded)
return c.operatorClient.ApplyOperatorStatus(ctx, c.controllerInstanceName, status)
}
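This deferred handler takes over the error-reporting role that WithSyncDegradedOnError played, but it only fires on failure, so a successful sync no longer stomps the operand-health Degraded state computed in syncManaged below. A stand-alone sketch of the named-return/defer pattern, with simplified stand-ins rather than the library's real types:

```go
package main

import "fmt"

// applyDegraded is a simplified stand-in for applySyncErrorDegraded.
func applyDegraded(err error) error {
	fmt.Printf("Degraded=True reason=SyncError message=%q\n", err.Error())
	return nil
}

// sync demonstrates the named-return + defer pattern: any error escaping the
// function body is mirrored into a Degraded condition on the way out.
func sync() (err error) {
	defer func() {
		if err != nil {
			if applyErr := applyDegraded(err); applyErr != nil {
				fmt.Println("failed to apply degraded status:", applyErr)
			}
		}
	}()
	return fmt.Errorf("manifest render failed") // simulated sync failure
}

func main() {
	_ = sync()
}
```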

func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.OperatorSpec, opStatus *opv1.OperatorStatus, syncContext factory.SyncContext) error {
klog.V(4).Infof("syncManaged")

@@ -257,7 +281,7 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
if err != nil {
return err
}
// Create an OperatorStatusApplyConfiguration with generations

status := applyoperatorv1.OperatorStatus().
WithGenerations(&applyoperatorv1.GenerationStatusApplyConfiguration{
Group: ptr.To("apps"),
@@ -267,7 +291,8 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
LastGeneration: ptr.To(deployment.Generation),
})

// Set Available condition
now := time.Now()

if slices.Contains(c.conditions, opv1.OperatorStatusTypeAvailable) {
availableCondition := applyoperatorv1.
OperatorCondition().WithType(c.instanceName + opv1.OperatorStatusTypeAvailable)
@@ -276,38 +301,92 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
WithStatus(opv1.ConditionTrue).
WithMessage("Deployment is available").
WithReason("AsExpected")

} else {
availableCondition = availableCondition.
WithStatus(opv1.ConditionFalse).
WithMessage("Waiting for Deployment").
WithReason("Deploying")
WithReason("NoPod").
WithMessage(fmt.Sprintf("no %s.%s pods available on any node", deployment.Name, deployment.Namespace))
}
status = status.WithConditions(availableCondition)
}

// Set Progressing condition
desiredReplicas := ptr.Deref(deployment.Spec.Replicas, 1)

progressTimedOutMessage, workloadIsBeingUpdatedTooLong := hasDeploymentTimedOutProgressing(deployment.Status)
workloadIsBeingUpdated := !hasDeploymentProgressed(deployment.Status) && !workloadIsBeingUpdatedTooLong

var progressDeadlineExceededMessage string
if workloadIsBeingUpdatedTooLong {
progressDeadlineExceededMessage = fmt.Sprintf("deployment/%s.%s has timed out progressing: %s", deployment.Name, deployment.Namespace, progressTimedOutMessage)
}

if slices.Contains(c.conditions, opv1.OperatorStatusTypeProgressing) {
progressingCondition := applyoperatorv1.OperatorCondition().
WithType(c.instanceName + opv1.OperatorStatusTypeProgressing).
WithStatus(opv1.ConditionFalse).
WithMessage("Deployment is not progressing").
WithReason("AsExpected")

if ok, msg := isProgressing(deployment); ok {
switch {
case workloadIsBeingUpdated:
progressingCondition = progressingCondition.
WithStatus(opv1.ConditionTrue).
WithMessage(msg).
WithReason("Deploying")
WithReason("PodsUpdating").
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest revision and %d/%d pods are available", deployment.Name, deployment.Namespace, deployment.Status.UpdatedReplicas, desiredReplicas, deployment.Status.AvailableReplicas, desiredReplicas))
case workloadIsBeingUpdatedTooLong:
progressingCondition = progressingCondition.
WithStatus(opv1.ConditionFalse).
WithReason("ProgressDeadlineExceeded").
WithMessage(progressDeadlineExceededMessage)
default:
progressingCondition = progressingCondition.
WithStatus(opv1.ConditionFalse).
WithReason("AsExpected")
}
status = status.WithConditions(progressingCondition)
}

if slices.Contains(c.conditions, opv1.OperatorStatusTypeDegraded) {
degradedCondition := applyoperatorv1.OperatorCondition().
WithType(c.instanceName + opv1.OperatorStatusTypeDegraded).
WithStatus(opv1.ConditionFalse).
WithReason("AsExpected")

// Degrade when operator is progressing too long.
// Only do this if we would continue to be in the Progressing state, otherwise, we'll never get out
if v1helpers.IsUpdatingTooLong(opStatus, c.instanceName+opv1.OperatorStatusTypeProgressing) {
return fmt.Errorf("Deployment was progressing too long")
switch {
case workloadIsBeingUpdatedTooLong:
degradedCondition = degradedCondition.
WithStatus(opv1.ConditionTrue).
WithReason("ProgressDeadlineExceeded").
WithMessage(progressDeadlineExceededMessage)

case !workloadIsBeingUpdated && deployment.Status.AvailableReplicas < desiredReplicas:
operandPods := c.listOperandPodsForDiagnostics(ctx, deployment, syncContext)
livePods := nonDeletingPods(operandPods)
hasFailing := hasFailingPods(deployment, livePods, now)
if hasFailing || deployment.Status.AvailableReplicas == 0 {
containerMessages := dpm.ContainerMessagesForPods(deployment, livePods)
Comment on lines +362 to +367

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Don’t let a failed pod lookup clear Degraded for partial outages.

When AvailableReplicas < desiredReplicas and the rollout is already done, listOperandPodsForDiagnostics() collapses selector/list failures into nil, so hasFailingPods() returns false and this branch reports Degraded=False unless availability dropped all the way to zero. That makes operator health depend on best-effort diagnostics instead of on the deployment shortage itself.

Suggested direction
- operandPods := c.listOperandPodsForDiagnostics(ctx, deployment, syncContext)
+ operandPods, diagnosticsOK := c.listOperandPodsForDiagnostics(ctx, deployment, syncContext)
  livePods := nonDeletingPods(operandPods)
- hasFailing := hasFailingPods(deployment, livePods, now)
- if hasFailing || deployment.Status.AvailableReplicas == 0 {
-   containerMessages := dpm.ContainerMessagesForPods(deployment, livePods)
+ hasFailing := diagnosticsOK && hasFailingPods(deployment, livePods, now)
+ if deployment.Status.AvailableReplicas < desiredReplicas && (!diagnosticsOK || hasFailing || deployment.Status.AvailableReplicas == 0) {
+   var containerMessages []string
+   if diagnosticsOK {
+     containerMessages = dpm.ContainerMessagesForPods(deployment, livePods)
+   }

Also applies to: 401-417

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/operator/deploymentcontroller/deployment_controller.go` around lines 362
- 367, When AvailableReplicas < desiredReplicas and rollout is not updating,
don't let a nil/failed pod lookup from listOperandPodsForDiagnostics clear
Degraded; treat missing operandPods as a failure. In the branch around
listOperandPodsForDiagnostics -> operandPods, set hasFailing to true if
operandPods is nil or the diagnostics list indicates a selector/list error (in
addition to the existing hasFailingPods(deployment, livePods, now) check), and
still build containerMessages with whatever diagnostics are available via
dpm.ContainerMessagesForPods(deployment, livePods) so the Degraded condition
reflects the deployment shortage rather than best‑effort pod lookup success.
Ensure you reference listOperandPodsForDiagnostics, nonDeletingPods,
hasFailingPods, dpm.ContainerMessagesForPods,
deployment.Status.AvailableReplicas and desiredReplicas when making the change.

var failureDescription string
if len(containerMessages) > 0 {
failureDescription = ` (` + strings.Join(containerMessages, ", ") + `)`
}
numUnavailable := desiredReplicas - deployment.Status.AvailableReplicas
message := fmt.Sprintf("%d of %d requested instances are unavailable for %s.%s%s", numUnavailable, desiredReplicas, deployment.Name, deployment.Namespace, failureDescription)
degradedCondition = degradedCondition.
WithStatus(opv1.ConditionTrue).
WithReason("UnavailablePod").
WithMessage(message)
} else {
degradedCondition = degradedCondition.
WithStatus(opv1.ConditionFalse).
WithReason("AsExpected")
}
}

status = status.WithConditions(progressingCondition)
default:
degradedCondition = degradedCondition.
WithStatus(opv1.ConditionFalse).
WithReason("AsExpected")
}
status = status.WithConditions(degradedCondition)
}

return c.operatorClient.ApplyOperatorStatus(
@@ -317,7 +396,34 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
)
}

func (c *DeploymentController) syncDeleting(ctx context.Context, opSpec *opv1.OperatorSpec, opStatus *opv1.OperatorStatus, syncContext factory.SyncContext) error {
// listOperandPodsForDiagnostics lists pods matched by the deployment selector for UnavailablePod diagnostics.
// A nil selector, selector conversion errors, or API list errors are logged and recorded as warnings; no error is returned.
func (c *DeploymentController) listOperandPodsForDiagnostics(ctx context.Context, deploymentObj *appsv1.Deployment, syncContext factory.SyncContext) []*corev1.Pod {
if deploymentObj.Spec.Selector == nil {
klog.Warningf("deployment/%s/%s has no spec.selector, skipping pod diagnostics", deploymentObj.Namespace, deploymentObj.Name)
syncContext.Recorder().Warningf("DeploymentSelectorMissing", "deployment %s/%s has no spec.selector, skipping pod diagnostics", deploymentObj.Namespace, deploymentObj.Name)
return nil
}
selector, err := metav1.LabelSelectorAsSelector(deploymentObj.Spec.Selector)
if err != nil {
klog.Warningf("deployment/%s/%s has invalid spec.selector: %v", deploymentObj.Namespace, deploymentObj.Name, err)
syncContext.Recorder().Warningf("DeploymentSelectorInvalid", "deployment %s/%s has invalid spec.selector: %v", deploymentObj.Namespace, deploymentObj.Name, err)
return nil
}
podList, err := c.kubeClient.CoreV1().Pods(deploymentObj.Namespace).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
if err != nil {
klog.Warningf("deployment/%s/%s: pod list for diagnostics failed: %v", deploymentObj.Namespace, deploymentObj.Name, err)
syncContext.Recorder().Warningf("PodListFailed", "listing pods for deployment %s/%s diagnostics: %v", deploymentObj.Namespace, deploymentObj.Name, err)
return nil
}
out := make([]*corev1.Pod, len(podList.Items))
for i := range podList.Items {
out[i] = &podList.Items[i]
}
return out
}
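One detail worth noting above: `out[i] = &podList.Items[i]` takes the address of the slice element itself. Before Go 1.22, appending the address of a by-value range variable would have aliased one reused variable; a small sketch of the safe idiom:

```go
package main

import "fmt"

type pod struct{ name string }

func main() {
	items := []pod{{"a"}, {"b"}, {"c"}}

	// Index-based loop, as in listOperandPodsForDiagnostics: each pointer
	// refers to a distinct element of the backing array.
	out := make([]*pod, len(items))
	for i := range items {
		out[i] = &items[i]
	}
	fmt.Println(out[0].name, out[1].name, out[2].name) // a b c
}
```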

func (c *DeploymentController) syncDeleting(ctx context.Context, opSpec *opv1.OperatorSpec, syncContext factory.SyncContext) error {
klog.V(4).Infof("syncDeleting")
required, err := c.getDeployment(opSpec)
if err != nil {
@@ -356,36 +462,70 @@ func (c *DeploymentController) getDeployment(opSpec *opv1.OperatorSpec) (*appsv1
return required, nil
}

func isProgressing(deployment *appsv1.Deployment) (bool, string) {
// hasDeploymentProgressed returns true if the deployment reports NewReplicaSetAvailable
// via the DeploymentProgressing condition.
func hasDeploymentProgressed(status appsv1.DeploymentStatus) bool {
for _, cond := range status.Conditions {
if cond.Type == appsv1.DeploymentProgressing {
return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
}
}
return false
}

var deploymentExpectedReplicas int32
if deployment.Spec.Replicas != nil {
deploymentExpectedReplicas = *deployment.Spec.Replicas
// hasDeploymentTimedOutProgressing returns true if the deployment reports ProgressDeadlineExceeded.
// The function returns the Progressing condition message as the first return value.
func hasDeploymentTimedOutProgressing(status appsv1.DeploymentStatus) (string, bool) {
for _, cond := range status.Conditions {
if cond.Type == appsv1.DeploymentProgressing {
return cond.Message, cond.Status == corev1.ConditionFalse && cond.Reason == "ProgressDeadlineExceeded"
}
}
return "", false
}
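Both helpers key off the standard Progressing condition that the Deployment controller maintains: Reason=NewReplicaSetAvailable once a rollout completes, Reason=ProgressDeadlineExceeded once it stalls past .spec.progressDeadlineSeconds. A self-contained sketch with local copies of the two helpers (the originals are unexported) and a hypothetical status fixture:

```go
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
)

// progressed mirrors hasDeploymentProgressed from the diff above.
func progressed(status appsv1.DeploymentStatus) bool {
	for _, cond := range status.Conditions {
		if cond.Type == appsv1.DeploymentProgressing {
			return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
		}
	}
	return false
}

// timedOut mirrors hasDeploymentTimedOutProgressing from the diff above.
func timedOut(status appsv1.DeploymentStatus) (string, bool) {
	for _, cond := range status.Conditions {
		if cond.Type == appsv1.DeploymentProgressing {
			return cond.Message, cond.Status == corev1.ConditionFalse && cond.Reason == "ProgressDeadlineExceeded"
		}
	}
	return "", false
}

func main() {
	stuck := appsv1.DeploymentStatus{Conditions: []appsv1.DeploymentCondition{{
		Type:    appsv1.DeploymentProgressing,
		Status:  corev1.ConditionFalse,
		Reason:  "ProgressDeadlineExceeded",
		Message: `ReplicaSet "example-abc" has timed out progressing.`, // hypothetical message
	}}}
	msg, ok := timedOut(stuck)
	fmt.Println(progressed(stuck), ok, msg) // false true ReplicaSet "example-abc" has timed out progressing.
}
```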

switch {
case deployment.Generation != deployment.Status.ObservedGeneration:
return true, "Waiting for Deployment to act on changes"
case hasFinishedProgressing(deployment):
return false, ""
case deployment.Status.UnavailableReplicas > 0:
return true, "Waiting for Deployment to deploy pods"
case deployment.Status.UpdatedReplicas < deploymentExpectedReplicas:
return true, "Waiting for Deployment to update pods"
case deployment.Status.AvailableReplicas < deploymentExpectedReplicas:
return true, "Waiting for Deployment to deploy pods"
// nonDeletingPods returns pods that are not terminating (no deletion timestamp).
func nonDeletingPods(pods []*corev1.Pod) []*corev1.Pod {
out := make([]*corev1.Pod, 0, len(pods))
for _, p := range pods {
if p != nil && p.DeletionTimestamp == nil {
out = append(out, p)
}
}
return false, ""
return out
}

func hasFinishedProgressing(deployment *appsv1.Deployment) bool {
// A Deployment whose rollout is complete gets a Progressing condition with Reason NewReplicaSetAvailable.
// https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#complete-deployment
// Any subsequent missing replicas (e.g. caused by a node reboot) must not change the Progressing condition.
for _, cond := range deployment.Status.Conditions {
if cond.Type == appsv1.DeploymentProgressing {
return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
func hasFailingPods(workload *appsv1.Deployment, pods []*corev1.Pod, now time.Time) bool {
progressDeadline := time.Duration(ptr.Deref(workload.Spec.ProgressDeadlineSeconds, 600)) * time.Second
minReady := time.Duration(workload.Spec.MinReadySeconds) * time.Second

for _, pod := range pods {
if pod.DeletionTimestamp != nil {
continue
}

readyCond := findPodReadyCondition(pod)
deadline := pod.CreationTimestamp.Time.Add(progressDeadline)

if (readyCond == nil || readyCond.Status != corev1.ConditionTrue) && now.After(deadline) {
return true
}

if minReady > 0 && readyCond != nil && readyCond.Status == corev1.ConditionTrue {
isRelevant := now.After(pod.CreationTimestamp.Time.Add(progressDeadline + minReady))
if isRelevant && now.Sub(readyCond.LastTransitionTime.Time) < minReady {
return true
}
}
}
return false
}
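The timing rules above deserve a worked example: a pod counts as failing either when it has not become Ready within the progress deadline (default 600s) after creation, or when minReadySeconds is set and the pod flipped back to Ready only moments ago even though the deadline-plus-minReady window has long passed, which suggests flapping. A sketch with hypothetical timestamps:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	const progressDeadline = 600 * time.Second // default .spec.progressDeadlineSeconds
	const minReady = 30 * time.Second          // hypothetical .spec.minReadySeconds

	created := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC)
	now := created.Add(11 * time.Minute)

	// Case 1: the pod never became Ready and the progress deadline has passed.
	deadline := created.Add(progressDeadline)
	fmt.Println("not-ready pod is failing:", now.After(deadline)) // true: 11m > 10m

	// Case 2: the pod is Ready, but it transitioned only 10s ago; since the
	// deadline+minReady window is already over, such a recent flip reads as flapping.
	lastTransition := now.Add(-10 * time.Second)
	relevant := now.After(created.Add(progressDeadline + minReady))
	flapping := relevant && now.Sub(lastTransition) < minReady
	fmt.Println("recently flapped pod is failing:", flapping) // true
}
```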

func findPodReadyCondition(pod *corev1.Pod) *corev1.PodCondition {
for i := range pod.Status.Conditions {
if pod.Status.Conditions[i].Type == corev1.PodReady {
return &pod.Status.Conditions[i]
}
}
return nil
}
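Putting the Degraded pieces together, the UnavailablePod message assembled in syncManaged comes out like this; the names, counts, and container message below are hypothetical:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	desiredReplicas, availableReplicas := int32(2), int32(1)
	containerMessages := []string{`container "driver" is waiting in CrashLoopBackOff`} // hypothetical

	var failureDescription string
	if len(containerMessages) > 0 {
		failureDescription = " (" + strings.Join(containerMessages, ", ") + ")"
	}
	numUnavailable := desiredReplicas - availableReplicas
	fmt.Printf("%d of %d requested instances are unavailable for %s.%s%s\n",
		numUnavailable, desiredReplicas, "example-controller", "example-ns", failureDescription)
	// 1 of 2 requested instances are unavailable for example-controller.example-ns
	// (container "driver" is waiting in CrashLoopBackOff)
}
```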