diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index 4170849722..a80f77c942 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -1447,6 +1447,7 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner var hfcDirty bool var hfc *metal3api.HostFirmwareComponents var liveFirmwareSettingsAllowed, liveFirmwareUpdatesAllowed bool + var hasSettingsSpec, hasComponentsSpec bool if hup != nil { liveFirmwareSettingsAllowed = (hup.Spec.FirmwareSettings == metal3api.HostUpdatePolicyOnReboot) @@ -1470,6 +1471,9 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner servicingData.ActualFirmwareSettings = hfs.Status.Settings servicingData.TargetFirmwareSettings = hfs.Spec.Settings } + + // Track if HFS spec has actual settings (for abort decision) + hasSettingsSpec = (hfs != nil && len(hfs.Spec.Settings) > 0) } if liveFirmwareUpdatesAllowed { @@ -1486,6 +1490,9 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner servicingData.TargetFirmwareComponents = hfc.Spec.Updates } } + + // Track if HFC spec has actual updates (for abort decision) + hasComponentsSpec = (hfc != nil && len(hfc.Spec.Updates) > 0) } hasChanges := fwDirty || hfsDirty || hfcDirty @@ -1496,6 +1503,34 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner return nil } + // If we're in a servicing-related state and all specs have been removed, + // abort the servicing operation. Check both new (HFS/HFC) and old (Spec.Firmware) APIs. + hasFirmwareSpec := info.host.Spec.Firmware != nil + specsRemoved := !hasSettingsSpec && !hasComponentsSpec && !hasFirmwareSpec + inServicingState := info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing || + info.host.Status.ErrorType == metal3api.ServicingError + if specsRemoved && inServicingState { + info.log.Info("specs removed while in servicing state, attempting to abort") + provResult, _, err := prov.AbortServicing() + if err != nil { + return actionError{fmt.Errorf("failed to abort servicing: %w", err)} + } + if provResult.ErrorMessage != "" { + info.log.Info("failed to abort servicing", "error", provResult.ErrorMessage) + return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)} + } + if provResult.Dirty { + info.log.Info("abort operation in progress, checking back later") + return actionContinue{provResult.RequeueAfter} + } + // Abort completed successfully, clear error state + info.log.Info("successfully aborted servicing") + info.host.Status.ErrorType = "" + info.host.Status.ErrorMessage = "" + info.host.Status.OperationalStatus = metal3api.OperationalStatusOK + return actionComplete{} + } + // FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually // succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the // update didn't actually happen. This is deemed an acceptable risk for the moment since it is only @@ -2148,7 +2183,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(info *reconcileInfo) ( } if !valid { info.log.Info("hostFirmwareSettings not valid", "namespacename", info.request.NamespacedName) - return false, nil, nil + return false, hfs, nil } if changed { @@ -2162,7 +2197,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(info *reconcileInfo) ( } info.log.Info("hostFirmwareSettings no updates", "namespacename", info.request.NamespacedName) - return false, nil, nil + return false, hfs, nil } // Get the stored firmware settings if there are valid changes. @@ -2185,7 +2220,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(info *reconcileInfo) } if !valid { info.log.Info("hostFirmwareComponents not valid", "namespacename", info.request.NamespacedName) - return false, nil, nil + return false, hfc, nil } if changed { info.log.Info("hostFirmwareComponents indicating ChangeDetected", "namespacename", info.request.NamespacedName) @@ -2193,7 +2228,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(info *reconcileInfo) } info.log.Info("hostFirmwareComponents no updates", "namespacename", info.request.NamespacedName) - return false, nil, nil + return false, hfc, nil } func (r *BareMetalHostReconciler) saveHostStatus(ctx context.Context, host *metal3api.BareMetalHost) error { diff --git a/internal/controller/metal3.io/host_state_machine_test.go b/internal/controller/metal3.io/host_state_machine_test.go index a2229c220d..c6bd7f8b5b 100644 --- a/internal/controller/metal3.io/host_state_machine_test.go +++ b/internal/controller/metal3.io/host_state_machine_test.go @@ -1366,6 +1366,10 @@ func (m *mockProvisioner) Service(_ provisioner.ServicingData, _ bool, _ bool) ( return m.getNextResultByMethod("Service"), m.nextResults["Service"].Dirty, err } +func (m *mockProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) { + return m.getNextResultByMethod("AbortServicing"), m.nextResults["AbortServicing"].Dirty, err +} + func (m *mockProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { return m.getNextResultByMethod("Adopt"), err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index 9234e99217..8b6f8beb59 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -234,6 +234,12 @@ func (p *demoProvisioner) Service(_ provisioner.ServicingData, unprepared bool, return } +// AbortServicing aborts an in-progress or failed servicing operation. +func (p *demoProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) { + p.log.Info("aborting servicing") + return +} + // Adopt notifies the provisioner that the state machine believes the host // to be currently provisioned, and that it should be managed as such. func (p *demoProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index 9ddea89533..6bf02b4ddd 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -255,6 +255,12 @@ func (p *fixtureProvisioner) Service(_ provisioner.ServicingData, unprepared boo return } +// AbortServicing aborts an in-progress or failed servicing operation. +func (p *fixtureProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) { + p.log.Info("aborting servicing") + return +} + // Adopt notifies the provisioner that the state machine believes the host // to be currently provisioned, and that it should be managed as such. func (p *fixtureProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index b31d9d0bc2..37f7bdbb2f 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -76,6 +76,38 @@ func (p *ironicProvisioner) startServicing(bmcAccess bmc.AccessDetails, ironicNo return } +// AbortServicing aborts an in-progress or failed servicing operation. +func (p *ironicProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) { + ironicNode, err := p.getNode() + if err != nil { + result, err = transientError(err) + return result, started, err + } + + // Only abort if in a servicing-related state + state := nodes.ProvisionState(ironicNode.ProvisionState) + if state != nodes.ServiceFail && state != nodes.ServiceWait { + p.log.Info("cannot abort servicing in current state", "state", ironicNode.ProvisionState) + result, err = operationComplete() + return result, started, err + } + + // Clear maintenance flag first if it's set + if ironicNode.Maintenance { + p.log.Info("clearing maintenance flag before aborting servicing") + result, err = p.setMaintenanceFlag(ironicNode, false, "") + return result, started, err + } + + // Set started to let the controller know about the change + p.log.Info("aborting servicing") + started, result, err = p.tryChangeNodeProvisionState( + ironicNode, + nodes.ProvisionStateOpts{Target: nodes.TargetAbort}, + ) + return +} + func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, restartOnFailure bool) (result provisioner.Result, started bool, err error) { bmcAccess, err := p.bmcAccess() if err != nil { @@ -119,7 +151,13 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, // Servicing finished p.log.Info("servicing finished on the host") result, err = operationComplete() - case nodes.Servicing, nodes.ServiceWait: + case nodes.Servicing: + p.log.Info("waiting for host to finish servicing", + "state", ironicNode.ProvisionState, + "serviceStep", ironicNode.ServiceStep) + result, err = operationContinuing(provisionRequeueDelay) + + case nodes.ServiceWait: p.log.Info("waiting for host to become active", "state", ironicNode.ProvisionState, "serviceStep", ironicNode.ServiceStep) diff --git a/pkg/provisioner/ironic/servicing_test.go b/pkg/provisioner/ironic/servicing_test.go index a728c709bc..c74d4eb54f 100644 --- a/pkg/provisioner/ironic/servicing_test.go +++ b/pkg/provisioner/ironic/servicing_test.go @@ -164,7 +164,7 @@ func TestService(t *testing.T) { host.Status.Provisioning.ID = nodeUUID prepData := provisioner.ServicingData{} if !tc.skipConfig { - host.Spec.BMC.Address = "raid-test://test.bmc/" + host.Spec.BMC.Address = "bios-test://test.bmc/" prepData.ActualFirmwareSettings = metal3api.SettingsMap{ "Answer": "unknown", } diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 7b1246ea61..0156251f93 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -172,6 +172,11 @@ type Provisioner interface { // Servicing updates configuration for a provisioned host. Service(data ServicingData, unprepared, restartOnFailure bool) (result Result, started bool, err error) + // AbortServicing aborts an in-progress or failed servicing operation, + // returning the host to an active state. This is used when the user + // removes the firmware update/settings specifications. + AbortServicing() (result Result, started bool, err error) + // Provision writes the image from the host spec to the host. It // may be called multiple times, and should return true for its // dirty flag until the provisioning operation is completed.