From b628dd504e8f00100153228992facd6061c3c46e Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Wed, 6 Aug 2025 16:49:08 +1000 Subject: [PATCH 1/4] Initial attempt at using wait step to back out of failed servicing. Generated-By: Claude Code Sonnet 4 Signed-off-by: Jacob Anders (cherry picked from commit c40d0ad4c80587ec223b37a599f58ca4e484f81e) --- .../metal3.io/baremetalhost_controller.go | 42 +++++++++++++++++ pkg/provisioner/ironic/servicing.go | 46 ++++++++++++++++++- pkg/provisioner/provisioner.go | 3 ++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index 4170849722..065b15af7e 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -1470,6 +1470,12 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner servicingData.ActualFirmwareSettings = hfs.Status.Settings servicingData.TargetFirmwareSettings = hfs.Spec.Settings } + + // Track if HFS spec has actual settings - check independently since getHostFirmwareSettings + // returns nil when no changes even if object exists + hfsExists := &metal3api.HostFirmwareSettings{} + hfsExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfsExists) + servicingData.HasFirmwareSettingsSpec = (hfsExistsErr == nil && len(hfsExists.Spec.Settings) > 0) } if liveFirmwareUpdatesAllowed { @@ -1486,16 +1492,52 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner servicingData.TargetFirmwareComponents = hfc.Spec.Updates } } + + // Track if HFC spec has actual updates - check independently since getHostFirmwareComponents + // returns nil when no changes even if object exists + hfcExists := &metal3api.HostFirmwareComponents{} + hfcExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfcExists) + servicingData.HasFirmwareComponentsSpec = (hfcExistsErr == nil && len(hfcExists.Spec.Updates) > 0) } hasChanges := fwDirty || hfsDirty || hfcDirty + info.log.Info("janders_debug: servicing data flags", + "hasSettingsSpec", servicingData.HasFirmwareSettingsSpec, + "hasComponentsSpec", servicingData.HasFirmwareComponentsSpec, + "fwDirty", fwDirty, "hfsDirty", hfsDirty, "hfcDirty", hfcDirty, + "hasChanges", hasChanges, "note", "hasSpec flags check if spec.settings/spec.updates have content") + // Even if settings are clean, we need to check the result of the current servicing. if !hasChanges && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError { // If nothing is going on, return control to the power management. return nil } + // If we're in a servicing error state and updates were removed from spec, + // let the provisioner handle the transition back to active + if info.host.Status.ErrorType == metal3api.ServicingError && !hasChanges { + info.log.Info("updates removed from spec while in servicing error state, attempting recovery") + provResult, _, err := prov.Service(servicingData, false, false) + if err != nil { + return actionError{fmt.Errorf("failed to recover from servicing error: %w", err)} + } + if provResult.ErrorMessage != "" { + info.log.Info("failed to recover from servicing error", "error", provResult.ErrorMessage) + return actionError{fmt.Errorf("failed to recover from servicing error: %s", provResult.ErrorMessage)} + } + if provResult.Dirty { + info.log.Info("abort operation in progress, checking back later") + return actionContinue{provResult.RequeueAfter} + } + // If abort completed and no error, we've successfully recovered + info.log.Info("successfully recovered from servicing error") + info.host.Status.ErrorType = "" + info.host.Status.ErrorMessage = "" + info.host.Status.OperationalStatus = metal3api.OperationalStatusOK + return actionComplete{} + } + // FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually // succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the // update didn't actually happen. This is deemed an acceptable risk for the moment since it is only diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index b31d9d0bc2..8793ef8ee4 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -76,6 +76,24 @@ func (p *ironicProvisioner) startServicing(bmcAccess bmc.AccessDetails, ironicNo return } +func (p *ironicProvisioner) abortServicing(ironicNode *nodes.Node) (result provisioner.Result, started bool, err error) { + // Clear maintenance flag first if it's set + if ironicNode.Maintenance { + p.log.Info("clearing maintenance flag before aborting servicing") + result, err = p.setMaintenanceFlag(ironicNode, false, "") + return result, started, err + } + + // Set started to let the controller know about the change + p.log.Info("aborting servicing due to removal of spec.updates/spec.settings") + started, result, err = p.tryChangeNodeProvisionState( + ironicNode, + nodes.ProvisionStateOpts{Target: nodes.TargetAbort}, + ) + p.log.Info("janders_debug: abort result", "started", started, "result", result, "error", err) + return +} + func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, restartOnFailure bool) (result provisioner.Result, started bool, err error) { bmcAccess, err := p.bmcAccess() if err != nil { @@ -89,9 +107,29 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, return result, started, err } + // Check if there are any pending updates + serviceSteps, err := p.buildServiceSteps(bmcAccess, data) + if err != nil { + result, err = operationFailed(err.Error()) + return result, started, err + } + + p.log.Info("janders_debug: servicing state check", + "hasSettingsSpec", data.HasFirmwareSettingsSpec, + "hasComponentsSpec", data.HasFirmwareComponentsSpec, + "serviceStepsCount", len(serviceSteps), + "nodeState", ironicNode.ProvisionState) + switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.ServiceFail: - // When servicing failed, we need to clean host provisioning settings. + // When servicing failed and user actually removed the specs (not just no updates calculated), + // we need to abort the servicing operation to back out + if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { + p.log.Info("aborting servicing because spec.updates/spec.settings was removed") + return p.abortServicing(ironicNode) + } + + // When servicing failed and there are pending updates, we need to clean host provisioning settings // If restartOnFailure is false, it means the settings aren't cleared. if !restartOnFailure { result, err = operationFailed(ironicNode.LastError) @@ -120,6 +158,12 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, p.log.Info("servicing finished on the host") result, err = operationComplete() case nodes.Servicing, nodes.ServiceWait: + // If user actually removed spec.updates/spec.settings while servicing is in progress, abort immediately + if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { + p.log.Info("aborting in-progress servicing because spec.updates/spec.settings was removed") + return p.abortServicing(ironicNode) + } + p.log.Info("waiting for host to become active", "state", ironicNode.ProvisionState, "serviceStep", ironicNode.ServiceStep) diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 7b1246ea61..7eb621187a 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -120,6 +120,9 @@ type ServicingData struct { TargetFirmwareSettings metal3api.DesiredSettingsMap ActualFirmwareSettings metal3api.SettingsMap TargetFirmwareComponents []metal3api.FirmwareUpdate + // Flags to track if specs exist (vs. just no updates calculated) + HasFirmwareSettingsSpec bool + HasFirmwareComponentsSpec bool } type ProvisionData struct { From b32ba238311b3de35e968585a821973a05433ee3 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Wed, 18 Feb 2026 03:57:11 +0100 Subject: [PATCH 2/4] Address first review round (except provisioner/Abort) --- .../metal3.io/baremetalhost_controller.go | 28 ++++++------------- pkg/provisioner/ironic/servicing.go | 21 +++++++------- 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index 065b15af7e..eef7f25f8c 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -1471,11 +1471,8 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner servicingData.TargetFirmwareSettings = hfs.Spec.Settings } - // Track if HFS spec has actual settings - check independently since getHostFirmwareSettings - // returns nil when no changes even if object exists - hfsExists := &metal3api.HostFirmwareSettings{} - hfsExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfsExists) - servicingData.HasFirmwareSettingsSpec = (hfsExistsErr == nil && len(hfsExists.Spec.Settings) > 0) + // Track if HFS spec has actual settings + servicingData.HasFirmwareSettingsSpec = (hfs != nil && len(hfs.Spec.Settings) > 0) } if liveFirmwareUpdatesAllowed { @@ -1493,21 +1490,12 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner } } - // Track if HFC spec has actual updates - check independently since getHostFirmwareComponents - // returns nil when no changes even if object exists - hfcExists := &metal3api.HostFirmwareComponents{} - hfcExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfcExists) - servicingData.HasFirmwareComponentsSpec = (hfcExistsErr == nil && len(hfcExists.Spec.Updates) > 0) + // Track if HFC spec has actual updates + servicingData.HasFirmwareComponentsSpec = (hfc != nil && len(hfc.Spec.Updates) > 0) } hasChanges := fwDirty || hfsDirty || hfcDirty - info.log.Info("janders_debug: servicing data flags", - "hasSettingsSpec", servicingData.HasFirmwareSettingsSpec, - "hasComponentsSpec", servicingData.HasFirmwareComponentsSpec, - "fwDirty", fwDirty, "hfsDirty", hfsDirty, "hfcDirty", hfcDirty, - "hasChanges", hasChanges, "note", "hasSpec flags check if spec.settings/spec.updates have content") - // Even if settings are clean, we need to check the result of the current servicing. if !hasChanges && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError { // If nothing is going on, return control to the power management. @@ -2190,7 +2178,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(info *reconcileInfo) ( } if !valid { info.log.Info("hostFirmwareSettings not valid", "namespacename", info.request.NamespacedName) - return false, nil, nil + return false, hfs, nil } if changed { @@ -2204,7 +2192,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(info *reconcileInfo) ( } info.log.Info("hostFirmwareSettings no updates", "namespacename", info.request.NamespacedName) - return false, nil, nil + return false, hfs, nil } // Get the stored firmware settings if there are valid changes. @@ -2227,7 +2215,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(info *reconcileInfo) } if !valid { info.log.Info("hostFirmwareComponents not valid", "namespacename", info.request.NamespacedName) - return false, nil, nil + return false, hfc, nil } if changed { info.log.Info("hostFirmwareComponents indicating ChangeDetected", "namespacename", info.request.NamespacedName) @@ -2235,7 +2223,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(info *reconcileInfo) } info.log.Info("hostFirmwareComponents no updates", "namespacename", info.request.NamespacedName) - return false, nil, nil + return false, hfc, nil } func (r *BareMetalHostReconciler) saveHostStatus(ctx context.Context, host *metal3api.BareMetalHost) error { diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index 8793ef8ee4..cab97554b0 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -90,7 +90,6 @@ func (p *ironicProvisioner) abortServicing(ironicNode *nodes.Node) (result provi ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetAbort}, ) - p.log.Info("janders_debug: abort result", "started", started, "result", result, "error", err) return } @@ -114,12 +113,6 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, return result, started, err } - p.log.Info("janders_debug: servicing state check", - "hasSettingsSpec", data.HasFirmwareSettingsSpec, - "hasComponentsSpec", data.HasFirmwareComponentsSpec, - "serviceStepsCount", len(serviceSteps), - "nodeState", ironicNode.ProvisionState) - switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.ServiceFail: // When servicing failed and user actually removed the specs (not just no updates calculated), @@ -157,13 +150,19 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, // Servicing finished p.log.Info("servicing finished on the host") result, err = operationComplete() - case nodes.Servicing, nodes.ServiceWait: - // If user actually removed spec.updates/spec.settings while servicing is in progress, abort immediately + case nodes.Servicing: + p.log.Info("waiting for host to finish servicing", + "state", ironicNode.ProvisionState, + "serviceStep", ironicNode.ServiceStep) + result, err = operationContinuing(provisionRequeueDelay) + + case nodes.ServiceWait: + // If user removed spec.updates/spec.settings while in ServiceWait, abort if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { - p.log.Info("aborting in-progress servicing because spec.updates/spec.settings was removed") + p.log.Info("aborting servicing because spec.updates/spec.settings was removed") return p.abortServicing(ironicNode) } - + p.log.Info("waiting for host to become active", "state", ironicNode.ProvisionState, "serviceStep", ironicNode.ServiceStep) From 05396cdeda110056eb14571eab173f4f84bdf863 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Wed, 18 Feb 2026 05:12:03 +0100 Subject: [PATCH 3/4] Required post-rebase changes --- pkg/provisioner/ironic/servicing.go | 7 ------- pkg/provisioner/ironic/servicing_test.go | 3 ++- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index cab97554b0..5cc2e5fc00 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -106,13 +106,6 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, return result, started, err } - // Check if there are any pending updates - serviceSteps, err := p.buildServiceSteps(bmcAccess, data) - if err != nil { - result, err = operationFailed(err.Error()) - return result, started, err - } - switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.ServiceFail: // When servicing failed and user actually removed the specs (not just no updates calculated), diff --git a/pkg/provisioner/ironic/servicing_test.go b/pkg/provisioner/ironic/servicing_test.go index a728c709bc..2ce84a257a 100644 --- a/pkg/provisioner/ironic/servicing_test.go +++ b/pkg/provisioner/ironic/servicing_test.go @@ -164,13 +164,14 @@ func TestService(t *testing.T) { host.Status.Provisioning.ID = nodeUUID prepData := provisioner.ServicingData{} if !tc.skipConfig { - host.Spec.BMC.Address = "raid-test://test.bmc/" + host.Spec.BMC.Address = "bios-test://test.bmc/" prepData.ActualFirmwareSettings = metal3api.SettingsMap{ "Answer": "unknown", } prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{ "Answer": intstr.FromInt(42), } + prepData.HasFirmwareSettingsSpec = true } publisher := func(reason, message string) {} From 67a536935e2a49bc14a61fb13dea4e7eda01978e Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Wed, 18 Feb 2026 07:13:26 +0100 Subject: [PATCH 4/4] Refactor multi use abort implementation --- .../metal3.io/baremetalhost_controller.go | 33 ++++++++++-------- .../metal3.io/host_state_machine_test.go | 4 +++ pkg/provisioner/demo/demo.go | 6 ++++ pkg/provisioner/fixture/fixture.go | 6 ++++ pkg/provisioner/ironic/servicing.go | 34 ++++++++++--------- pkg/provisioner/ironic/servicing_test.go | 1 - pkg/provisioner/provisioner.go | 8 +++-- 7 files changed, 58 insertions(+), 34 deletions(-) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index eef7f25f8c..a80f77c942 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -1447,6 +1447,7 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner var hfcDirty bool var hfc *metal3api.HostFirmwareComponents var liveFirmwareSettingsAllowed, liveFirmwareUpdatesAllowed bool + var hasSettingsSpec, hasComponentsSpec bool if hup != nil { liveFirmwareSettingsAllowed = (hup.Spec.FirmwareSettings == metal3api.HostUpdatePolicyOnReboot) @@ -1471,8 +1472,8 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner servicingData.TargetFirmwareSettings = hfs.Spec.Settings } - // Track if HFS spec has actual settings - servicingData.HasFirmwareSettingsSpec = (hfs != nil && len(hfs.Spec.Settings) > 0) + // Track if HFS spec has actual settings (for abort decision) + hasSettingsSpec = (hfs != nil && len(hfs.Spec.Settings) > 0) } if liveFirmwareUpdatesAllowed { @@ -1490,8 +1491,8 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner } } - // Track if HFC spec has actual updates - servicingData.HasFirmwareComponentsSpec = (hfc != nil && len(hfc.Spec.Updates) > 0) + // Track if HFC spec has actual updates (for abort decision) + hasComponentsSpec = (hfc != nil && len(hfc.Spec.Updates) > 0) } hasChanges := fwDirty || hfsDirty || hfcDirty @@ -1502,24 +1503,28 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner return nil } - // If we're in a servicing error state and updates were removed from spec, - // let the provisioner handle the transition back to active - if info.host.Status.ErrorType == metal3api.ServicingError && !hasChanges { - info.log.Info("updates removed from spec while in servicing error state, attempting recovery") - provResult, _, err := prov.Service(servicingData, false, false) + // If we're in a servicing-related state and all specs have been removed, + // abort the servicing operation. Check both new (HFS/HFC) and old (Spec.Firmware) APIs. + hasFirmwareSpec := info.host.Spec.Firmware != nil + specsRemoved := !hasSettingsSpec && !hasComponentsSpec && !hasFirmwareSpec + inServicingState := info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing || + info.host.Status.ErrorType == metal3api.ServicingError + if specsRemoved && inServicingState { + info.log.Info("specs removed while in servicing state, attempting to abort") + provResult, _, err := prov.AbortServicing() if err != nil { - return actionError{fmt.Errorf("failed to recover from servicing error: %w", err)} + return actionError{fmt.Errorf("failed to abort servicing: %w", err)} } if provResult.ErrorMessage != "" { - info.log.Info("failed to recover from servicing error", "error", provResult.ErrorMessage) - return actionError{fmt.Errorf("failed to recover from servicing error: %s", provResult.ErrorMessage)} + info.log.Info("failed to abort servicing", "error", provResult.ErrorMessage) + return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)} } if provResult.Dirty { info.log.Info("abort operation in progress, checking back later") return actionContinue{provResult.RequeueAfter} } - // If abort completed and no error, we've successfully recovered - info.log.Info("successfully recovered from servicing error") + // Abort completed successfully, clear error state + info.log.Info("successfully aborted servicing") info.host.Status.ErrorType = "" info.host.Status.ErrorMessage = "" info.host.Status.OperationalStatus = metal3api.OperationalStatusOK diff --git a/internal/controller/metal3.io/host_state_machine_test.go b/internal/controller/metal3.io/host_state_machine_test.go index a2229c220d..c6bd7f8b5b 100644 --- a/internal/controller/metal3.io/host_state_machine_test.go +++ b/internal/controller/metal3.io/host_state_machine_test.go @@ -1366,6 +1366,10 @@ func (m *mockProvisioner) Service(_ provisioner.ServicingData, _ bool, _ bool) ( return m.getNextResultByMethod("Service"), m.nextResults["Service"].Dirty, err } +func (m *mockProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) { + return m.getNextResultByMethod("AbortServicing"), m.nextResults["AbortServicing"].Dirty, err +} + func (m *mockProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { return m.getNextResultByMethod("Adopt"), err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index 9234e99217..8b6f8beb59 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -234,6 +234,12 @@ func (p *demoProvisioner) Service(_ provisioner.ServicingData, unprepared bool, return } +// AbortServicing aborts an in-progress or failed servicing operation. +func (p *demoProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) { + p.log.Info("aborting servicing") + return +} + // Adopt notifies the provisioner that the state machine believes the host // to be currently provisioned, and that it should be managed as such. func (p *demoProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index 9ddea89533..6bf02b4ddd 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -255,6 +255,12 @@ func (p *fixtureProvisioner) Service(_ provisioner.ServicingData, unprepared boo return } +// AbortServicing aborts an in-progress or failed servicing operation. +func (p *fixtureProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) { + p.log.Info("aborting servicing") + return +} + // Adopt notifies the provisioner that the state machine believes the host // to be currently provisioned, and that it should be managed as such. func (p *fixtureProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index 5cc2e5fc00..37f7bdbb2f 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -76,7 +76,22 @@ func (p *ironicProvisioner) startServicing(bmcAccess bmc.AccessDetails, ironicNo return } -func (p *ironicProvisioner) abortServicing(ironicNode *nodes.Node) (result provisioner.Result, started bool, err error) { +// AbortServicing aborts an in-progress or failed servicing operation. +func (p *ironicProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) { + ironicNode, err := p.getNode() + if err != nil { + result, err = transientError(err) + return result, started, err + } + + // Only abort if in a servicing-related state + state := nodes.ProvisionState(ironicNode.ProvisionState) + if state != nodes.ServiceFail && state != nodes.ServiceWait { + p.log.Info("cannot abort servicing in current state", "state", ironicNode.ProvisionState) + result, err = operationComplete() + return result, started, err + } + // Clear maintenance flag first if it's set if ironicNode.Maintenance { p.log.Info("clearing maintenance flag before aborting servicing") @@ -85,7 +100,7 @@ func (p *ironicProvisioner) abortServicing(ironicNode *nodes.Node) (result provi } // Set started to let the controller know about the change - p.log.Info("aborting servicing due to removal of spec.updates/spec.settings") + p.log.Info("aborting servicing") started, result, err = p.tryChangeNodeProvisionState( ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetAbort}, @@ -108,14 +123,7 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.ServiceFail: - // When servicing failed and user actually removed the specs (not just no updates calculated), - // we need to abort the servicing operation to back out - if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { - p.log.Info("aborting servicing because spec.updates/spec.settings was removed") - return p.abortServicing(ironicNode) - } - - // When servicing failed and there are pending updates, we need to clean host provisioning settings + // When servicing failed, we need to clean host provisioning settings. // If restartOnFailure is false, it means the settings aren't cleared. if !restartOnFailure { result, err = operationFailed(ironicNode.LastError) @@ -150,12 +158,6 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, result, err = operationContinuing(provisionRequeueDelay) case nodes.ServiceWait: - // If user removed spec.updates/spec.settings while in ServiceWait, abort - if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { - p.log.Info("aborting servicing because spec.updates/spec.settings was removed") - return p.abortServicing(ironicNode) - } - p.log.Info("waiting for host to become active", "state", ironicNode.ProvisionState, "serviceStep", ironicNode.ServiceStep) diff --git a/pkg/provisioner/ironic/servicing_test.go b/pkg/provisioner/ironic/servicing_test.go index 2ce84a257a..c74d4eb54f 100644 --- a/pkg/provisioner/ironic/servicing_test.go +++ b/pkg/provisioner/ironic/servicing_test.go @@ -171,7 +171,6 @@ func TestService(t *testing.T) { prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{ "Answer": intstr.FromInt(42), } - prepData.HasFirmwareSettingsSpec = true } publisher := func(reason, message string) {} diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 7eb621187a..0156251f93 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -120,9 +120,6 @@ type ServicingData struct { TargetFirmwareSettings metal3api.DesiredSettingsMap ActualFirmwareSettings metal3api.SettingsMap TargetFirmwareComponents []metal3api.FirmwareUpdate - // Flags to track if specs exist (vs. just no updates calculated) - HasFirmwareSettingsSpec bool - HasFirmwareComponentsSpec bool } type ProvisionData struct { @@ -175,6 +172,11 @@ type Provisioner interface { // Servicing updates configuration for a provisioned host. Service(data ServicingData, unprepared, restartOnFailure bool) (result Result, started bool, err error) + // AbortServicing aborts an in-progress or failed servicing operation, + // returning the host to an active state. This is used when the user + // removes the firmware update/settings specifications. + AbortServicing() (result Result, started bool, err error) + // Provision writes the image from the host spec to the host. It // may be called multiple times, and should return true for its // dirty flag until the provisioning operation is completed.