Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 39 additions & 4 deletions internal/controller/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1447,6 +1447,7 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner
var hfcDirty bool
var hfc *metal3api.HostFirmwareComponents
var liveFirmwareSettingsAllowed, liveFirmwareUpdatesAllowed bool
var hasSettingsSpec, hasComponentsSpec bool

if hup != nil {
liveFirmwareSettingsAllowed = (hup.Spec.FirmwareSettings == metal3api.HostUpdatePolicyOnReboot)
Expand All @@ -1470,6 +1471,9 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner
servicingData.ActualFirmwareSettings = hfs.Status.Settings
servicingData.TargetFirmwareSettings = hfs.Spec.Settings
}

// Track if HFS spec has actual settings (for abort decision)
hasSettingsSpec = (hfs != nil && len(hfs.Spec.Settings) > 0)
}

if liveFirmwareUpdatesAllowed {
Expand All @@ -1486,6 +1490,9 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner
servicingData.TargetFirmwareComponents = hfc.Spec.Updates
}
}

// Track if HFC spec has actual updates (for abort decision)
hasComponentsSpec = (hfc != nil && len(hfc.Spec.Updates) > 0)
}

hasChanges := fwDirty || hfsDirty || hfcDirty
Expand All @@ -1496,6 +1503,34 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner
return nil
}

// If we're in a servicing-related state and all specs have been removed,
// abort the servicing operation. Check both new (HFS/HFC) and old (Spec.Firmware) APIs.
hasFirmwareSpec := info.host.Spec.Firmware != nil
specsRemoved := !hasSettingsSpec && !hasComponentsSpec && !hasFirmwareSpec
inServicingState := info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing ||
info.host.Status.ErrorType == metal3api.ServicingError
if specsRemoved && inServicingState {
info.log.Info("specs removed while in servicing state, attempting to abort")
provResult, _, err := prov.AbortServicing()
if err != nil {
return actionError{fmt.Errorf("failed to abort servicing: %w", err)}
}
if provResult.ErrorMessage != "" {
info.log.Info("failed to abort servicing", "error", provResult.ErrorMessage)
return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)}
}
if provResult.Dirty {
info.log.Info("abort operation in progress, checking back later")
return actionContinue{provResult.RequeueAfter}
}
// Abort completed successfully, clear error state
info.log.Info("successfully aborted servicing")
info.host.Status.ErrorType = ""
info.host.Status.ErrorMessage = ""
info.host.Status.OperationalStatus = metal3api.OperationalStatusOK
return actionComplete{}
}

// FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually
// succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the
// update didn't actually happen. This is deemed an acceptable risk for the moment since it is only
Expand Down Expand Up @@ -2148,7 +2183,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(info *reconcileInfo) (
}
if !valid {
info.log.Info("hostFirmwareSettings not valid", "namespacename", info.request.NamespacedName)
return false, nil, nil
return false, hfs, nil
}

if changed {
Expand All @@ -2162,7 +2197,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(info *reconcileInfo) (
}

info.log.Info("hostFirmwareSettings no updates", "namespacename", info.request.NamespacedName)
return false, nil, nil
return false, hfs, nil
}

// Get the stored firmware settings if there are valid changes.
Expand All @@ -2185,15 +2220,15 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(info *reconcileInfo)
}
if !valid {
info.log.Info("hostFirmwareComponents not valid", "namespacename", info.request.NamespacedName)
return false, nil, nil
return false, hfc, nil
}
if changed {
info.log.Info("hostFirmwareComponents indicating ChangeDetected", "namespacename", info.request.NamespacedName)
return true, hfc, nil
}

info.log.Info("hostFirmwareComponents no updates", "namespacename", info.request.NamespacedName)
return false, nil, nil
return false, hfc, nil
}

func (r *BareMetalHostReconciler) saveHostStatus(ctx context.Context, host *metal3api.BareMetalHost) error {
Expand Down
4 changes: 4 additions & 0 deletions internal/controller/metal3.io/host_state_machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1366,6 +1366,10 @@ func (m *mockProvisioner) Service(_ provisioner.ServicingData, _ bool, _ bool) (
return m.getNextResultByMethod("Service"), m.nextResults["Service"].Dirty, err
}

func (m *mockProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) {
return m.getNextResultByMethod("AbortServicing"), m.nextResults["AbortServicing"].Dirty, err
}

func (m *mockProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) {
return m.getNextResultByMethod("Adopt"), err
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/provisioner/demo/demo.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,12 @@ func (p *demoProvisioner) Service(_ provisioner.ServicingData, unprepared bool,
return
}

// AbortServicing aborts an in-progress or failed servicing operation.
func (p *demoProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) {
p.log.Info("aborting servicing")
return
}

// Adopt notifies the provisioner that the state machine believes the host
// to be currently provisioned, and that it should be managed as such.
func (p *demoProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) {
Expand Down
6 changes: 6 additions & 0 deletions pkg/provisioner/fixture/fixture.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,12 @@ func (p *fixtureProvisioner) Service(_ provisioner.ServicingData, unprepared boo
return
}

// AbortServicing aborts an in-progress or failed servicing operation.
func (p *fixtureProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) {
p.log.Info("aborting servicing")
return
}

// Adopt notifies the provisioner that the state machine believes the host
// to be currently provisioned, and that it should be managed as such.
func (p *fixtureProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) {
Expand Down
40 changes: 39 additions & 1 deletion pkg/provisioner/ironic/servicing.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,38 @@ func (p *ironicProvisioner) startServicing(bmcAccess bmc.AccessDetails, ironicNo
return
}

// AbortServicing aborts an in-progress or failed servicing operation.
func (p *ironicProvisioner) AbortServicing() (result provisioner.Result, started bool, err error) {
ironicNode, err := p.getNode()
if err != nil {
result, err = transientError(err)
return result, started, err
}

// Only abort if in a servicing-related state
state := nodes.ProvisionState(ironicNode.ProvisionState)
if state != nodes.ServiceFail && state != nodes.ServiceWait {
p.log.Info("cannot abort servicing in current state", "state", ironicNode.ProvisionState)
result, err = operationComplete()
return result, started, err
}

// Clear maintenance flag first if it's set
if ironicNode.Maintenance {
p.log.Info("clearing maintenance flag before aborting servicing")
result, err = p.setMaintenanceFlag(ironicNode, false, "")
return result, started, err
}

// Set started to let the controller know about the change
p.log.Info("aborting servicing")
started, result, err = p.tryChangeNodeProvisionState(
ironicNode,
nodes.ProvisionStateOpts{Target: nodes.TargetAbort},
)
return
}

func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, restartOnFailure bool) (result provisioner.Result, started bool, err error) {
bmcAccess, err := p.bmcAccess()
if err != nil {
Expand Down Expand Up @@ -119,7 +151,13 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared,
// Servicing finished
p.log.Info("servicing finished on the host")
result, err = operationComplete()
case nodes.Servicing, nodes.ServiceWait:
case nodes.Servicing:
p.log.Info("waiting for host to finish servicing",
"state", ironicNode.ProvisionState,
"serviceStep", ironicNode.ServiceStep)
result, err = operationContinuing(provisionRequeueDelay)

case nodes.ServiceWait:
p.log.Info("waiting for host to become active",
"state", ironicNode.ProvisionState,
"serviceStep", ironicNode.ServiceStep)
Expand Down
2 changes: 1 addition & 1 deletion pkg/provisioner/ironic/servicing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ func TestService(t *testing.T) {
host.Status.Provisioning.ID = nodeUUID
prepData := provisioner.ServicingData{}
if !tc.skipConfig {
host.Spec.BMC.Address = "raid-test://test.bmc/"
host.Spec.BMC.Address = "bios-test://test.bmc/"
prepData.ActualFirmwareSettings = metal3api.SettingsMap{
"Answer": "unknown",
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/provisioner/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,11 @@ type Provisioner interface {
// Servicing updates configuration for a provisioned host.
Service(data ServicingData, unprepared, restartOnFailure bool) (result Result, started bool, err error)

// AbortServicing aborts an in-progress or failed servicing operation,
// returning the host to an active state. This is used when the user
// removes the firmware update/settings specifications.
AbortServicing() (result Result, started bool, err error)

// Provision writes the image from the host spec to the host. It
// may be called multiple times, and should return true for its
// dirty flag until the provisioning operation is completed.
Expand Down