diff --git a/crates/machine-controller/src/boot_interface.rs b/crates/machine-controller/src/boot_interface.rs index 65b0184775..5c6c98256d 100644 --- a/crates/machine-controller/src/boot_interface.rs +++ b/crates/machine-controller/src/boot_interface.rs @@ -38,3 +38,79 @@ pub fn boot_interface_target( .boot_interface_mac() .map(BootInterfaceTarget::MacOnly) } + +/// What a Redfish boot step should do with a host's boot interface. +/// +/// Separates "not ready yet" from "broken". A zero-DPU host (`NoDpu` or +/// `NicMode`) boots from a plain NIC that takes its first HostInband lease only +/// after the host comes up, so until then it has no boot interface to +/// resolve -- the controller should wait, not fail. A host with managed DPUs +/// always has its DPU-facing primary set at promotion, so a missing boot +/// interface there is a genuine fault. +#[derive(Debug)] +pub enum BootInterfaceResolution { + /// The boot interface resolved; target it. + Ready(BootInterfaceTarget), + /// A zero-DPU host whose boot NIC has not been discovered yet -- wait. + AwaitingNic, + /// A host with managed DPUs is missing the boot interface it should already have -- a fault. + Missing, +} + +/// Resolve this host's boot interface for a Redfish boot step, classifying a +/// missing one as "wait for the NIC" (no managed DPUs) or "fault" (managed DPUs present). +pub fn resolve_boot_interface(mh_snapshot: &ManagedHostStateSnapshot) -> BootInterfaceResolution { + classify_boot_interface( + boot_interface_target(mh_snapshot), + mh_snapshot.has_managed_dpus(), + ) +} + +/// The decision behind [`resolve_boot_interface`], split out from the snapshot +/// lookup so it can be unit-tested directly. 
+fn classify_boot_interface( + boot_interface: Option<BootInterfaceTarget>, + has_managed_dpus: bool, +) -> BootInterfaceResolution { + match boot_interface { + Some(target) => BootInterfaceResolution::Ready(target), + None if !has_managed_dpus => BootInterfaceResolution::AwaitingNic, + None => BootInterfaceResolution::Missing, + } +} + +#[cfg(test)] +mod tests { + use mac_address::MacAddress; + + use super::*; + + #[test] + fn classify_waits_for_a_zero_dpu_host_without_a_boot_interface() { + // The zero-DPU host's boot NIC has not taken its first lease yet: wait + // for it instead of faulting. + assert!(matches!( + classify_boot_interface(None, false), + BootInterfaceResolution::AwaitingNic + )); + } + + #[test] + fn classify_faults_when_a_dpu_host_has_no_boot_interface() { + // A host with managed DPUs always has its DPU-facing primary set at + // promotion, so a missing boot interface is a real fault. + assert!(matches!( + classify_boot_interface(None, true), + BootInterfaceResolution::Missing + )); + } + + #[test] + fn classify_uses_the_resolved_interface_when_present() { + let target = BootInterfaceTarget::MacOnly(MacAddress::new([0, 0, 0, 0, 0, 1])); + assert!(matches!( + classify_boot_interface(Some(target), false), + BootInterfaceResolution::Ready(_) + )); + } +} diff --git a/crates/machine-controller/src/handler.rs b/crates/machine-controller/src/handler.rs index 626c8bfd87..97dd8bc1bd 100644 --- a/crates/machine-controller/src/handler.rs +++ b/crates/machine-controller/src/handler.rs @@ -98,7 +98,7 @@ use tokio::sync::Semaphore; use tracing::instrument; use version_compare::Cmp; -use crate::boot_interface::boot_interface_target; +use crate::boot_interface::{BootInterfaceResolution, resolve_boot_interface}; use crate::config::{ FirmwareGlobal, MachineStateHandlerSiteConfig, MachineValidationConfig, TimePeriod, }; @@ -3339,6 +3339,7 @@ async fn handle_dpu_reprovision( SetBootOrderOutcome::WaitingForReboot(reason) => { Ok(StateHandlerOutcome::wait(reason)) } + 
SetBootOrderOutcome::Wait(reason) => Ok(StateHandlerOutcome::wait(reason)), } } ReprovisionState::LockHostAfterBootRepair => { @@ -3527,13 +3528,24 @@ async fn check_host_boot_config( )); } - // Resolve the interface whose boot option should be first in host UEFI. - let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| { - StateHandlerError::GenericError(eyre::eyre!( - "Missing boot interface for host: {}", - mh_snapshot.host_snapshot.id - )) - })?; + // Resolve the interface whose boot option should be first in host UEFI. A + // zero-DPU host whose boot NIC has not taken its first HostInband lease yet + // has no boot interface to resolve -- wait for it rather than failing. + let boot_interface = match resolve_boot_interface(mh_snapshot) { + BootInterfaceResolution::Ready(target) => target, + BootInterfaceResolution::AwaitingNic => { + return Ok(HostBootConfigDecision::Wait(format!( + "Waiting for zero-DPU host {} to discover its boot NIC before configuring boot.", + mh_snapshot.host_snapshot.id + ))); + } + BootInterfaceResolution::Missing => { + return Err(StateHandlerError::GenericError(eyre::eyre!( + "Missing boot interface for host: {}", + mh_snapshot.host_snapshot.id + ))); + } + }; let vendor = mh_snapshot.host_snapshot.bmc_vendor(); @@ -4835,6 +4847,10 @@ enum SetBootOrderOutcome { Continue(SetBootOrderInfo), Done, WaitingForReboot(String), + /// No boot interface to act on yet -- e.g. a zero-DPU host whose boot NIC + /// has not been discovered. Distinct from `WaitingForReboot`: nothing was + /// rebooted, the caller just waits and retries. + Wait(String), } /// Decision from checking whether host boot repair is still required. 
@@ -5237,6 +5253,9 @@ async fn handle_host_boot_order_setup( SetBootOrderOutcome::WaitingForReboot(reason) => { return Ok(StateHandlerOutcome::wait(reason)); } + SetBootOrderOutcome::Wait(reason) => { + return Ok(StateHandlerOutcome::wait(reason)); + } } } None => ManagedHostState::HostInit { @@ -10876,6 +10895,9 @@ async fn handle_instance_host_platform_config( SetBootOrderOutcome::WaitingForReboot(reason) => { return Ok(StateHandlerOutcome::wait(reason)); } + SetBootOrderOutcome::Wait(reason) => { + return Ok(StateHandlerOutcome::wait(reason)); + } } } HostPlatformConfigurationState::LockHost => { @@ -10922,13 +10944,24 @@ async fn set_host_boot_order( // for verification. // // Resolve the boot NIC MAC the same way `CheckHostConfig` does, - // supporting hosts with DPU(s) and zero DPUs alike. - let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| { - StateHandlerError::GenericError(eyre::eyre!( - "Missing boot interface for host: {}", - mh_snapshot.host_snapshot.id - )) - })?; + // supporting hosts with DPU(s) and zero DPUs alike. A zero-DPU host + // whose boot NIC has not taken its first HostInband lease yet has no + // boot interface to resolve -- wait for it rather than failing. 
+ let boot_interface = match resolve_boot_interface(mh_snapshot) { + BootInterfaceResolution::Ready(target) => target, + BootInterfaceResolution::AwaitingNic => { + return Ok(SetBootOrderOutcome::Wait(format!( + "Waiting for zero-DPU host {} to discover its boot NIC before setting boot order.", + mh_snapshot.host_snapshot.id + ))); + } + BootInterfaceResolution::Missing => { + return Err(StateHandlerError::GenericError(eyre::eyre!( + "Missing boot interface for host: {}", + mh_snapshot.host_snapshot.id + ))); + } + }; let jid = match set_boot_order_dpu_first_and_handle_no_dpu_error( redfish_client, @@ -11207,12 +11240,21 @@ async fn set_host_boot_order( let retry_count = set_boot_order_info.retry_count; - let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| { - StateHandlerError::GenericError(eyre::eyre!( - "Missing boot interface for host: {}", - mh_snapshot.host_snapshot.id - )) - })?; + let boot_interface = match resolve_boot_interface(mh_snapshot) { + BootInterfaceResolution::Ready(target) => target, + BootInterfaceResolution::AwaitingNic => { + return Ok(SetBootOrderOutcome::Wait(format!( + "Waiting for zero-DPU host {} to discover its boot NIC before verifying boot order.", + mh_snapshot.host_snapshot.id + ))); + } + BootInterfaceResolution::Missing => { + return Err(StateHandlerError::GenericError(eyre::eyre!( + "Missing boot interface for host: {}", + mh_snapshot.host_snapshot.id + ))); + } + }; let boot_order_configured = boot_interface .run(|bi| redfish_client.is_boot_order_setup(bi))