Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions crates/machine-controller/src/boot_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,79 @@ pub fn boot_interface_target(
.boot_interface_mac()
.map(BootInterfaceTarget::MacOnly)
}

/// What a Redfish boot step should do with a host's boot interface.
///
/// Separates "not ready yet" from "broken". A zero-DPU host (`NoDpu` or
/// `NicMode`) boots from a plain NIC that takes its first HostInband lease only
/// after the host comes up, so until then it has no boot interface to
/// resolve -- the controller should wait, not fail. A host with managed DPUs
/// always has its DPU-facing primary set at promotion, so a missing boot
/// interface there is a genuine fault.
///
/// Values of this type are produced by [`resolve_boot_interface`], which
/// combines the result of `boot_interface_target` with whether the host has
/// managed DPUs.
#[derive(Debug)]
pub enum BootInterfaceResolution {
/// The boot interface resolved; target it.
///
/// Returned whenever a [`BootInterfaceTarget`] was found, regardless of
/// whether the host has managed DPUs.
Ready(BootInterfaceTarget),
/// A zero-DPU host whose boot NIC has not been discovered yet -- wait.
///
/// Returned when no boot interface was found and the host has no managed
/// DPUs. Callers are expected to retry later rather than report an error.
AwaitingNic,
/// A host that should already have a boot interface is missing one.
///
/// Returned when no boot interface was found even though the host has
/// managed DPUs. Callers are expected to treat this as a fault.
Missing,
}

/// Decide how a Redfish boot step should treat this host's boot interface.
///
/// Looks up the boot interface target from the snapshot and, together with
/// whether the host has managed DPUs, hands both to
/// `classify_boot_interface`. A missing interface is therefore reported as
/// either "wait for the NIC" (zero-DPU host) or "fault" (host with managed
/// DPUs) instead of a bare `None`.
pub fn resolve_boot_interface(mh_snapshot: &ManagedHostStateSnapshot) -> BootInterfaceResolution {
    // Gather both inputs up front, in the same order as before, so the
    // decision itself stays a pure function of plain values.
    let resolved_target = boot_interface_target(mh_snapshot);
    let host_has_dpus = mh_snapshot.has_managed_dpus();

    classify_boot_interface(resolved_target, host_has_dpus)
}

/// Pure decision logic for [`resolve_boot_interface`].
///
/// Kept separate from the snapshot lookup so it can be unit-tested with plain
/// values:
///
/// * a resolved interface is always `Ready`, whatever the DPU situation;
/// * no interface on a host without managed DPUs means `AwaitingNic`;
/// * no interface on a host with managed DPUs means `Missing`.
fn classify_boot_interface(
    boot_interface: Option<BootInterfaceTarget>,
    has_managed_dpus: bool,
) -> BootInterfaceResolution {
    // Matching on the pair keeps every combination visible and lets the
    // compiler check exhaustiveness without a catch-all arm.
    match (boot_interface, has_managed_dpus) {
        (Some(resolved), _) => BootInterfaceResolution::Ready(resolved),
        (None, false) => BootInterfaceResolution::AwaitingNic,
        (None, true) => BootInterfaceResolution::Missing,
    }
}

#[cfg(test)]
mod tests {
    use mac_address::MacAddress;

    use super::*;

    #[test]
    fn classify_waits_for_a_zero_dpu_host_without_a_boot_interface() {
        // No boot interface and no managed DPUs: the boot NIC simply has not
        // taken its first lease yet, so the answer is "wait", not "fault".
        let resolution = classify_boot_interface(None, false);

        assert!(matches!(resolution, BootInterfaceResolution::AwaitingNic));
    }

    #[test]
    fn classify_faults_when_a_dpu_host_has_no_boot_interface() {
        // With managed DPUs the DPU-facing primary is set at promotion, so
        // having no boot interface here is a genuine fault.
        let resolution = classify_boot_interface(None, true);

        assert!(matches!(resolution, BootInterfaceResolution::Missing));
    }

    #[test]
    fn classify_uses_the_resolved_interface_when_present() {
        // A resolved interface wins even on a host without managed DPUs.
        let mac = MacAddress::new([0, 0, 0, 0, 0, 1]);
        let resolution = classify_boot_interface(Some(BootInterfaceTarget::MacOnly(mac)), false);

        assert!(matches!(resolution, BootInterfaceResolution::Ready(_)));
    }
}
84 changes: 63 additions & 21 deletions crates/machine-controller/src/handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ use tokio::sync::Semaphore;
use tracing::instrument;
use version_compare::Cmp;

use crate::boot_interface::boot_interface_target;
use crate::boot_interface::{BootInterfaceResolution, resolve_boot_interface};
use crate::config::{
FirmwareGlobal, MachineStateHandlerSiteConfig, MachineValidationConfig, TimePeriod,
};
Expand Down Expand Up @@ -3339,6 +3339,7 @@ async fn handle_dpu_reprovision(
SetBootOrderOutcome::WaitingForReboot(reason) => {
Ok(StateHandlerOutcome::wait(reason))
}
SetBootOrderOutcome::Wait(reason) => Ok(StateHandlerOutcome::wait(reason)),
}
}
ReprovisionState::LockHostAfterBootRepair => {
Expand Down Expand Up @@ -3527,13 +3528,24 @@ async fn check_host_boot_config(
));
}

// Resolve the interface whose boot option should be first in host UEFI.
let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
StateHandlerError::GenericError(eyre::eyre!(
"Missing boot interface for host: {}",
mh_snapshot.host_snapshot.id
))
})?;
// Resolve the interface whose boot option should be first in host UEFI. A
// zero-DPU host whose boot NIC has not taken its first HostInband lease yet
// has no boot interface to resolve -- wait for it rather than failing.
let boot_interface = match resolve_boot_interface(mh_snapshot) {
BootInterfaceResolution::Ready(target) => target,
BootInterfaceResolution::AwaitingNic => {
return Ok(HostBootConfigDecision::Wait(format!(
"Waiting for zero-DPU host {} to discover its boot NIC before configuring boot.",
mh_snapshot.host_snapshot.id
)));
}
BootInterfaceResolution::Missing => {
return Err(StateHandlerError::GenericError(eyre::eyre!(
"Missing boot interface for host: {}",
mh_snapshot.host_snapshot.id
)));
}
};

let vendor = mh_snapshot.host_snapshot.bmc_vendor();

Expand Down Expand Up @@ -4835,6 +4847,10 @@ enum SetBootOrderOutcome {
Continue(SetBootOrderInfo),
Done,
WaitingForReboot(String),
/// No boot interface to act on yet -- e.g. a zero-DPU host whose boot NIC
/// has not been discovered. Distinct from `WaitingForReboot`: nothing was
/// rebooted, the caller just waits and retries.
Wait(String),
}

/// Decision from checking whether host boot repair is still required.
Expand Down Expand Up @@ -5237,6 +5253,9 @@ async fn handle_host_boot_order_setup(
SetBootOrderOutcome::WaitingForReboot(reason) => {
return Ok(StateHandlerOutcome::wait(reason));
}
SetBootOrderOutcome::Wait(reason) => {
return Ok(StateHandlerOutcome::wait(reason));
}
}
}
None => ManagedHostState::HostInit {
Expand Down Expand Up @@ -10876,6 +10895,9 @@ async fn handle_instance_host_platform_config(
SetBootOrderOutcome::WaitingForReboot(reason) => {
return Ok(StateHandlerOutcome::wait(reason));
}
SetBootOrderOutcome::Wait(reason) => {
return Ok(StateHandlerOutcome::wait(reason));
}
}
}
HostPlatformConfigurationState::LockHost => {
Expand Down Expand Up @@ -10922,13 +10944,24 @@ async fn set_host_boot_order(
// for verification.
//
// Resolve the boot NIC MAC the same way `CheckHostConfig` does,
// supporting hosts with DPU(s) and zero DPUs alike.
let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
StateHandlerError::GenericError(eyre::eyre!(
"Missing boot interface for host: {}",
mh_snapshot.host_snapshot.id
))
})?;
// supporting hosts with DPU(s) and zero DPUs alike. A zero-DPU host
// whose boot NIC has not taken its first HostInband lease yet has no
// boot interface to resolve -- wait for it rather than failing.
let boot_interface = match resolve_boot_interface(mh_snapshot) {
BootInterfaceResolution::Ready(target) => target,
BootInterfaceResolution::AwaitingNic => {
return Ok(SetBootOrderOutcome::Wait(format!(
"Waiting for zero-DPU host {} to discover its boot NIC before setting boot order.",
mh_snapshot.host_snapshot.id
)));
}
BootInterfaceResolution::Missing => {
return Err(StateHandlerError::GenericError(eyre::eyre!(
"Missing boot interface for host: {}",
mh_snapshot.host_snapshot.id
)));
}
};

let jid = match set_boot_order_dpu_first_and_handle_no_dpu_error(
redfish_client,
Expand Down Expand Up @@ -11207,12 +11240,21 @@ async fn set_host_boot_order(

let retry_count = set_boot_order_info.retry_count;

let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
StateHandlerError::GenericError(eyre::eyre!(
"Missing boot interface for host: {}",
mh_snapshot.host_snapshot.id
))
})?;
let boot_interface = match resolve_boot_interface(mh_snapshot) {
BootInterfaceResolution::Ready(target) => target,
BootInterfaceResolution::AwaitingNic => {
return Ok(SetBootOrderOutcome::Wait(format!(
"Waiting for zero-DPU host {} to discover its boot NIC before verifying boot order.",
mh_snapshot.host_snapshot.id
)));
}
BootInterfaceResolution::Missing => {
return Err(StateHandlerError::GenericError(eyre::eyre!(
"Missing boot interface for host: {}",
mh_snapshot.host_snapshot.id
)));
}
};

let boot_order_configured = boot_interface
.run(|bi| redfish_client.is_boot_order_setup(bi))
Expand Down
Loading