Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 11 additions & 13 deletions crates/api-core/src/dhcp/discover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,20 +270,18 @@ pub async fn discover_dhcp(
.await
.map_err(CarbideError::from)?
{
// Walk the host_nics list to see if there's a matching NIC (because it
// may have a static reservation need or a primary-interface need)
let mut declared_primary_mac: Option<MacAddress> = None;
for nic in &m.data.host_nics {
if nic.primary == Some(true) {
declared_primary_mac = Some(nic.mac_address);
}
if nic.mac_address == parsed_mac {
host_nic = Some(nic.clone());
}
}
if let Some(pmac) = declared_primary_mac {
is_primary_nic = Some(pmac == parsed_mac);
// The host's declared primary NIC (if any) decides whether this
// MAC is its boot interface; the matched NIC also carries any
// static reservation need handled below.
if let Some(declared_primary_mac) = m.data.declared_primary_mac() {
is_primary_nic = Some(declared_primary_mac == parsed_mac);
}
host_nic = m
.data
.host_nics
.iter()
.find(|nic| nic.mac_address == parsed_mac)
.cloned();
if let Some(ref nic) = host_nic
&& let Some(fixed_ip) = nic.fixed_ip
{
Expand Down
28 changes: 16 additions & 12 deletions crates/api-core/src/handlers/bmc_endpoint_explorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,12 @@ use crate::api::{Api, log_machine_id, log_request_data};
/// `predicted_machine_interfaces` instead: the predicted NIC's MAC and
/// recorded Redfish interface id form the same [`MachineBootInterface`] the
/// real row will hold once the lease promotes it. Predictions answer only
/// when unambiguous -- exactly one non-underlay prediction. Predictions hold
/// no primary flag, so with several (e.g. a host whose report lists SuperNICs
/// alongside the boot NIC) the declared `ExpectedHostNic.primary` cannot be
/// applied here; resolution refuses to guess and the action keeps requiring
/// an explicit MAC, which the matching prediction's recorded id completes.
/// when unambiguous -- exactly one non-underlay prediction. Predictions now
/// carry a `primary_interface` flag, but this resolver doesn't consult it yet,
/// so with several (e.g. a host whose report lists SuperNICs alongside the boot
/// NIC) the declared `ExpectedHostNic.primary` is not applied here; resolution
/// refuses to guess and the action keeps requiring an explicit MAC, which the
/// matching prediction's recorded id completes.
/// The machine-controller does not consult predictions at all yet -- its
/// boot states wait out this window -- a known follow-up.
///
Expand Down Expand Up @@ -118,8 +119,9 @@ fn resolve_admin_boot_interface_target(
}
// The rows offered no boot candidate: the machine's predicted
// NICs answer, but only when unambiguous -- exactly one
// non-underlay prediction. Predictions hold no primary flag, so
// with several the declared intent is unknowable here.
// non-underlay prediction. Predictions now carry a primary flag,
// but this resolver doesn't consult it yet, so with several the
// declared intent isn't applied here.
let mut bootable = candidates.predicted.iter().filter(|predicted| {
predicted.expected_network_segment_type != NetworkSegmentType::Underlay
});
Expand Down Expand Up @@ -1140,6 +1142,7 @@ mod tests {
mac_address: mac.parse().unwrap(),
expected_network_segment_type: NetworkSegmentType::HostInband,
boot_interface_id: boot_interface_id.map(String::from),
primary_interface: false,
}
}

Expand Down Expand Up @@ -1318,11 +1321,12 @@ mod tests {

#[test]
fn no_mac_multiple_predictions_refuse_to_guess_a_boot_device() {
// Predictions hold no primary flag, so with several (a report listing
// SuperNICs alongside the boot NIC) the declared intent is unknowable:
// resolution refuses to guess rather than silently programming boot
// order against whichever NIC sorts lowest. The operator's explicit
// MAC still resolves, completed from the matching prediction.
// These predictions are non-primary and this resolver doesn't consult
// the primary flag yet, so with several (a report listing SuperNICs
// alongside the boot NIC) the declared intent is unknowable: resolution
// refuses to guess rather than silently programming boot order against
// whichever NIC sorts lowest. The operator's explicit MAC still
// resolves, completed from the matching prediction.
let c = BootInterfaceCandidates {
interfaces: vec![],
predicted: vec![
Expand Down
76 changes: 76 additions & 0 deletions crates/api-core/src/tests/expected_machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2622,6 +2622,82 @@ async fn test_add_rejects_multiple_primary_host_nics(
Ok(())
}

/// The declared primary survives whichever order its NICs DHCP in: leasing the
/// non-primary NIC first, then the declared primary, still lands the declared
/// primary as `primary_interface` and the other as non-primary.
#[crate::sqlx_test]
async fn test_declared_primary_survives_dhcp_arrival_order(
pool: sqlx::PgPool,
) -> Result<(), Box<dyn std::error::Error>> {
let env = {
let mut config = get_config();
config.rack_management_enabled = true;
create_test_env_with_overrides(pool, TestEnvOverrides::with_config(config)).await
};
let bmc_mac: MacAddress = "9A:9B:9C:9D:9F:10".parse().unwrap();
let primary_mac: MacAddress = "9A:9B:9C:9D:9F:11".parse().unwrap();
let other_mac: MacAddress = "9A:9B:9C:9D:9F:12".parse().unwrap();

env.api
.add_expected_machine(tonic::Request::new(rpc::forge::ExpectedMachine {
id: None,
bmc_mac_address: bmc_mac.to_string(),
bmc_username: "ADMIN".into(),
bmc_password: "PASS".into(),
chassis_serial_number: "EM-PRIMARY-003".into(),
host_nics: vec![
rpc::forge::ExpectedHostNic {
mac_address: primary_mac.to_string(),
nic_type: Some("onboard".into()),
fixed_ip: None,
fixed_mask: None,
fixed_gateway: None,
primary: Some(true),
},
rpc::forge::ExpectedHostNic {
mac_address: other_mac.to_string(),
nic_type: Some("onboard".into()),
fixed_ip: None,
fixed_mask: None,
fixed_gateway: None,
primary: None,
},
],
..Default::default()
}))
.await?;

// The non-primary NIC leases first, then the declared primary.
for mac in [other_mac, primary_mac] {
let mac_str = mac.to_string();
env.api
.discover_dhcp(
common::rpc_builder::DhcpDiscovery::builder(
&mac_str,
common::api_fixtures::FIXTURE_DHCP_RELAY_ADDRESS,
)
.tonic_request(),
)
.await?;
}

let mut txn = env.pool.begin().await?;
let primary = db::machine_interface::find_by_mac_address(&mut *txn, primary_mac).await?;
let other = db::machine_interface::find_by_mac_address(&mut *txn, other_mac).await?;
assert_eq!(primary.len(), 1);
assert_eq!(other.len(), 1);
assert!(
primary[0].primary_interface,
"the declared primary NIC should be primary even when it leases last"
);
assert!(
!other[0].primary_interface,
"the non-declared NIC should not be primary"
);

Ok(())
}

/// Simple test to have some round-trip coverage for `ExpectedMachine.dpu_mode`
/// to make sure a `NicMode` setting makes it from the API to the DB and back
/// correctly. Verifies:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Carry the declared ExpectedHostNic.primary boot interface on the prediction so
-- it survives promotion into machine_interfaces (the predicted row previously
-- promoted as primary_interface = false unconditionally). Defaults false: a host
-- that declares nothing keeps today's automation -- the boot interface is chosen
-- by the pick_boot_interface fallback (lowest-MAC non-underlay) or DPU takeover.
ALTER TABLE predicted_machine_interfaces
ADD COLUMN primary_interface boolean NOT NULL DEFAULT false;

-- The machine_id foreign key has no backing index (Postgres does not create one
-- for FK columns), so find_by_machine_id scans the table. Add it now that
-- promotion reads predictions by machine more often.
CREATE INDEX predicted_machine_interfaces_machine_id_idx
ON predicted_machine_interfaces (machine_id);
114 changes: 67 additions & 47 deletions crates/api-db/src/machine_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -419,20 +419,12 @@ pub async fn find_one(
// newly_created_interface indicates that we couldn't find a
// MachineInterface, so created new one.
//
// `is_primary` integrates `ExpectedHostNic.primary` into machine
// interface creation. If True, this NIC is declared the primary
// boot NIC (which is/was the previous default behavior anyway,
// meaning None does the same thing), and this is fine, because
// at the end of the day, site-explorer will end up demoting it
// as part of attaching a DPU.
//
// Now, if it's *False*, there's a different NIC on this host declared
// as the boot NIC, so we actually overide the new interface and
// explicitly mark it as non-primary here. We *could* bake this in
// as part of validate_existing_mac_and_create, but since this is
// the only call-site that cares about it, I'm making it specific
// to here.
// TODO(chet): ...but consider plumbing it through.
// `is_primary` carries the declared `ExpectedHostNic.primary` for this MAC:
// `Some(true)` -- this NIC is the host's declared boot interface, `Some(false)`
// -- a different NIC is, `None` -- nothing was declared. On a newly created (and
// thus still machine-less) row we make that declaration stick, promoting to or
// demoting from the creation default as needed, so the boot interface is right
// from the first lease. `None` keeps the creation default.
//
// If we're not making a new interface, then existing interfaces
// are returned untouched.
Expand All @@ -456,10 +448,6 @@ pub async fn find_or_create_machine_interface(
%mac_address,
"Found no existing machine with mac address {mac_address} using networks with relays {relaystr}",
);
// validate_existing_mac_and_create hardcodes primary_interface: true
// at creation. If the caller has explicitly declared a *different*
// NIC as this machine's primary (i.e. is_primary == false), override the
// true/default here.
let mut interface = validate_existing_mac_and_create(
&mut *txn,
mac_address,
Expand All @@ -468,9 +456,22 @@ pub async fn find_or_create_machine_interface(
retained_window,
)
.await?;
if is_primary == Some(false) && interface.primary_interface {
set_primary_interface(&interface.id, false, &mut *txn).await?;
interface.primary_interface = false;
// Make the declaration authoritative on this machine-less row.
// `validate_existing_mac_and_create` defaults a freshly created row to
// primary, so the demote covers "a different NIC is declared primary"
// and the promote covers a row we *found* (rather than created) that is
// the declared primary. Safe on a NULL machine_id row: the
// one_primary_interface_per_machine index does not constrain it.
match is_primary {
Some(false) if interface.primary_interface => {
set_primary_interface(&interface.id, false, &mut *txn).await?;
interface.primary_interface = false;
}
Some(true) if !interface.primary_interface => {
set_primary_interface(&interface.id, true, &mut *txn).await?;
interface.primary_interface = true;
}
_ => {}
}
Ok(interface)
}
Expand Down Expand Up @@ -1457,33 +1458,52 @@ pub async fn move_predicted_machine_interface_to_machine(
);
}

let (machine_interface_id, current_boot_interface_id, row_created_here) = match existing_row {
// This host has already DHCP'd once and created a machine_interface;
// we will migrate it below.
Some(machine_interface_snapshot) => (
machine_interface_snapshot.id,
machine_interface_snapshot.boot_interface_id,
false,
),
None => {
// This host has never DHCP'd before, create a new machine_interface for it
// (`create` recovers any retained boot interface id onto it).
let machine_interface = create(
txn,
&[network_segment],
&predicted_machine_interface.mac_address,
let (machine_interface_id, current_boot_interface_id, current_primary, row_created_here) =
match existing_row {
// This host has already DHCP'd once and created a machine_interface;
// we will migrate it below.
Some(machine_interface_snapshot) => (
machine_interface_snapshot.id,
machine_interface_snapshot.boot_interface_id,
machine_interface_snapshot.primary_interface,
false,
AddressSelectionStrategy::NextAvailableIp,
retained_window,
)
.await?;
(
machine_interface.id,
machine_interface.boot_interface_id,
true,
)
}
};
),
None => {
// This host has never DHCP'd before, create a new machine_interface for it
// (`create` recovers any retained boot interface id onto it). The promoted row
// is primary exactly when the prediction carries the declared
// `ExpectedHostNic.primary`.
let machine_interface = create(
txn,
&[network_segment],
&predicted_machine_interface.mac_address,
predicted_machine_interface.primary_interface,
AddressSelectionStrategy::NextAvailableIp,
retained_window,
)
.await?;
(
machine_interface.id,
machine_interface.boot_interface_id,
machine_interface.primary_interface,
true,
)
}
};

// Land the declared boot interface as we promote: the prediction holds the
// host's declared `ExpectedHostNic.primary`, so a promoted interface is primary
// exactly when it was declared. (An anonymous row found here keeps whatever
// flag DHCP set, so reconcile it to the declaration.) Done before association
// so a row reaches its machine already carrying the right flag.
if current_primary != predicted_machine_interface.primary_interface {
set_primary_interface(
&machine_interface_id,
predicted_machine_interface.primary_interface,
&mut *txn,
)
.await?;
}

// Take either the newly-created interface or the anonymous one we found, and associate it with
// this machine.
Expand Down
3 changes: 2 additions & 1 deletion crates/api-db/src/predicted_machine_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,13 @@ pub async fn create(
value: NewPredictedMachineInterface<'_>,
txn: &mut PgConnection,
) -> Result<PredictedMachineInterface, DatabaseError> {
let query = "INSERT INTO predicted_machine_interfaces (machine_id, mac_address, expected_network_segment_type, boot_interface_id) VALUES ($1, $2, $3, $4) RETURNING *";
let query = "INSERT INTO predicted_machine_interfaces (machine_id, mac_address, expected_network_segment_type, boot_interface_id, primary_interface) VALUES ($1, $2, $3, $4, $5) RETURNING *";
sqlx::query_as(query)
.bind(value.machine_id)
.bind(value.mac_address)
.bind(value.expected_network_segment_type)
.bind(&value.boot_interface_id)
.bind(value.primary_interface)
.fetch_one(txn)
.await
.map_err(|e| DatabaseError::query(query, e))
Expand Down
Loading
Loading