From 1737378ecc1855e7f64d958944181d9d492afaa7 Mon Sep 17 00:00:00 2001 From: Chet Nichols III Date: Wed, 17 Jun 2026 23:25:42 -0700 Subject: [PATCH] feat: honor a declared primary host NIC when selecting the boot device ExpectedHostNic.primary is meant to say "this NIC is the host's boot interface," but ingestion ignored it: a zero-DPU or NIC-mode host's predicted interfaces always promoted as non-primary, so the declared NIC never stuck and the host booted via the lowest-MAC fallback until an operator ran set-primary-interface. The declared primary is now authoritative across the writers that own a NIC -- recorded on the prediction so it survives promotion, and honored by the DHCP path. This also settles the pre-ingestion window for multi-NIC hosts: several NICs that lease before ingestion each default to primary, and adopting the second one tripped the one_primary_interface_per_machine index and failed the host's ingestion. Ingestion now reconciles the adopted rows so exactly one interface is primary. - Record the declared primary on predicted_machine_interfaces and resolve it through one ExpectedMachineData::declared_primary_mac() the writers share; promotion and the DHCP find-or-create path both honor it. - Reconcile the adopted interfaces at zero-DPU ingestion to exactly one primary, ending the OnePrimaryInterface adoption failure. - Nothing declared -> today's automation stands: DPU takeover during ingestion, else the pick_boot_interface lowest-MAC fallback. Tests cover the declared primary surviving DHCP arrival order, promotion landing it on the real row, multi-NIC adoption no longer colliding, and a resolver unit test. Part of #2657 (epic #2660). The genuine-DPU-mode case -- a declared integrated NIC while the DPU stays managed -- is tracked separately in #2668. Signed-off-by: Chet Nichols III 
--- crates/api-core/src/dhcp/discover.rs | 24 +- .../src/handlers/bmc_endpoint_explorer.rs | 28 +- crates/api-core/src/tests/expected_machine.rs | 76 ++++++ ...17_predicted_machine_interface_primary.sql | 13 + crates/api-db/src/machine_interface.rs | 114 +++++---- .../api-db/src/predicted_machine_interface.rs | 3 +- crates/api-model/src/expected_machine.rs | 52 ++++ .../src/predicted_machine_interface.rs | 8 + crates/site-explorer/src/machine_creator.rs | 24 +- crates/site-explorer/tests/zero_dpu.rs | 241 +++++++++++++++++- 10 files changed, 508 insertions(+), 75 deletions(-) create mode 100644 crates/api-db/migrations/20260616215917_predicted_machine_interface_primary.sql diff --git a/crates/api-core/src/dhcp/discover.rs b/crates/api-core/src/dhcp/discover.rs index 9a82226ec6..34a8a9b018 100644 --- a/crates/api-core/src/dhcp/discover.rs +++ b/crates/api-core/src/dhcp/discover.rs @@ -270,20 +270,18 @@ pub async fn discover_dhcp( .await .map_err(CarbideError::from)? { - // Walk the host_nics list to see if there's a matching NIC (because it - // may have a static reservation need or a primary-interface need) - let mut declared_primary_mac: Option = None; - for nic in &m.data.host_nics { - if nic.primary == Some(true) { - declared_primary_mac = Some(nic.mac_address); - } - if nic.mac_address == parsed_mac { - host_nic = Some(nic.clone()); - } - } - if let Some(pmac) = declared_primary_mac { - is_primary_nic = Some(pmac == parsed_mac); + // The host's declared primary NIC (if any) decides whether this + // MAC is its boot interface; the matched NIC also carries any + // static reservation need handled below. 
+ if let Some(declared_primary_mac) = m.data.declared_primary_mac() { + is_primary_nic = Some(declared_primary_mac == parsed_mac); } + host_nic = m + .data + .host_nics + .iter() + .find(|nic| nic.mac_address == parsed_mac) + .cloned(); if let Some(ref nic) = host_nic && let Some(fixed_ip) = nic.fixed_ip { diff --git a/crates/api-core/src/handlers/bmc_endpoint_explorer.rs b/crates/api-core/src/handlers/bmc_endpoint_explorer.rs index e8742e3e88..48189a215e 100644 --- a/crates/api-core/src/handlers/bmc_endpoint_explorer.rs +++ b/crates/api-core/src/handlers/bmc_endpoint_explorer.rs @@ -54,11 +54,12 @@ use crate::api::{Api, log_machine_id, log_request_data}; /// `predicted_machine_interfaces` instead: the predicted NIC's MAC and /// recorded Redfish interface id form the same [`MachineBootInterface`] the /// real row will hold once the lease promotes it. Predictions answer only -/// when unambiguous -- exactly one non-underlay prediction. Predictions hold -/// no primary flag, so with several (e.g. a host whose report lists SuperNICs -/// alongside the boot NIC) the declared `ExpectedHostNic.primary` cannot be -/// applied here; resolution refuses to guess and the action keeps requiring -/// an explicit MAC, which the matching prediction's recorded id completes. +/// when unambiguous -- exactly one non-underlay prediction. Predictions now +/// carry a `primary_interface` flag, but this resolver doesn't consult it yet, +/// so with several (e.g. a host whose report lists SuperNICs alongside the boot +/// NIC) the declared `ExpectedHostNic.primary` is not applied here; resolution +/// refuses to guess and the action keeps requiring an explicit MAC, which the +/// matching prediction's recorded id completes. /// The machine-controller does not consult predictions at all yet -- its /// boot states wait out this window -- a known follow-up. 
/// @@ -118,8 +119,9 @@ fn resolve_admin_boot_interface_target( } // The rows offered no boot candidate: the machine's predicted // NICs answer, but only when unambiguous -- exactly one - // non-underlay prediction. Predictions hold no primary flag, so - // with several the declared intent is unknowable here. + // non-underlay prediction. Predictions now carry a primary flag, + // but this resolver doesn't consult it yet, so with several the + // declared intent isn't applied here. let mut bootable = candidates.predicted.iter().filter(|predicted| { predicted.expected_network_segment_type != NetworkSegmentType::Underlay }); @@ -1140,6 +1142,7 @@ mod tests { mac_address: mac.parse().unwrap(), expected_network_segment_type: NetworkSegmentType::HostInband, boot_interface_id: boot_interface_id.map(String::from), + primary_interface: false, } } @@ -1318,11 +1321,12 @@ mod tests { #[test] fn no_mac_multiple_predictions_refuse_to_guess_a_boot_device() { - // Predictions hold no primary flag, so with several (a report listing - // SuperNICs alongside the boot NIC) the declared intent is unknowable: - // resolution refuses to guess rather than silently programming boot - // order against whichever NIC sorts lowest. The operator's explicit - // MAC still resolves, completed from the matching prediction. + // These predictions are non-primary and this resolver doesn't consult + // the primary flag yet, so with several (a report listing SuperNICs + // alongside the boot NIC) the declared intent is unknowable: resolution + // refuses to guess rather than silently programming boot order against + // whichever NIC sorts lowest. The operator's explicit MAC still + // resolves, completed from the matching prediction. 
let c = BootInterfaceCandidates { interfaces: vec![], predicted: vec![ diff --git a/crates/api-core/src/tests/expected_machine.rs b/crates/api-core/src/tests/expected_machine.rs index 4a1b763246..35805258dc 100644 --- a/crates/api-core/src/tests/expected_machine.rs +++ b/crates/api-core/src/tests/expected_machine.rs @@ -2622,6 +2622,82 @@ async fn test_add_rejects_multiple_primary_host_nics( Ok(()) } +/// The declared primary survives whichever order its NICs DHCP in: leasing the +/// non-primary NIC first, then the declared primary, still lands the declared +/// primary as `primary_interface` and the other as non-primary. +#[crate::sqlx_test] +async fn test_declared_primary_survives_dhcp_arrival_order( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = { + let mut config = get_config(); + config.rack_management_enabled = true; + create_test_env_with_overrides(pool, TestEnvOverrides::with_config(config)).await + }; + let bmc_mac: MacAddress = "9A:9B:9C:9D:9F:10".parse().unwrap(); + let primary_mac: MacAddress = "9A:9B:9C:9D:9F:11".parse().unwrap(); + let other_mac: MacAddress = "9A:9B:9C:9D:9F:12".parse().unwrap(); + + env.api + .add_expected_machine(tonic::Request::new(rpc::forge::ExpectedMachine { + id: None, + bmc_mac_address: bmc_mac.to_string(), + bmc_username: "ADMIN".into(), + bmc_password: "PASS".into(), + chassis_serial_number: "EM-PRIMARY-003".into(), + host_nics: vec![ + rpc::forge::ExpectedHostNic { + mac_address: primary_mac.to_string(), + nic_type: Some("onboard".into()), + fixed_ip: None, + fixed_mask: None, + fixed_gateway: None, + primary: Some(true), + }, + rpc::forge::ExpectedHostNic { + mac_address: other_mac.to_string(), + nic_type: Some("onboard".into()), + fixed_ip: None, + fixed_mask: None, + fixed_gateway: None, + primary: None, + }, + ], + ..Default::default() + })) + .await?; + + // The non-primary NIC leases first, then the declared primary. 
+ for mac in [other_mac, primary_mac] { + let mac_str = mac.to_string(); + env.api + .discover_dhcp( + common::rpc_builder::DhcpDiscovery::builder( + &mac_str, + common::api_fixtures::FIXTURE_DHCP_RELAY_ADDRESS, + ) + .tonic_request(), + ) + .await?; + } + + let mut txn = env.pool.begin().await?; + let primary = db::machine_interface::find_by_mac_address(&mut *txn, primary_mac).await?; + let other = db::machine_interface::find_by_mac_address(&mut *txn, other_mac).await?; + assert_eq!(primary.len(), 1); + assert_eq!(other.len(), 1); + assert!( + primary[0].primary_interface, + "the declared primary NIC should be primary even when it leases last" + ); + assert!( + !other[0].primary_interface, + "the non-declared NIC should not be primary" + ); + + Ok(()) +} + /// Simple test to have some round-trip coverage for `ExpectedMachine.dpu_mode` /// to make sure a `NicMode` setting makes it from the API to the DB and back /// correctly. Verifies: diff --git a/crates/api-db/migrations/20260616215917_predicted_machine_interface_primary.sql b/crates/api-db/migrations/20260616215917_predicted_machine_interface_primary.sql new file mode 100644 index 0000000000..214cd49b33 --- /dev/null +++ b/crates/api-db/migrations/20260616215917_predicted_machine_interface_primary.sql @@ -0,0 +1,13 @@ +-- Carry the declared ExpectedHostNic.primary boot interface on the prediction so +-- it survives promotion into machine_interfaces (the predicted row previously +-- promoted as primary_interface = false unconditionally). Defaults false: a host +-- that declares nothing keeps today's automation -- the boot interface is chosen +-- by the pick_boot_interface fallback (lowest-MAC non-underlay) or DPU takeover. +ALTER TABLE predicted_machine_interfaces + ADD COLUMN primary_interface boolean NOT NULL DEFAULT false; + +-- The machine_id foreign key has no backing index (Postgres does not create one +-- for FK columns), so find_by_machine_id scans the table. 
Add it now that +-- promotion reads predictions by machine more often. +CREATE INDEX predicted_machine_interfaces_machine_id_idx + ON predicted_machine_interfaces (machine_id); diff --git a/crates/api-db/src/machine_interface.rs b/crates/api-db/src/machine_interface.rs index 5d38641550..24086cd2bd 100644 --- a/crates/api-db/src/machine_interface.rs +++ b/crates/api-db/src/machine_interface.rs @@ -419,20 +419,12 @@ pub async fn find_one( // newly_created_interface indicates that we couldn't find a // MachineInterface, so created new one. // -// `is_primary` integrates `ExpectedHostNic.primary` into machine -// interface creation. If True, this NIC is declared the primary -// boot NIC (which is/was the previous default behavior anyway, -// meaning None does the same thing), and this is fine, because -// at the end of the day, site-explorer will end up demoting it -// as part of attaching a DPU. -// -// Now, if it's *False*, there's a different NIC on this host declared -// as the boot NIC, so we actually overide the new interface and -// explicitly mark it as non-primary here. We *could* bake this in -// as part of validate_existing_mac_and_create, but since this is -// the only call-site that cares about it, I'm making it specific -// to here. -// TODO(chet): ...but consider plumbing it through. +// `is_primary` carries the declared `ExpectedHostNic.primary` for this MAC: +// `Some(true)` -- this NIC is the host's declared boot interface, `Some(false)` +// -- a different NIC is, `None` -- nothing was declared. On a newly created (and +// thus still machine-less) row we make that declaration stick, promoting to or +// demoting from the creation default as needed, so the boot interface is right +// from the first lease. `None` keeps the creation default. // // If we're not making a new interface, then existing interfaces // are returned untouched. 
@@ -456,10 +448,6 @@ pub async fn find_or_create_machine_interface( %mac_address, "Found no existing machine with mac address {mac_address} using networks with relays {relaystr}", ); - // validate_existing_mac_and_create hardcodes primary_interface: true - // at creation. If the caller has explicitly declared a *different* - // NIC as this machine's primary (i.e. is_primary == false), override the - // true/default here. let mut interface = validate_existing_mac_and_create( &mut *txn, mac_address, @@ -468,9 +456,22 @@ pub async fn find_or_create_machine_interface( retained_window, ) .await?; - if is_primary == Some(false) && interface.primary_interface { - set_primary_interface(&interface.id, false, &mut *txn).await?; - interface.primary_interface = false; + // Make the declaration authoritative on this machine-less row. + // `validate_existing_mac_and_create` defaults a freshly created row to + // primary, so the demote covers "a different NIC is declared primary" + // and the promote covers a row we *found* (rather than created) that is + // the declared primary. Safe on a NULL machine_id row: the + // one_primary_interface_per_machine index does not constrain it. + match is_primary { + Some(false) if interface.primary_interface => { + set_primary_interface(&interface.id, false, &mut *txn).await?; + interface.primary_interface = false; + } + Some(true) if !interface.primary_interface => { + set_primary_interface(&interface.id, true, &mut *txn).await?; + interface.primary_interface = true; + } + _ => {} } Ok(interface) } @@ -1457,33 +1458,52 @@ pub async fn move_predicted_machine_interface_to_machine( ); } - let (machine_interface_id, current_boot_interface_id, row_created_here) = match existing_row { - // This host has already DHCP'd once and created a machine_interface; - // we will migrate it below. 
- Some(machine_interface_snapshot) => ( - machine_interface_snapshot.id, - machine_interface_snapshot.boot_interface_id, - false, - ), - None => { - // This host has never DHCP'd before, create a new machine_interface for it - // (`create` recovers any retained boot interface id onto it). - let machine_interface = create( - txn, - &[network_segment], - &predicted_machine_interface.mac_address, + let (machine_interface_id, current_boot_interface_id, current_primary, row_created_here) = + match existing_row { + // This host has already DHCP'd once and created a machine_interface; + // we will migrate it below. + Some(machine_interface_snapshot) => ( + machine_interface_snapshot.id, + machine_interface_snapshot.boot_interface_id, + machine_interface_snapshot.primary_interface, false, - AddressSelectionStrategy::NextAvailableIp, - retained_window, - ) - .await?; - ( - machine_interface.id, - machine_interface.boot_interface_id, - true, - ) - } - }; + ), + None => { + // This host has never DHCP'd before, create a new machine_interface for it + // (`create` recovers any retained boot interface id onto it). The promoted row + // is primary exactly when the prediction carries the declared + // `ExpectedHostNic.primary`. + let machine_interface = create( + txn, + &[network_segment], + &predicted_machine_interface.mac_address, + predicted_machine_interface.primary_interface, + AddressSelectionStrategy::NextAvailableIp, + retained_window, + ) + .await?; + ( + machine_interface.id, + machine_interface.boot_interface_id, + machine_interface.primary_interface, + true, + ) + } + }; + + // Land the declared boot interface as we promote: the prediction holds the + // host's declared `ExpectedHostNic.primary`, so a promoted interface is primary + // exactly when it was declared. (An anonymous row found here keeps whatever + // flag DHCP set, so reconcile it to the declaration.) Done before association + // so a row reaches its machine already carrying the right flag. 
+ if current_primary != predicted_machine_interface.primary_interface { + set_primary_interface( + &machine_interface_id, + predicted_machine_interface.primary_interface, + &mut *txn, + ) + .await?; + } // Take either the newly-created interface or the anonymous one we found, and associate it with // this machine. diff --git a/crates/api-db/src/predicted_machine_interface.rs b/crates/api-db/src/predicted_machine_interface.rs index 5a07591639..597e617190 100644 --- a/crates/api-db/src/predicted_machine_interface.rs +++ b/crates/api-db/src/predicted_machine_interface.rs @@ -112,12 +112,13 @@ pub async fn create( value: NewPredictedMachineInterface<'_>, txn: &mut PgConnection, ) -> Result { - let query = "INSERT INTO predicted_machine_interfaces (machine_id, mac_address, expected_network_segment_type, boot_interface_id) VALUES ($1, $2, $3, $4) RETURNING *"; + let query = "INSERT INTO predicted_machine_interfaces (machine_id, mac_address, expected_network_segment_type, boot_interface_id, primary_interface) VALUES ($1, $2, $3, $4, $5) RETURNING *"; sqlx::query_as(query) .bind(value.machine_id) .bind(value.mac_address) .bind(value.expected_network_segment_type) .bind(&value.boot_interface_id) + .bind(value.primary_interface) .fetch_one(txn) .await .map_err(|e| DatabaseError::query(query, e)) diff --git a/crates/api-model/src/expected_machine.rs b/crates/api-model/src/expected_machine.rs index 83cc3be54c..3a409e20cf 100644 --- a/crates/api-model/src/expected_machine.rs +++ b/crates/api-model/src/expected_machine.rs @@ -170,6 +170,22 @@ pub struct ExpectedMachineData { // unless you want to go update all the files in each production deployment that autoload // the expected machines on api startup +impl ExpectedMachineData { + /// The MAC the operator declared as this host's boot interface via + /// `ExpectedHostNic.primary`. 
This is the single source of declared boot + /// intent the writers consult -- site-explorer ingestion, DHCP, and + /// prediction promotion -- so they all agree on which NIC wins. The API + /// enforces at most one `primary` host NIC, so the first match is the + /// declaration. `None` leaves the boot interface to today's automation + /// (DPU takeover during ingestion, else the `pick_boot_interface` fallback). + pub fn declared_primary_mac(&self) -> Option { + self.host_nics + .iter() + .find(|nic| nic.primary == Some(true)) + .map(|nic| nic.mac_address) + } +} + /// Per-host lifecycle profile for settings that affect state-machine progression. /// `Option` fields support CLI patch semantics (`None` = not specified, /// keep existing DB value via `COALESCE`). Converts to the runtime `HostProfile` @@ -427,4 +443,40 @@ mod tests { }; assert!(!hlp.is_empty()); } + + /// `declared_primary_mac` returns the MAC of the one NIC flagged + /// `primary: Some(true)`, and `None` when nothing is declared. `primary: + /// Some(false)` is an explicit non-primary, not a declaration. + #[test] + fn declared_primary_mac_returns_the_flagged_nic() { + let mac_a: MacAddress = "AA:BB:CC:00:00:01".parse().unwrap(); + let mac_b: MacAddress = "AA:BB:CC:00:00:02".parse().unwrap(); + + let nic = |mac: MacAddress, primary: Option| ExpectedHostNic { + mac_address: mac, + primary, + ..Default::default() + }; + + // Nothing declared -- empty, or only explicit non-primaries. + assert_eq!(ExpectedMachineData::default().declared_primary_mac(), None); + assert_eq!( + ExpectedMachineData { + host_nics: vec![nic(mac_a, None), nic(mac_b, Some(false))], + ..Default::default() + } + .declared_primary_mac(), + None + ); + + // The declared NIC wins. 
+ assert_eq!( + ExpectedMachineData { + host_nics: vec![nic(mac_a, Some(false)), nic(mac_b, Some(true))], + ..Default::default() + } + .declared_primary_mac(), + Some(mac_b) + ); + } } diff --git a/crates/api-model/src/predicted_machine_interface.rs b/crates/api-model/src/predicted_machine_interface.rs index a7e001d85c..eada95c876 100644 --- a/crates/api-model/src/predicted_machine_interface.rs +++ b/crates/api-model/src/predicted_machine_interface.rs @@ -32,6 +32,12 @@ pub struct PredictedMachineInterface { /// MAC, handed to the `machine_interfaces` row at DHCP promotion so /// the host's boot target is a full pair from its first owned interface. pub boot_interface_id: Option, + /// The declared `ExpectedHostNic.primary` intent, carried so promotion into + /// `machine_interfaces` lands the operator's chosen boot interface as + /// `primary_interface`. `false` when nothing is declared -- promotion then + /// leaves the row non-primary and the boot interface falls to the + /// `pick_boot_interface` automation. + pub primary_interface: bool, } impl PredictedMachineInterface { @@ -50,4 +56,6 @@ pub struct NewPredictedMachineInterface<'a> { pub mac_address: MacAddress, pub expected_network_segment_type: NetworkSegmentType, pub boot_interface_id: Option, + /// See [`PredictedMachineInterface::primary_interface`]. + pub primary_interface: bool, } diff --git a/crates/site-explorer/src/machine_creator.rs b/crates/site-explorer/src/machine_creator.rs index 083490ceda..53346c876a 100644 --- a/crates/site-explorer/src/machine_creator.rs +++ b/crates/site-explorer/src/machine_creator.rs @@ -418,9 +418,19 @@ impl MachineCreator { self.create_machine_from_explored_managed_host(txn, managed_host, machine_id, machine_data) .await?; + // Settle this host's single boot interface as we take ownership: the + // declared `ExpectedHostNic.primary` (if any) is the host's primary, and + // every other NIC is non-primary. 
Routing both the already-leased rows and + // the freshly-minted predictions through the same declaration makes the + // choice authoritative regardless of DHCP arrival order, and keeps exactly + // one primary per machine -- so adopting several NICs that leased before + // ingestion never trips the `one_primary_interface_per_machine` index. + let declared_primary = machine_data.and_then(|data| data.declared_primary_mac()); + // Create and attach a non-DPU machine_interface to the host for every MAC address we see in // the exploration report for mac_address in mac_addresses { + let is_declared_primary = declared_primary == Some(mac_address); if let Some(machine_interface) = db::machine_interface::find_by_mac_address(&mut *txn, mac_address) .await? @@ -443,7 +453,18 @@ impl MachineCreator { id: mac_address.to_string(), }); } else { - // ...If it has no MachineId, the host must have DHCP'd before site-explorer ran. Set it to the new machine ID. + // ...If it has no MachineId, the host must have DHCP'd before site-explorer ran. + // Reconcile its primary flag to the declaration before adopting it: an anonymous + // DHCP row defaults to primary=true, so without this two pre-ingestion leases + // would both arrive primary and collide on association. 
+ if machine_interface.primary_interface != is_declared_primary { + db::machine_interface::set_primary_interface( + &machine_interface.id, + is_declared_primary, + txn, + ) + .await?; + } tracing::info!(%mac_address, %machine_id, "Migrating unowned machine_interface to new managed host"); db::machine_interface::associate_interface_with_machine( &machine_interface.id, @@ -471,6 +492,7 @@ impl MachineCreator { mac_address, expected_network_segment_type: NetworkSegmentType::HostInband, boot_interface_id, + primary_interface: is_declared_primary, }, txn, ) diff --git a/crates/site-explorer/tests/zero_dpu.rs b/crates/site-explorer/tests/zero_dpu.rs index 83a8295f72..88ea0cf1c5 100644 --- a/crates/site-explorer/tests/zero_dpu.rs +++ b/crates/site-explorer/tests/zero_dpu.rs @@ -26,7 +26,7 @@ use carbide_test_harness::network::segment::TestNetworkSegment; use carbide_test_harness::prelude::*; use carbide_test_harness::test_support::fixture_config::FixtureDefault as _; use mac_address::MacAddress; -use model::expected_machine::{DpuMode, ExpectedMachine, ExpectedMachineData}; +use model::expected_machine::{DpuMode, ExpectedHostNic, ExpectedMachine, ExpectedMachineData}; use model::test_support::ManagedHostConfig; struct ZeroDpuEnv { @@ -563,3 +563,242 @@ async fn test_exploration_refreshes_pending_predicted_boot_interface_id( Ok(()) } + +/// Two NICs on a zero-DPU host that lease before site-explorer ingests it both +/// land as anonymous primary rows (the DHCP creation default). With no declared +/// primary, ingestion still adopts both -- demoting the extras so at most one +/// primary survives -- rather than colliding on the +/// `one_primary_interface_per_machine` index. (Regression: a second primary +/// adoption previously failed the host's ingestion outright.) 
+#[sqlx_test] +async fn test_zero_dpu_multi_nic_no_declaration_adopts_without_primary_collision( + pool: PgPool, +) -> Result<(), Box> { + let env = init(pool).await; + let nic_a = MacAddress::from_str("d4:04:e6:84:20:01").unwrap(); + let nic_b = MacAddress::from_str("d4:04:e6:84:20:02").unwrap(); + let mock_host = ManagedHostConfig { + dpus: vec![], + non_dpu_macs: vec![nic_a, nic_b], + ..ManagedHostConfig::default() + }; + // No declared primary: NoDpu with empty host_nics. + register_zero_dpu_expected_machine(&env, &mock_host).await?; + + // Both NICs lease before site-explorer runs -> two anonymous primary rows. + for nic in [nic_a, nic_b] { + env.api() + .discover_dhcp( + rpc::forge::DhcpDiscovery::builder(nic, env.host_inband_segment.relay_address) + .vendor_string("Bluefield") + .tonic_request(), + ) + .await?; + } + + let mut txn = env.pool.begin().await?; + for nic in [nic_a, nic_b] { + let row = db::machine_interface::find_by_mac_address(txn.as_mut(), nic) + .await? + .into_iter() + .next() + .expect("pre-ingestion DHCP should have created an anonymous row"); + assert!( + row.machine_id.is_none(), + "the row should be anonymous before ingestion" + ); + assert!( + row.primary_interface, + "anonymous DHCP rows default to primary" + ); + } + txn.rollback().await?; + + // Ingest the host. Adoption must reconcile the two primary rows. + let host_bmc_response = env + .api() + .discover_dhcp( + rpc::forge::DhcpDiscovery::builder( + mock_host.bmc_mac_address, + env.underlay_segment.relay_address, + ) + .vendor_string("SomeVendor") + .tonic_request(), + ) + .await? + .into_inner(); + let host_bmc_ip = host_bmc_response.address.parse()?; + env.site_explorer.insert_endpoints( + mock_host + .exploration_results(Some(host_bmc_ip), &[])? 
+ .into_endpoints(), + ); + env.site_explorer.run_single_iteration().await?; + let mut txn = env.pool.begin().await?; + db::explored_endpoints::set_preingestion_complete(host_bmc_ip, &mut txn).await?; + txn.commit().await?; + env.site_explorer.run_single_iteration().await?; + + // Both NICs are now owned by the same host, with at most one primary. + let mut txn = env.pool.begin().await?; + let row_a = db::machine_interface::find_by_mac_address(txn.as_mut(), nic_a) + .await? + .into_iter() + .next() + .expect("nic_a should still have a row after ingestion"); + let row_b = db::machine_interface::find_by_mac_address(txn.as_mut(), nic_b) + .await? + .into_iter() + .next() + .expect("nic_b should still have a row after ingestion"); + txn.rollback().await?; + let machine_a = row_a + .machine_id + .expect("ingestion should have adopted nic_a onto the host"); + let machine_b = row_b + .machine_id + .expect("ingestion should have adopted nic_b onto the host"); + assert_eq!( + machine_a, machine_b, + "both NICs should be adopted by the same host machine" + ); + let primary_count = [row_a.primary_interface, row_b.primary_interface] + .into_iter() + .filter(|primary| *primary) + .count(); + assert!( + primary_count <= 1, + "at most one interface may be primary after adoption, got {primary_count}" + ); + + Ok(()) +} + +/// A zero-DPU host that declares one of its NICs `primary` mints that intent +/// onto the prediction, and DHCP promotion lands it: the declared NIC promotes +/// as primary and the other as non-primary -- even when the non-declared NIC +/// leases first. 
+#[sqlx_test] +async fn test_zero_dpu_declared_primary_promotes_as_primary( + pool: PgPool, +) -> Result<(), Box> { + let env = init(pool).await; + let primary_nic = MacAddress::from_str("d4:04:e6:84:21:01").unwrap(); + let other_nic = MacAddress::from_str("d4:04:e6:84:21:02").unwrap(); + let mock_host = ManagedHostConfig { + dpus: vec![], + non_dpu_macs: vec![primary_nic, other_nic], + ..ManagedHostConfig::default() + }; + + // Register the host declaring `primary_nic` as its boot interface. + let mut txn = env.pool.begin().await?; + db::expected_machine::create( + &mut txn, + ExpectedMachine { + id: None, + bmc_mac_address: mock_host.bmc_mac_address, + data: ExpectedMachineData { + serial_number: mock_host.serial.clone(), + dpu_mode: DpuMode::NoDpu, + host_nics: vec![ + ExpectedHostNic { + mac_address: primary_nic, + primary: Some(true), + ..Default::default() + }, + ExpectedHostNic { + mac_address: other_nic, + primary: None, + ..Default::default() + }, + ], + ..Default::default() + }, + }, + ) + .await?; + txn.commit().await?; + + // Ingest before either NIC leases, so both get predictions carrying the + // declared intent. + let host_bmc_response = env + .api() + .discover_dhcp( + rpc::forge::DhcpDiscovery::builder( + mock_host.bmc_mac_address, + env.underlay_segment.relay_address, + ) + .vendor_string("SomeVendor") + .tonic_request(), + ) + .await? + .into_inner(); + let host_bmc_ip = host_bmc_response.address.parse()?; + env.site_explorer.insert_endpoints( + mock_host + .exploration_results(Some(host_bmc_ip), &[])? + .into_endpoints(), + ); + env.site_explorer.run_single_iteration().await?; + let mut txn = env.pool.begin().await?; + db::explored_endpoints::set_preingestion_complete(host_bmc_ip, &mut txn).await?; + txn.commit().await?; + env.site_explorer.run_single_iteration().await?; + + let mut txn = env.pool.begin().await?; + let predicted_primary = + db::predicted_machine_interface::find_by_mac_address(&mut txn, primary_nic) + .await? 
+ .expect("the declared NIC should have a prediction"); + assert!( + predicted_primary.primary_interface, + "the declared NIC's prediction should carry the primary intent" + ); + let predicted_other = db::predicted_machine_interface::find_by_mac_address(&mut txn, other_nic) + .await? + .expect("the non-declared NIC should have a prediction"); + assert!( + !predicted_other.primary_interface, + "the non-declared NIC's prediction should be non-primary" + ); + txn.rollback().await?; + + // Promote with the non-declared NIC leasing first. + for nic in [other_nic, primary_nic] { + env.api() + .discover_dhcp( + rpc::forge::DhcpDiscovery::builder(nic, env.host_inband_segment.relay_address) + .vendor_string("Bluefield") + .tonic_request(), + ) + .await?; + } + + let mut txn = env.pool.begin().await?; + let primary_row = db::machine_interface::find_by_mac_address(txn.as_mut(), primary_nic) + .await? + .into_iter() + .next() + .expect("the declared NIC should be promoted to a row"); + let other_row = db::machine_interface::find_by_mac_address(txn.as_mut(), other_nic) + .await? + .into_iter() + .next() + .expect("the non-declared NIC should be promoted to a row"); + txn.rollback().await?; + assert!( + primary_row.primary_interface, + "the declared NIC should promote as the primary interface" + ); + assert!( + !other_row.primary_interface, + "the non-declared NIC should promote as non-primary" + ); + assert_eq!( + primary_row.machine_id, other_row.machine_id, + "both NICs should belong to the same host" + ); + + Ok(()) +}