From cc63c6db18f17cfe268f4c99450f79331bfb6799 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Feb 2026 11:16:45 -0800 Subject: [PATCH 1/5] thermal: only poll T6 temperatures in A0+HP Presently, the thermal loop on Gimlet and Cosmo will attempt to read the T6 temperature sensors whilst in the A0 *or* A0+HP power states. This is wrong, as the NIC is only known to be powered in the A0+HP state (since its sequencing is controlled by the host OS). This means that if the T6 MAPOs but the host processor is still up, the thermal loop will continue trying to read temperatures from the T6 whilst it is not powered. Because it's not powered, we will just read I2C NACKs from it. This causes the logic in the thermal loop that attempts to extrapolate a worst-case temperature increase for devices which have transient sensor read errors to imagine a T6 temperature which continues to increase, eventually leading to a thermal shutdown. This is described in #2384. This commit adds a bit in the Gimlet and Cosmo thermal tasks' `PowerBitmask`s to differentiate between the A0 and A0+HP power states, and only consider the T6 temperature if the A0+HP bit is set. This will fix #2384...alongside #2390, which is necessary to fix Cosmo ignoring the sequencer IRQ that would tell it to transition from A0+HP back to A0 if the NIC MAPOs. --- task/thermal/src/bsp/cosmo_ab.rs | 10 ++++++---- task/thermal/src/bsp/gimlet_bcdef.rs | 21 +++++++++++++++++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/task/thermal/src/bsp/cosmo_ab.rs b/task/thermal/src/bsp/cosmo_ab.rs index 608c7c8977..821dc496ac 100644 --- a/task/thermal/src/bsp/cosmo_ab.rs +++ b/task/thermal/src/bsp/cosmo_ab.rs @@ -66,6 +66,9 @@ bitflags::bitflags! { // in A2; you probably want to use `A0_OR_A2` instead. const A2 = 0b00000001; const A0 = 0b00000010; + // A0+HP: T6 is also active. + const HP = 0b00000100; + const A0_PLUS_HP = Self::A0.bits() | Self::HP.bits(); const A0_OR_A2 = Self::A0.bits() | Self::A2.bits(); } } @@ -97,9 +100,8 @@ impl Bsp { pub fn power_mode(&self) -> PowerBitmask { match self.seq.get_state() { - PowerState::A0PlusHP | PowerState::A0 | PowerState::A0Reset => { - PowerBitmask::A0 - } + PowerState::A0PlusHP => PowerBitmask::A0_PLUS_HP, + PowerState::A0 | PowerState::A0Reset => PowerBitmask::A0, PowerState::A2 | PowerState::A2PlusFans | PowerState::A0Thermtrip => PowerBitmask::A2, @@ -236,7 +238,7 @@ const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ sensors::TMP451_T6_TEMPERATURE_SENSOR, ), T6_THERMALS, - PowerBitmask::A0, + PowerBitmask::HP, ChannelType::MustBePresent, ), // U.2 drives diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index d81e0c1e44..4b382f63fa 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -82,12 +82,16 @@ bitflags::bitflags! { const A0 = 0b00000010; const A0_OR_A2 = Self::A0.bits() | Self::A2.bits(); + // A0+HP: T6 is also active. + const HP = 0b00000100; + // Bonus bits for M.2 power, which is switched separately. We *cannot* // read the M.2 drives when they are unpowered; otherwise, we risk // locking up the I2C bus (see hardware-gimlet#1804 for the gory // details) - const M2A = 0b00000100; - const M2B = 0b00001000; + const M2A = 0b00001000; + const M2B = 0b00010000; + } } @@ -118,7 +122,9 @@ impl Bsp { pub fn power_mode(&self) -> PowerBitmask { match self.seq.get_state() { - PowerState::A0PlusHP | PowerState::A0 | PowerState::A0Reset => { + state @ PowerState::A0PlusHP + | PowerState::A0 + | PowerState::A0Reset => { use drv_i2c_devices::max5970; use userlib::units::Ohms; @@ -145,6 +151,13 @@ impl Bsp { // TODO: error handling here? Err(_e) => (), } + + // If we are in A0+HP, also care about T6 temperatures. + // Otherwise, it may not be powered. + if state == PowerState::A0PlusHP { + out |= PowerBitmask::HP; + } + out } PowerState::A2 @@ -321,7 +334,7 @@ const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ sensors::TMP451_T6_TEMPERATURE_SENSOR, ), T6_THERMALS, - PowerBitmask::A0, + PowerBitmask::HP, ChannelType::MustBePresent, ), InputChannel::new( From e082bc439407e68f239f246544be94d64eacf6aa Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Feb 2026 11:41:32 -0800 Subject: [PATCH 2/5] back out Gimlet change @rmustacc informs me that this does indeed only apply on Cosmo, as: > Yeah, Gimlet has the TMP451 on V3P3_NIC_A0HP. > This was folded into the A0 domain by the FPGA logic. So, we can ignore this on Gimlet, which simplifies things there a bit. --- task/thermal/src/bsp/gimlet_bcdef.rs | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index 4b382f63fa..d81e0c1e44 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -82,16 +82,12 @@ bitflags::bitflags! { const A0 = 0b00000010; const A0_OR_A2 = Self::A0.bits() | Self::A2.bits(); - // A0+HP: T6 is also active. - const HP = 0b00000100; - // Bonus bits for M.2 power, which is switched separately. We *cannot* // read the M.2 drives when they are unpowered; otherwise, we risk // locking up the I2C bus (see hardware-gimlet#1804 for the gory // details) - const M2A = 0b00001000; - const M2B = 0b00010000; - + const M2A = 0b00000100; + const M2B = 0b00001000; } } @@ -122,9 +118,7 @@ impl Bsp { pub fn power_mode(&self) -> PowerBitmask { match self.seq.get_state() { - state @ PowerState::A0PlusHP - | PowerState::A0 - | PowerState::A0Reset => { + PowerState::A0PlusHP | PowerState::A0 | PowerState::A0Reset => { use drv_i2c_devices::max5970; use userlib::units::Ohms; @@ -151,13 +145,6 @@ impl Bsp { // TODO: error handling here? Err(_e) => (), } - - // If we are in A0+HP, also care about T6 temperatures. - // Otherwise, it may not be powered. - if state == PowerState::A0PlusHP { - out |= PowerBitmask::HP; - } - out } PowerState::A2 @@ -334,7 +321,7 @@ const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ sensors::TMP451_T6_TEMPERATURE_SENSOR, ), T6_THERMALS, - PowerBitmask::HP, + PowerBitmask::A0, ChannelType::MustBePresent, ), InputChannel::new( From 0edcd77225fd25d82edde71cbdafef947aca965a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Feb 2026 12:48:35 -0800 Subject: [PATCH 3/5] clearer cosmo bitflag definitions --- task/thermal/src/bsp/cosmo_ab.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/task/thermal/src/bsp/cosmo_ab.rs b/task/thermal/src/bsp/cosmo_ab.rs index 821dc496ac..bc66502143 100644 --- a/task/thermal/src/bsp/cosmo_ab.rs +++ b/task/thermal/src/bsp/cosmo_ab.rs @@ -66,10 +66,8 @@ bitflags::bitflags! { // in A2; you probably want to use `A0_OR_A2` instead. const A2 = 0b00000001; const A0 = 0b00000010; - // A0+HP: T6 is also active. - const HP = 0b00000100; - const A0_PLUS_HP = Self::A0.bits() | Self::HP.bits(); - const A0_OR_A2 = Self::A0.bits() | Self::A2.bits(); + // A0+HP: T6 is also active, in addition to all A0 devices. + const A0_PLUS_HP = Self::A0.bits() | 0b00000100; } } @@ -238,7 +236,7 @@ const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ sensors::TMP451_T6_TEMPERATURE_SENSOR, ), T6_THERMALS, - PowerBitmask::HP, + PowerBitmask::AO_PLUS_HP, ChannelType::MustBePresent, ), // U.2 drives From f2679d377865609faef5c0da3ce4f709a238434c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Feb 2026 12:49:35 -0800 Subject: [PATCH 4/5] add a note to gimelt --- task/thermal/src/bsp/gimlet_bcdef.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index d81e0c1e44..3b67275aa7 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -81,6 +81,9 @@ bitflags::bitflags! { const A2 = 0b00000001; const A0 = 0b00000010; const A0_OR_A2 = Self::A0.bits() | Self::A2.bits(); + // Note that Gimlet does *not* need a separate flag for the A0+HP + // power domain (like Cosmo does), as the T6 is powered in the A0 + // domain on Gimlet. // Bonus bits for M.2 power, which is switched separately. We *cannot* // read the M.2 drives when they are unpowered; otherwise, we risk From 61ef7680e9a8c11b226481e54f1dde535d15f9be Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Feb 2026 12:57:37 -0800 Subject: [PATCH 5/5] oh, so im like *stupid* stupid --- task/thermal/src/bsp/cosmo_ab.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task/thermal/src/bsp/cosmo_ab.rs b/task/thermal/src/bsp/cosmo_ab.rs index bc66502143..7ec50590bf 100644 --- a/task/thermal/src/bsp/cosmo_ab.rs +++ b/task/thermal/src/bsp/cosmo_ab.rs @@ -236,7 +236,7 @@ const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ sensors::TMP451_T6_TEMPERATURE_SENSOR, ), T6_THERMALS, - PowerBitmask::AO_PLUS_HP, + PowerBitmask::A0_PLUS_HP, ChannelType::MustBePresent, ), // U.2 drives