From f9bb7d1e246243572c13726a3995aa06e8799d42 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 29 Jan 2026 11:39:20 -0800 Subject: [PATCH 1/4] thermal, transceivers: rudimentary pass at powering down individual xcvrs See #2355 --- drv/transceivers-server/src/main.rs | 35 +++++++++++++++++++++++++++- task/thermal-api/src/lib.rs | 18 ++++++++++++-- task/thermal/src/bsp/cosmo_ab.rs | 8 +++---- task/thermal/src/bsp/gimlet_bcdef.rs | 10 ++++---- task/thermal/src/bsp/grapefruit.rs | 2 +- task/thermal/src/bsp/sidecar_bcd.rs | 4 ++-- 6 files changed, 62 insertions(+), 15 deletions(-) diff --git a/drv/transceivers-server/src/main.rs b/drv/transceivers-server/src/main.rs index 1a299be093..7050a07a4d 100644 --- a/drv/transceivers-server/src/main.rs +++ b/drv/transceivers-server/src/main.rs @@ -79,6 +79,8 @@ enum Trace { DisableFailed(usize, LogicalPortMask), ClearDisabledPorts(LogicalPortMask), SeqError(SeqError), + ModuleTemperatureCritical(u8, Celcius), + ModuleTemperaturePowerDown(u8, Celcius), } counted_ringbuf!(Trace, 16, Trace::None); @@ -335,7 +337,7 @@ impl ServerImpl { model: ThermalProperties { target_temperature: Celsius(65.0), critical_temperature: Celsius(70.0), - power_down_temperature: Celsius(80.0), + power_down_temperature: Some(Celsius(80.0)), temperature_slew_deg_per_sec: 0.5, }, }) @@ -429,6 +431,17 @@ impl ServerImpl { // will return a `NotInAutoMode` error if the thermal loop is in // manual mode; this is harmless and will be ignored (instead of // cluttering up the logs). + + let model = ThermalProperties { + // We do *not* want the thermal loop to power down the whole + // system in response to a transceiver overheating. Instead, + // we will just disable the individual transceiver here. + // Thus, remove the power_down_temeprature from the version + // of the thermal properties that we will give to the + // `thermal` task. 
+ power_down_temperature: None, + ..m.model + }; match self.thermal_api.update_dynamic_input(i, m.model) { Ok(()) | Err(ThermalError::NotInAutoMode) => (), Err(e) => ringbuf_entry!(Trace::ThermalError(i, e)), @@ -453,6 +466,26 @@ impl ServerImpl { // We got a temperature! Send it over to the thermal task self.sensor_api .post_now(TRANSCEIVER_TEMPERATURE_SENSORS[i], t.0); + + if m.model.should_power_down(t) { + // If the module's temperature exceeds the power-down + // threshold, add it to the list of things to disable. + ringbuf_entry!(Trace::ModuleTemperaturePowerDown( + port.0, t + )); + to_disable.set(port); + // TODO(eliza): ereport + } else if m.model.is_critical(t) { + ringbuf_entry!(Trace::ModuleTemperatureCritical( + port.0, t + )); + // TODO(eliza): ereport + // TODO(eliza): should we have a timeout for shutting + // down the module if it's been over critical for too + // long? + } + // TODO(eliza): see if it's nominal again and turn it back + // on...? } // We failed to read a temperature :( // diff --git a/task/thermal-api/src/lib.rs b/task/thermal-api/src/lib.rs index 9c5cc75ef4..5e62b76379 100644 --- a/task/thermal-api/src/lib.rs +++ b/task/thermal-api/src/lib.rs @@ -93,7 +93,10 @@ pub struct ThermalProperties { /// Temperature at which we drop into the A2 power state. This should be /// below the part's nonrecoverable temperature. - pub power_down_temperature: Celsius, + /// + /// If this is `None`, the system will not be sent to A2 due to this part's + /// temperature. 
+ pub power_down_temperature: Option, /// Maximum slew rate of temperature, measured in °C per second /// @@ -108,7 +111,11 @@ pub struct ThermalProperties { impl ThermalProperties { /// Returns whether this part is exceeding its power-down temperature pub fn should_power_down(&self, t: Celsius) -> bool { - t.0 >= self.power_down_temperature.0 + if let Some(power_down_temperature) = self.power_down_temperature { + t.0 >= power_down_temperature.0 + } else { + false + } } /// Returns whether this part is exceeding its critical temperature @@ -229,4 +236,11 @@ impl From for SensorReadError { } } +#[derive(Clone, Copy, IntoBytes, TryFromBytes, Immutable, KnownLayout)] +#[repr(u8)] +pub enum PowerDownMode { + System = 1, + Transceiver, +} + include!(concat!(env!("OUT_DIR"), "/client_stub.rs")); diff --git a/task/thermal/src/bsp/cosmo_ab.rs b/task/thermal/src/bsp/cosmo_ab.rs index 608c7c8977..3bd57e02d1 100644 --- a/task/thermal/src/bsp/cosmo_ab.rs +++ b/task/thermal/src/bsp/cosmo_ab.rs @@ -165,7 +165,7 @@ impl Bsp { const U2_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(65f32), critical_temperature: Celsius(70f32), - power_down_temperature: Celsius(75f32), + power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, }; @@ -175,7 +175,7 @@ const U2_THERMALS: ThermalProperties = ThermalProperties { const M2_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(65f32), critical_temperature: Celsius(70f32), - power_down_temperature: Celsius(75f32), + power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, }; @@ -186,7 +186,7 @@ const M2_THERMALS: ThermalProperties = ThermalProperties { const CPU_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(80f32), critical_temperature: Celsius(90f32), - power_down_temperature: Celsius(100f32), + power_down_temperature: Some(Celsius(100f32)), temperature_slew_deg_per_sec: 0.5, }; @@ -194,7 +194,7 @@ const 
CPU_THERMALS: ThermalProperties = ThermalProperties { const T6_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(70f32), critical_temperature: Celsius(80f32), - power_down_temperature: Celsius(85f32), + power_down_temperature: Some(Celsius(85f32)), temperature_slew_deg_per_sec: 0.5, }; diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index d81e0c1e44..b77840eba8 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -208,7 +208,7 @@ impl Bsp { const DIMM_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(80f32), critical_temperature: Celsius(90f32), - power_down_temperature: Celsius(95f32), + power_down_temperature: Some(Celsius(95f32)), temperature_slew_deg_per_sec: 0.5, }; @@ -223,7 +223,7 @@ const DIMM_THERMALS: ThermalProperties = ThermalProperties { const U2_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(65f32), critical_temperature: Celsius(70f32), - power_down_temperature: Celsius(75f32), + power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, }; @@ -233,7 +233,7 @@ const U2_THERMALS: ThermalProperties = ThermalProperties { const M2_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(65f32), critical_temperature: Celsius(70f32), - power_down_temperature: Celsius(75f32), + power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, }; @@ -244,7 +244,7 @@ const M2_THERMALS: ThermalProperties = ThermalProperties { const CPU_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(80f32), critical_temperature: Celsius(90f32), - power_down_temperature: Celsius(100f32), + power_down_temperature: Some(Celsius(100f32)), temperature_slew_deg_per_sec: 0.5, }; @@ -252,7 +252,7 @@ const CPU_THERMALS: ThermalProperties = ThermalProperties { const T6_THERMALS: ThermalProperties = ThermalProperties { 
target_temperature: Celsius(70f32), critical_temperature: Celsius(80f32), - power_down_temperature: Celsius(85f32), + power_down_temperature: Some(Celsius(85f32)), temperature_slew_deg_per_sec: 0.5, }; diff --git a/task/thermal/src/bsp/grapefruit.rs b/task/thermal/src/bsp/grapefruit.rs index bfdf13ec54..7afa715023 100644 --- a/task/thermal/src/bsp/grapefruit.rs +++ b/task/thermal/src/bsp/grapefruit.rs @@ -128,7 +128,7 @@ impl Bsp { const LM75_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(60f32), critical_temperature: Celsius(70f32), - power_down_temperature: Celsius(80f32), + power_down_temperature: Some(Celsius(80f32)), temperature_slew_deg_per_sec: 0.5, }; diff --git a/task/thermal/src/bsp/sidecar_bcd.rs b/task/thermal/src/bsp/sidecar_bcd.rs index 68c4b8688c..c297a32c00 100644 --- a/task/thermal/src/bsp/sidecar_bcd.rs +++ b/task/thermal/src/bsp/sidecar_bcd.rs @@ -212,7 +212,7 @@ impl Bsp { const TF2_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(60f32), critical_temperature: Celsius(70f32), - power_down_temperature: Celsius(80f32), + power_down_temperature: Some(Celsius(80f32)), temperature_slew_deg_per_sec: 0.5, }; @@ -221,7 +221,7 @@ const TF2_THERMALS: ThermalProperties = ThermalProperties { const VSC7448_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(85f32), critical_temperature: Celsius(95f32), - power_down_temperature: Celsius(105f32), + power_down_temperature: Some(Celsius(105f32)), temperature_slew_deg_per_sec: 0.5, }; From b82bf6180335a1ff53fc9d7667fa3c3e36ab21e2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 29 Jan 2026 11:52:11 -0800 Subject: [PATCH 2/4] transceivers: wip actually track states --- drv/transceivers-server/src/main.rs | 41 +++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/drv/transceivers-server/src/main.rs b/drv/transceivers-server/src/main.rs index 7050a07a4d..a80cef24ba 100644 --- 
a/drv/transceivers-server/src/main.rs +++ b/drv/transceivers-server/src/main.rs @@ -151,6 +151,16 @@ struct ThermalModel { /// What are its thermal properties, e.g. critical temperature? #[allow(dead_code)] model: ThermalProperties, + + /// What is the current thermal state? + state: ThermalState, +} + +#[derive(Copy, Clone, Eq, PartialEq)] +enum ThermalState { + Nominal, + Critical { at: u64 }, + PowerDown, } /// Controls how often we poll the transceivers (in milliseconds). @@ -340,6 +350,7 @@ impl ServerImpl { power_down_temperature: Some(Celsius(80.0)), temperature_slew_deg_per_sec: 0.5, }, + state: ThermalState::Nominal, }) } ManagementInterface::Unknown(..) => { @@ -470,19 +481,27 @@ impl ServerImpl { if m.model.should_power_down(t) { // If the module's temperature exceeds the power-down // threshold, add it to the list of things to disable. - ringbuf_entry!(Trace::ModuleTemperaturePowerDown( - port.0, t - )); + if m.state != ThermalState::PowerDown { + ringbuf_entry!(Trace::ModuleTemperaturePowerDown( + port.0, t + )); + m.state = ThermalState::PowerDown; + // TODO(eliza): ereport + } to_disable.set(port); - // TODO(eliza): ereport } else if m.model.is_critical(t) { - ringbuf_entry!(Trace::ModuleTemperatureCritical( - port.0, t - )); - // TODO(eliza): ereport - // TODO(eliza): should we have a timeout for shutting - // down the module if it's been over critical for too - // long? + if let ThermalState::Critical { at } = m.state { + // TODO(eliza): this is where we could we have a + // timeout for shutting down the module if it's been + // over critical for too long? + } else { + let at = userlib::sys_get_timer().now; + m.state = ThermalState::Critical { at }; + ringbuf_entry!(Trace::ModuleTemperatureCritical( + port.0, t + )); + // TODO(eliza): ereport + } } // TODO(eliza): see if it's nominal again and turn it back // on...? 
From 7a8d4fef1958cd443e35e2a3e14ba7a2c981405d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 2 Feb 2026 10:48:01 -0800 Subject: [PATCH 3/4] blarghhh --- drv/transceivers-server/src/main.rs | 58 +++++++++------------------- task/thermal-api/src/lib.rs | 23 ++++++----- task/thermal/src/bsp/cosmo_ab.rs | 4 ++ task/thermal/src/bsp/gimlet_bcdef.rs | 5 +++ task/thermal/src/bsp/grapefruit.rs | 1 + task/thermal/src/bsp/sidecar_bcd.rs | 6 ++- 6 files changed, 44 insertions(+), 53 deletions(-) diff --git a/drv/transceivers-server/src/main.rs b/drv/transceivers-server/src/main.rs index a80cef24ba..be02fa7471 100644 --- a/drv/transceivers-server/src/main.rs +++ b/drv/transceivers-server/src/main.rs @@ -49,7 +49,7 @@ task_slot!(THERMAL, thermal); include!(concat!(env!("OUT_DIR"), "/i2c_config.rs")); #[allow(dead_code)] -#[derive(Copy, Clone, PartialEq, Eq, Count)] +#[derive(Copy, Clone, PartialEq, Count)] enum Trace { #[count(skip)] None, @@ -79,8 +79,8 @@ enum Trace { DisableFailed(usize, LogicalPortMask), ClearDisabledPorts(LogicalPortMask), SeqError(SeqError), - ModuleTemperatureCritical(u8, Celcius), - ModuleTemperaturePowerDown(u8, Celcius), + ModuleTemperatureCritical(u8, Celsius), + ModuleTemperaturePowerDown(u8, Celsius), } counted_ringbuf!(Trace, 16, Trace::None); @@ -151,16 +151,6 @@ struct ThermalModel { /// What are its thermal properties, e.g. critical temperature? #[allow(dead_code)] model: ThermalProperties, - - /// What is the current thermal state? - state: ThermalState, -} - -#[derive(Copy, Clone, Eq, PartialEq)] -enum ThermalState { - Nominal, - Critical { at: u64 }, - PowerDown, } /// Controls how often we poll the transceivers (in milliseconds). 
@@ -347,10 +337,10 @@ impl ServerImpl { model: ThermalProperties { target_temperature: Celsius(65.0), critical_temperature: Celsius(70.0), - power_down_temperature: Some(Celsius(80.0)), + power_down_temperature: Celsius(80.0), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }, - state: ThermalState::Nominal, }) } ManagementInterface::Unknown(..) => { @@ -447,13 +437,12 @@ impl ServerImpl { // We do *not* want the thermal loop to power down the whole // system in response to a transceiver overheating. Instead, // we will just disable the individual transceiver here. - // Thus, remove the power_down_temeprature from the version - // of the thermal properties that we will give to the - // `thermal` task. - power_down_temperature: None, + // Thus, disable power-down on the version of the device's + // thermal properties we give to the `thermal` task. + power_down_enabled: false, ..m.model }; - match self.thermal_api.update_dynamic_input(i, m.model) { + match self.thermal_api.update_dynamic_input(i, model) { Ok(()) | Err(ThermalError::NotInAutoMode) => (), Err(e) => ringbuf_entry!(Trace::ThermalError(i, e)), } @@ -481,27 +470,18 @@ impl ServerImpl { if m.model.should_power_down(t) { // If the module's temperature exceeds the power-down // threshold, add it to the list of things to disable. - if m.state != ThermalState::PowerDown { - ringbuf_entry!(Trace::ModuleTemperaturePowerDown( - port.0, t - )); - m.state = ThermalState::PowerDown; - // TODO(eliza): ereport - } + ringbuf_entry!(Trace::ModuleTemperaturePowerDown( + port.0, t + )); + // TODO(eliza): ereport + // TODO(eliza): debounce to_disable.set(port); } else if m.model.is_critical(t) { - if let ThermalState::Critical { at } = m.state { - // TODO(eliza): this is where we could we have a - // timeout for shutting down the module if it's been - // over critical for too long? 
- } else { - let at = userlib::sys_get_timer().now; - m.state = ThermalState::Critical { at }; - ringbuf_entry!(Trace::ModuleTemperatureCritical( - port.0, t - )); - // TODO(eliza): ereport - } + ringbuf_entry!(Trace::ModuleTemperatureCritical( + port.0, t + )); + // TODO(eliza): ereport + // TODO(eliza): track over critical duration... } // TODO(eliza): see if it's nominal again and turn it back // on...? diff --git a/task/thermal-api/src/lib.rs b/task/thermal-api/src/lib.rs index 5e62b76379..af3db341c2 100644 --- a/task/thermal-api/src/lib.rs +++ b/task/thermal-api/src/lib.rs @@ -11,7 +11,7 @@ use drv_i2c_api::ResponseCode; use hubpack::SerializedSize; use serde::{Deserialize, Serialize}; use userlib::{units::Celsius, *}; -use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; +use zerocopy::{Immutable, IntoBytes, KnownLayout, TryFromBytes}; #[derive( Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, IdolError, counters::Count, @@ -81,8 +81,8 @@ pub enum ThermalAutoState { } /// Properties for a particular part in the system -#[derive(Clone, Copy, IntoBytes, FromBytes, Immutable, KnownLayout)] -#[repr(C)] +#[derive(Clone, Copy, IntoBytes, TryFromBytes, Immutable, KnownLayout)] +#[repr(packed)] pub struct ThermalProperties { /// Target temperature for this part pub target_temperature: Celsius, @@ -93,16 +93,19 @@ pub struct ThermalProperties { /// Temperature at which we drop into the A2 power state. This should be /// below the part's nonrecoverable temperature. - /// - /// If this is `None`, the system will not be sent to A2 due to this part's - /// temperature. - pub power_down_temperature: Option, + pub power_down_temperature: Celsius, /// Maximum slew rate of temperature, measured in °C per second /// /// The slew rate is used to model worst-case temperature if we haven't /// heard from a chip in a while (e.g. 
due to dropped samples) pub temperature_slew_deg_per_sec: f32, + + /// If `true`, this device should be considered whether deciding if the the + /// system should drop into the A2 state. If `false`, then + /// [`ThermalProperties::should_power_down`] will always return `false` for + /// this device, regardless of the device's actual temperature. + pub power_down_enabled: bool, } /// All of these functions take an **instantaneous** temperature; to convert a @@ -111,11 +114,7 @@ pub struct ThermalProperties { impl ThermalProperties { /// Returns whether this part is exceeding its power-down temperature pub fn should_power_down(&self, t: Celsius) -> bool { - if let Some(power_down_temperature) = self.power_down_temperature { - t.0 >= power_down_temperature.0 - } else { - false - } + self.power_down_enabled && t.0 >= self.power_down_temperature.0 } /// Returns whether this part is exceeding its critical temperature diff --git a/task/thermal/src/bsp/cosmo_ab.rs b/task/thermal/src/bsp/cosmo_ab.rs index 3bd57e02d1..f076daa39d 100644 --- a/task/thermal/src/bsp/cosmo_ab.rs +++ b/task/thermal/src/bsp/cosmo_ab.rs @@ -167,6 +167,7 @@ const U2_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; // The Micron-7300 (primary source) begins throttling at 72°, and its "critical @@ -177,6 +178,7 @@ const M2_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; // The CPU doesn't actually report true temperature; it reports a @@ -188,6 +190,7 @@ const CPU_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(90f32), power_down_temperature: Some(Celsius(100f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; // The T6's specifications aren't clearly 
detailed anywhere. @@ -196,6 +199,7 @@ const T6_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(80f32), power_down_temperature: Some(Celsius(85f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index b77840eba8..53796df926 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -210,6 +210,7 @@ const DIMM_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(90f32), power_down_temperature: Some(Celsius(95f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; // Thermal throttling begins at 78° for WD-SN840 (primary source) and @@ -225,6 +226,7 @@ const U2_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; // The Micron-7300 (primary source) begins throttling at 72°, and its "critical @@ -235,6 +237,7 @@ const M2_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; // The CPU doesn't actually report true temperature; it reports a @@ -246,6 +249,7 @@ const CPU_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(90f32), power_down_temperature: Some(Celsius(100f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; // The T6's specifications aren't clearly detailed anywhere. 
@@ -254,6 +258,7 @@ const T6_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(80f32), power_down_temperature: Some(Celsius(85f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ diff --git a/task/thermal/src/bsp/grapefruit.rs b/task/thermal/src/bsp/grapefruit.rs index 7afa715023..b11b953615 100644 --- a/task/thermal/src/bsp/grapefruit.rs +++ b/task/thermal/src/bsp/grapefruit.rs @@ -130,6 +130,7 @@ const LM75_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(80f32)), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [InputChannel::new( diff --git a/task/thermal/src/bsp/sidecar_bcd.rs b/task/thermal/src/bsp/sidecar_bcd.rs index c297a32c00..b03ad5b299 100644 --- a/task/thermal/src/bsp/sidecar_bcd.rs +++ b/task/thermal/src/bsp/sidecar_bcd.rs @@ -212,8 +212,9 @@ impl Bsp { const TF2_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(60f32), critical_temperature: Celsius(70f32), - power_down_temperature: Some(Celsius(80f32)), + power_down_temperature: Celsius(80f32), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; // The VSC7448 has a maximum die temperature of 110°C, which is very @@ -221,8 +222,9 @@ const TF2_THERMALS: ThermalProperties = ThermalProperties { const VSC7448_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(85f32), critical_temperature: Celsius(95f32), - power_down_temperature: Some(Celsius(105f32)), + power_down_temperature: Celsius(105f32), temperature_slew_deg_per_sec: 0.5, + power_down_enabled: true, }; const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ From 4711417f9aa3756d1fb476e37e9158578a93afd1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 2 Feb 2026 11:30:39 -0800 Subject: [PATCH 4/4] blargh okay --- 
Cargo.lock | 1 + drv/transceivers-server/src/main.rs | 10 +++++----- idl/thermal.idol | 3 ++- sys/userlib/Cargo.toml | 1 + sys/userlib/src/units.rs | 5 +++++ task/thermal-api/src/lib.rs | 21 +++++++++++---------- task/thermal/src/bsp/cosmo_ab.rs | 4 ---- task/thermal/src/bsp/gimlet_bcdef.rs | 5 ----- task/thermal/src/bsp/grapefruit.rs | 1 - task/thermal/src/bsp/sidecar_bcd.rs | 6 ++---- task/thermal/src/main.rs | 4 ++-- 11 files changed, 29 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab074128a5..aafbe41178 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6891,6 +6891,7 @@ dependencies = [ "cfg-if", "cortex-m", "critical-section", + "hubpack", "num-derive 0.4.2", "num-traits", "paste", diff --git a/drv/transceivers-server/src/main.rs b/drv/transceivers-server/src/main.rs index be02fa7471..637c756457 100644 --- a/drv/transceivers-server/src/main.rs +++ b/drv/transceivers-server/src/main.rs @@ -337,9 +337,8 @@ impl ServerImpl { model: ThermalProperties { target_temperature: Celsius(65.0), critical_temperature: Celsius(70.0), - power_down_temperature: Celsius(80.0), + power_down_temperature: Some(Celsius(80.0)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }, }) } @@ -438,11 +437,12 @@ impl ServerImpl { // system in response to a transceiver overheating. Instead, // we will just disable the individual transceiver here. // Thus, disable power-down on the version of the device's - // thermal properties we give to the `thermal` task. - power_down_enabled: false, + // thermal properties by setting the + // `power_down_temperature` to `None`. 
+ power_down_temperature: None, ..m.model }; - match self.thermal_api.update_dynamic_input(i, model) { + match self.thermal_api.update_dynamic_input(i as u32, model) { Ok(()) | Err(ThermalError::NotInAutoMode) => (), Err(e) => ringbuf_entry!(Trace::ThermalError(i, e)), } diff --git a/idl/thermal.idol b/idl/thermal.idol index 5505b6bcf2..dab15236e6 100644 --- a/idl/thermal.idol +++ b/idl/thermal.idol @@ -80,13 +80,14 @@ Interface( "update_dynamic_input": ( doc: "Provides a thermal model for a dynamic sensor", args: { - "index": "usize", + "index": "u32", "model": "ThermalProperties", }, reply: Result( ok: "()", err: CLike("ThermalError"), ), + encoding: Hubpack ), "remove_dynamic_input": ( doc: "Removes the given dynamic input, so it is no longer used in the control loop", diff --git a/sys/userlib/Cargo.toml b/sys/userlib/Cargo.toml index f6d6216453..1a0f2f69c4 100644 --- a/sys/userlib/Cargo.toml +++ b/sys/userlib/Cargo.toml @@ -13,6 +13,7 @@ critical-section = ["dep:critical-section"] bstringify = { workspace = true } cfg-if = { workspace = true } critical-section = {workspace = true, optional = true, features = ["restore-state-none"]} +hubpack = { workspace = true } num-derive = { workspace = true } num-traits = { workspace = true } paste = { workspace = true } diff --git a/sys/userlib/src/units.rs b/sys/userlib/src/units.rs index 005250595c..f92ac2ddea 100644 --- a/sys/userlib/src/units.rs +++ b/sys/userlib/src/units.rs @@ -6,6 +6,8 @@ //! Tuple structs for units that are useful in the real world //! 
+use hubpack::SerializedSize; +use serde::{Deserialize, Serialize}; use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; /// Degrees Celsius @@ -19,6 +21,9 @@ use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; IntoBytes, Immutable, KnownLayout, + Serialize, + Deserialize, + SerializedSize, )] #[repr(C)] pub struct Celsius(pub f32); diff --git a/task/thermal-api/src/lib.rs b/task/thermal-api/src/lib.rs index af3db341c2..806aa6aae9 100644 --- a/task/thermal-api/src/lib.rs +++ b/task/thermal-api/src/lib.rs @@ -81,8 +81,8 @@ pub enum ThermalAutoState { } /// Properties for a particular part in the system -#[derive(Clone, Copy, IntoBytes, TryFromBytes, Immutable, KnownLayout)] -#[repr(packed)] +#[derive(Clone, Copy, Serialize, Deserialize, SerializedSize)] +#[repr(C)] pub struct ThermalProperties { /// Target temperature for this part pub target_temperature: Celsius, @@ -93,19 +93,16 @@ pub struct ThermalProperties { /// Temperature at which we drop into the A2 power state. This should be /// below the part's nonrecoverable temperature. - pub power_down_temperature: Celsius, + /// + /// If this is `None`, the system will not be sent to A2 due to this part's + /// temperature. + pub power_down_temperature: Option<Celsius>, /// Maximum slew rate of temperature, measured in °C per second /// /// The slew rate is used to model worst-case temperature if we haven't /// heard from a chip in a while (e.g. due to dropped samples) pub temperature_slew_deg_per_sec: f32, - - /// If `true`, this device should be considered whether deciding if the the - /// system should drop into the A2 state. If `false`, then - /// [`ThermalProperties::should_power_down`] will always return `false` for - /// this device, regardless of the device's actual temperature. 
- pub power_down_enabled: bool, } /// All of these functions take an **instantaneous** temperature; to convert a @@ -114,7 +111,11 @@ pub struct ThermalProperties { impl ThermalProperties { /// Returns whether this part is exceeding its power-down temperature pub fn should_power_down(&self, t: Celsius) -> bool { - self.power_down_enabled && t.0 >= self.power_down_temperature.0 + if let Some(power_down_temperature) = self.power_down_temperature { + t.0 >= power_down_temperature.0 + } else { + false + } } /// Returns whether this part is exceeding its critical temperature diff --git a/task/thermal/src/bsp/cosmo_ab.rs b/task/thermal/src/bsp/cosmo_ab.rs index f076daa39d..3bd57e02d1 100644 --- a/task/thermal/src/bsp/cosmo_ab.rs +++ b/task/thermal/src/bsp/cosmo_ab.rs @@ -167,7 +167,6 @@ const U2_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; // The Micron-7300 (primary source) begins throttling at 72°, and its "critical @@ -178,7 +177,6 @@ const M2_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; // The CPU doesn't actually report true temperature; it reports a @@ -190,7 +188,6 @@ const CPU_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(90f32), power_down_temperature: Some(Celsius(100f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; // The T6's specifications aren't clearly detailed anywhere. 
@@ -199,7 +196,6 @@ const T6_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(80f32), power_down_temperature: Some(Celsius(85f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index 53796df926..b77840eba8 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -210,7 +210,6 @@ const DIMM_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(90f32), power_down_temperature: Some(Celsius(95f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; // Thermal throttling begins at 78° for WD-SN840 (primary source) and @@ -226,7 +225,6 @@ const U2_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; // The Micron-7300 (primary source) begins throttling at 72°, and its "critical @@ -237,7 +235,6 @@ const M2_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(75f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; // The CPU doesn't actually report true temperature; it reports a @@ -249,7 +246,6 @@ const CPU_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(90f32), power_down_temperature: Some(Celsius(100f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; // The T6's specifications aren't clearly detailed anywhere. 
@@ -258,7 +254,6 @@ const T6_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(80f32), power_down_temperature: Some(Celsius(85f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ diff --git a/task/thermal/src/bsp/grapefruit.rs b/task/thermal/src/bsp/grapefruit.rs index b11b953615..7afa715023 100644 --- a/task/thermal/src/bsp/grapefruit.rs +++ b/task/thermal/src/bsp/grapefruit.rs @@ -130,7 +130,6 @@ const LM75_THERMALS: ThermalProperties = ThermalProperties { critical_temperature: Celsius(70f32), power_down_temperature: Some(Celsius(80f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [InputChannel::new( diff --git a/task/thermal/src/bsp/sidecar_bcd.rs b/task/thermal/src/bsp/sidecar_bcd.rs index b03ad5b299..c297a32c00 100644 --- a/task/thermal/src/bsp/sidecar_bcd.rs +++ b/task/thermal/src/bsp/sidecar_bcd.rs @@ -212,9 +212,8 @@ impl Bsp { const TF2_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(60f32), critical_temperature: Celsius(70f32), - power_down_temperature: Celsius(80f32), + power_down_temperature: Some(Celsius(80f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; // The VSC7448 has a maximum die temperature of 110°C, which is very @@ -222,9 +221,8 @@ const TF2_THERMALS: ThermalProperties = ThermalProperties { const VSC7448_THERMALS: ThermalProperties = ThermalProperties { target_temperature: Celsius(85f32), critical_temperature: Celsius(95f32), - power_down_temperature: Celsius(105f32), + power_down_temperature: Some(Celsius(105f32)), temperature_slew_deg_per_sec: 0.5, - power_down_enabled: true, }; const INPUTS: [InputChannel; NUM_TEMPERATURE_INPUTS] = [ diff --git a/task/thermal/src/main.rs b/task/thermal/src/main.rs index d92dd63805..56b07ed7c7 100644 --- a/task/thermal/src/main.rs +++ b/task/thermal/src/main.rs @@ -269,14 
+269,14 @@ impl<'a> idl::InOrderThermalImpl for ServerImpl<'a> { fn update_dynamic_input( &mut self, _: &RecvMessage, - index: usize, + index: u32, model: ThermalProperties, ) -> Result<(), RequestError<ThermalError>> { if self.mode != ThermalMode::Auto { return Err(ThermalError::NotInAutoMode.into()); } self.control - .update_dynamic_input(index, model) + .update_dynamic_input(index as usize, model) .map_err(RequestError::from) }