Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 97 additions & 43 deletions task/thermal/src/control.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,24 @@ struct TimestampedTemperatureReading {
value: Celsius,
}

/// Represents a worst-case temperature reading from the thermal model,
/// including the estimated temperature and the time since the last actual
/// sensor reading (lag).
#[derive(Copy, Clone, PartialEq)]
pub(crate) struct WorstCaseTemperature {
/// The worst-case temperature estimate from the thermal model, projected
/// from the `last_reading`.
worst_case_temp: Celsius,
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/// The last actual temperature reading from the device.
///
/// Subtracting this value from `worst_case_temp` gives the portion of the
/// worst case temperature that was calculated based on the lag since the
/// last actual reading from the sensor.
last_reading: Celsius,
/// Approximately how old (in seconds) is the last real temperature?
age_s: f32,
}

impl TimestampedTemperatureReading {
/// Returns the worst-case temperature, given a current time and thermal
/// model for this part.
Expand All @@ -521,12 +539,20 @@ impl TimestampedTemperatureReading {
/// safe. If there's invalid data in the sensors task (i.e. readings
/// claiming to be from the future), then this will saturate instead of
/// underflowing.
fn worst_case(&self, now_ms: u64, model: &ThermalProperties) -> Celsius {
Celsius(
self.value.0
+ now_ms.saturating_sub(self.time_ms) as f32 / 1000.0
* model.temperature_slew_deg_per_sec,
)
fn worst_case(
&self,
now_ms: u64,
model: &ThermalProperties,
) -> WorstCaseTemperature {
// How long has it been since the last real life temperature reading?
let age_s = now_ms.saturating_sub(self.time_ms) as f32 / 1000.0;
let worst_case_temp =
Celsius(self.value.0 + age_s * model.temperature_slew_deg_per_sec);
WorstCaseTemperature {
worst_case_temp,
last_reading: self.value,
age_s,
}
}
}

Expand Down Expand Up @@ -1161,9 +1187,10 @@ impl<'a> ThermalControl<'a> {
) {
match v {
Some(TemperatureReading::Valid(v)) => {
let temperature = v.worst_case(now_ms, &model);
let worst_case = v.worst_case(now_ms, &model);
let temperature = worst_case.worst_case_temp;
if model.should_power_down(temperature) {
any_power_down = Some((sensor_id, temperature));
any_power_down = Some((sensor_id, worst_case));
}
worst_margin =
worst_margin.min(model.margin(temperature).0);
Expand All @@ -1177,12 +1204,8 @@ impl<'a> ThermalControl<'a> {
}
}

if let Some((sensor_id, temperature)) = any_power_down {
ringbuf_entry!(Trace::PowerDownDueTo {
sensor_id,
temperature
});
self.transition_to_uncontrollable(now_ms)
if let Some(due_to) = any_power_down {
self.transition_to_uncontrollable_due_to(due_to, now_ms)
} else if all_some {
let values = values.map(Option::unwrap);
self.transition_to_running(worst_margin, now_ms, values)
Expand All @@ -1207,25 +1230,22 @@ impl<'a> ThermalControl<'a> {
&self.dynamic_inputs,
) {
if let TemperatureReading::Valid(v) = v {
let temperature = v.worst_case(now_ms, &model);
let worst_case = v.worst_case(now_ms, &model);
let temperature = worst_case.worst_case_temp;
if model.should_power_down(temperature) {
any_power_down = Some((sensor_id, temperature));
any_power_down = Some((sensor_id, worst_case));
}
if model.is_critical(temperature) {
any_critical = Some((sensor_id, temperature));
any_critical = Some((sensor_id, worst_case));
}

worst_margin =
worst_margin.min(model.margin(temperature).0);
}
}

if let Some((sensor_id, temperature)) = any_power_down {
ringbuf_entry!(Trace::PowerDownDueTo {
sensor_id,
temperature
});
self.transition_to_uncontrollable(now_ms)
if let Some(due_to) = any_power_down {
self.transition_to_uncontrollable_due_to(due_to, now_ms)
} else if let Some(due_to) = any_critical {
let values = *values;
self.transition_to_critical(due_to, now_ms, values)
Expand Down Expand Up @@ -1261,23 +1281,20 @@ impl<'a> ThermalControl<'a> {
&self.dynamic_inputs,
) {
if let TemperatureReading::Valid(v) = v {
let temperature = v.worst_case(now_ms, &model);
let worst_case = v.worst_case(now_ms, &model);
let temperature = worst_case.worst_case_temp;
all_nominal &= model.is_nominal(temperature);
any_still_critical |= model.is_critical(temperature);
if model.should_power_down(temperature) {
any_power_down = Some((sensor_id, temperature));
any_power_down = Some((sensor_id, worst_case));
}
worst_margin =
worst_margin.min(model.margin(temperature).0);
}
}

if let Some((sensor_id, temperature)) = any_power_down {
ringbuf_entry!(Trace::PowerDownDueTo {
sensor_id,
temperature
});
self.transition_to_uncontrollable(now_ms)
if let Some(due_to) = any_power_down {
self.transition_to_uncontrollable_due_to(due_to, now_ms)
} else if all_nominal {
let values = *values;
self.transition_to_running(worst_margin, now_ms, values)
Expand Down Expand Up @@ -1310,25 +1327,22 @@ impl<'a> ThermalControl<'a> {
&self.dynamic_inputs,
) {
if let TemperatureReading::Valid(v) = v {
let temperature = v.worst_case(now_ms, &model);
let worst_case = v.worst_case(now_ms, &model);
let temperature = worst_case.worst_case_temp;
all_nominal &= model.is_nominal(temperature);
if model.should_power_down(temperature) {
any_power_down = Some((sensor_id, temperature));
any_power_down = Some((sensor_id, worst_case));
}
if model.is_critical(temperature) {
any_critical = Some((sensor_id, temperature));
any_critical = Some((sensor_id, worst_case));
}
worst_margin =
worst_margin.min(model.margin(temperature).0);
}
}

if let Some((sensor_id, temperature)) = any_power_down {
ringbuf_entry!(Trace::PowerDownDueTo {
sensor_id,
temperature
});
self.transition_to_uncontrollable(now_ms)
if let Some(due_to) = any_power_down {
self.transition_to_uncontrollable_due_to(due_to, now_ms)
} else if let Some(due_to) = any_critical {
// If anything's gone over critical, transition back to the
// `Critical` state.
Expand Down Expand Up @@ -1394,13 +1408,23 @@ impl<'a> ThermalControl<'a> {
/// component exceeding its critical threshold.
fn transition_to_critical(
&mut self,
(sensor_id, temperature): (SensorId, Celsius),
(sensor_id, worst_case): (SensorId, WorstCaseTemperature),
now_ms: u64,
values: [TemperatureReading; TEMPERATURE_ARRAY_SIZE],
) -> ControlResult {
ringbuf_entry!(Trace::CriticalDueTo {
let WorstCaseTemperature {
worst_case_temp,
last_reading,
age_s,
} = worst_case;
ringbuf_entry!(Trace::CriticalDueTo {
sensor_id,
worst_case_temp
});
ringbuf_entry!(Trace::LastRealTemperature {
sensor_id,
temperature
temperature: last_reading,
age_s,
});
self.state = ThermalControlState::Critical {
values,
Expand Down Expand Up @@ -1433,6 +1457,36 @@ impl<'a> ThermalControl<'a> {
ControlResult::Pwm(PWMDuty(self.pid_config.max_output as u8))
}

/// Transition to the `Uncontrollable` state due to a device exceeding its
/// power-down temperature threshold.
///
/// This is a wrapper around [`Self::transition_to_uncontrollable`] which
/// also records the sensor ID and temperature measurements for the device
/// that tripped over the threshold. We separate this into two functions as
/// we may also transition to uncontrollable due to an inability to read
/// sensors at all, or due to the power-down timeout.
///
/// `(sensor_id, worst_case)` identifies the offending sensor and carries
/// its worst-case projection; `now_ms` is forwarded unchanged to
/// [`Self::transition_to_uncontrollable`], whose result is returned.
fn transition_to_uncontrollable_due_to(
    &mut self,
    (sensor_id, worst_case): (SensorId, WorstCaseTemperature),
    now_ms: u64,
) -> ControlResult {
    let WorstCaseTemperature {
        worst_case_temp,
        last_reading,
        age_s,
    } = worst_case;
    // This is the power-down path, so it must be recorded as
    // `PowerDownDueTo`. `CriticalDueTo` is reserved for transitions into
    // the `Critical` state; logging it here would mislabel the cause of
    // the shutdown in the ringbuf.
    ringbuf_entry!(Trace::PowerDownDueTo {
        sensor_id,
        worst_case_temp
    });
    // Record the last real measurement and its age right after the
    // projection, so the two can be compared when reading the ringbuf.
    ringbuf_entry!(Trace::LastRealTemperature {
        sensor_id,
        temperature: last_reading,
        age_s,
    });
    self.transition_to_uncontrollable(now_ms)
}

/// Transition to the `Uncontrollable` state, either in response to the
/// overheat timeout, thermal sensor errors, or a component exceeding its
/// power-down temperature threshold.
Expand Down
40 changes: 39 additions & 1 deletion task/thermal/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,49 @@ enum Trace {
AutoState(#[count(children)] ThermalAutoState),
PowerDownDueTo {
sensor_id: SensorId,
temperature: units::Celsius,
/// The thermal model's worst-case temperature projection for this
/// sensor.
///
/// Note that this may not be an *actual temperature measurement*
/// from this sensor. Instead, it is projected from the last successful
/// temperature reading, the lag since that measurement was received,
/// and the thermal model's slew rate for the component.
///
/// This ringbuf entry is always followed by a [`LastRealTemperature`]
/// entry, which records the last actual temperature measurement
/// reported by the sensor.
worst_case_temp: units::Celsius,
},
CriticalDueTo {
sensor_id: SensorId,
/// The thermal model's worst-case temperature projection for this
/// sensor.
///
/// Note that this may not be an *actual temperature measurement*
/// from this sensor. Instead, it is projected from the last successful
/// temperature reading, the lag since that measurement was received,
/// and the thermal model's slew rate for the component.
///
/// This ringbuf entry is always followed by a [`LastRealTemperature`]
/// entry, which records the last actual temperature measurement
/// reported by the sensor.
worst_case_temp: units::Celsius,
},
/// The last actual temperature measurement reported by a sensor.
///
/// This is recorded after every [`CriticalDueTo`] or [`PowerDownDueTo`]
/// entry so that the last known real life temperature can be compared to
/// the worst-case temperature projection that caused a thermal loop state
/// transition.
#[count(skip)]
LastRealTemperature {
sensor_id: SensorId,
/// The most recent real life (not fake) temperature measurement from
/// the sensor.
temperature: units::Celsius,
/// The (approximate) time, in seconds, since the real life temperature
/// measurement was received.
age_s: f32,
},
Comment on lines +113 to 128
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I split this into its own ringbuf entries because adding two more 32-bit fields to CriticalDueTo and PowerDownDueTo would have probably made them the largest ringbuf entry and made each ringbuf slot a word bigger, which I wanted to avoid.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going to ask this exact question, good anticipation! :-)

/// Total duration spent in the overheated control regime.
#[count(skip)]
Expand Down