From 436f5dd690f6c37635c89b982c7931be03ae137f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Dec 2024 11:39:37 -0800 Subject: [PATCH 01/14] gimlet-seq: Record why the power state changed Currently, when the Gimlet CPU sequencer changes the system's power state, no information about *why* the power state changed is recorded by the SP firmware. A system may power off or reboot for a variety of reasons: it may be requested by the host OS over IPCC, by the control plane over the management network, or triggered by the thermal task due to an overheat condition. This makes debugging an unexpected reboot or power off difficult, as the SP ringbuffers and other diagnostics do not indicate why an unexpected power state change occurred. See #1950 for a motivating example. This commit resolves this as described in #1950 by adding a new field to the `SetState` variant in the `drv-gimlet-seq-server` ringbuffer, so that the reason a power state change occurred can be recorded. Clients of the `cpu_seq` IPC API must now provide a `StateChangeReason` when calling `Sequencer.set_state`, along with the desired power state, and the sequencer task will record the provided reason in its ringbuffer. This way, we can distinguish between the various reasons a power state change may have occurred when debugging such issues. The `StateChangeReason` enum also generates counters, so that the total number of power state changes can be tracked. Fixes #1950 --- drv/cpu-seq-api/src/lib.rs | 17 ++++++++++++++ drv/gimlet-seq-server/src/main.rs | 23 ++++++++++++++----- idl/cpu-seq.idol | 6 ++++- .../src/mgs_compute_sled.rs | 5 +++- task/host-sp-comms/src/main.rs | 12 ++++++++-- task/thermal/src/bsp/gimlet_bcdef.rs | 5 ++-- 6 files changed, 56 insertions(+), 12 deletions(-) diff --git a/drv/cpu-seq-api/src/lib.rs b/drv/cpu-seq-api/src/lib.rs index 8ab006e09..d95a8562b 100644 --- a/drv/cpu-seq-api/src/lib.rs +++ b/drv/cpu-seq-api/src/lib.rs @@ -9,6 +9,7 @@ use counters::Count; use derive_idol_err::IdolError; use userlib::{sys_send, FromPrimitive}; +use zerocopy::AsBytes; // Re-export PowerState for client convenience. pub use drv_cpu_power_state::PowerState; @@ -31,6 +32,22 @@ pub enum SeqError { ServerRestarted, } +#[derive(Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, AsBytes, Count)] +#[repr(u8)] +pub enum StateChangeReason { + /// TThe system has just received power, so the sequencer has booted the + /// host CPU. + InitialPowerOn = 1, + /// A power state change was requested by the control plane. + ControlPlane, + /// The host OS requested that the system power off without rebooting. + HostPowerOff, + /// The host OS requested that the system reboot. + HostReboot, + /// The system powered off because a component has overheated. + Overheat, +} + // On Gimlet, we have two banks of up to 8 DIMMs apiece. Export the "two banks" // bit of knowledge here so it can be used by gimlet-seq-server, spd, and // packrat, all of which want to know at compile-time how many banks there are. diff --git a/drv/gimlet-seq-server/src/main.rs b/drv/gimlet-seq-server/src/main.rs index 0c3776e96..cca31aaf4 100644 --- a/drv/gimlet-seq-server/src/main.rs +++ b/drv/gimlet-seq-server/src/main.rs @@ -17,7 +17,7 @@ use userlib::{ sys_set_timer, task_slot, units, RecvMessage, TaskId, UnwrapLite, }; -use drv_cpu_seq_api::{PowerState, SeqError}; +use drv_cpu_seq_api::{PowerState, SeqError, StateChangeReason}; use drv_hf_api as hf_api; use drv_i2c_api as i2c; use drv_ice40_spi_program as ice40; @@ -90,7 +90,12 @@ enum Trace { RailsOn, UartEnabled, A0(u16), - SetState(PowerState, PowerState, u64), + SetState( + PowerState, + PowerState, + #[count(children)] StateChangeReason, + u64, + ), UpdateState(#[count(children)] PowerState), ClockConfigWrite, ClockConfigSuccess, @@ -482,7 +487,10 @@ impl ServerImpl { // Power on, unless suppressed by the `stay-in-a2` feature if !cfg!(feature = "stay-in-a2") { - _ = server.set_state_internal(PowerState::A0); + _ = server.set_state_internal( + PowerState::A0, + StateChangeReason::InitialPowerOn, + ); } // @@ -666,11 +674,12 @@ impl ServerImpl { fn set_state_internal( &mut self, state: PowerState, + reason: StateChangeReason, ) -> Result<(), SeqError> { let sys = sys_api::Sys::from(SYS.get_task_id()); let now = sys_get_timer().now; - ringbuf_entry!(Trace::SetState(self.state, state, now)); + ringbuf_entry!(Trace::SetState(self.state, state, reason, now)); ringbuf_entry_v3p3_sys_a0_vout(); @@ -1032,8 +1041,10 @@ impl idl::InOrderSequencerImpl for ServerImpl { &mut self, _: &RecvMessage, state: PowerState, + reason: StateChangeReason, ) -> Result<(), RequestError> { - self.set_state_internal(state).map_err(RequestError::from) + self.set_state_internal(state, reason) + .map_err(RequestError::from) } fn send_hardware_nmi( @@ -1403,7 +1414,7 @@ cfg_if::cfg_if! { } mod idl { - use super::SeqError; + use super::{SeqError, StateChangeReason}; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); } diff --git a/idl/cpu-seq.idol b/idl/cpu-seq.idol index f345f36ed..ba1909788 100644 --- a/idl/cpu-seq.idol +++ b/idl/cpu-seq.idol @@ -18,7 +18,11 @@ Interface( "state": ( type: "drv_cpu_power_state::PowerState", recv: FromPrimitive("u8"), - ) + ), + "reason": ( + type: "StateChangeReason", + recv: FromPrimitive("u8"), + ), }, reply: Result( ok: "()", diff --git a/task/control-plane-agent/src/mgs_compute_sled.rs b/task/control-plane-agent/src/mgs_compute_sled.rs index d191cefcd..18d43f838 100644 --- a/task/control-plane-agent/src/mgs_compute_sled.rs +++ b/task/control-plane-agent/src/mgs_compute_sled.rs @@ -728,7 +728,10 @@ impl SpHandler for MgsHandler { }; self.sequencer - .set_state(power_state) + .set_state( + power_state, + drv_cpu_seq_api::StateChangeReason::ControlPlane, + ) .map_err(|e| SpError::PowerStateError(e as u32)) } diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index f0d39438c..78ac633d7 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -374,11 +374,16 @@ impl ServerImpl { // basically only ever succeed in our initial set_state() request, so I // don't know how we'd test it fn power_off_host(&mut self, reboot: bool) { + let reason = if reboot { + drv_cpu_seq_api::StateChangeReason::HostReboot + } else { + drv_cpu_seq_api::StateChangeReason::HostPowerOff + }; loop { // Attempt to move to A2; given we only call this function in // response to a host request, we expect we're currently in A0 and // this should work. - let err = match self.sequencer.set_state(PowerState::A2) { + let err = match self.sequencer.set_state(PowerState::A2, reason) { Ok(()) => { ringbuf_entry!(Trace::SetState { now: sys_get_timer().now, @@ -1420,7 +1425,10 @@ fn handle_reboot_waiting_in_a2_timer( now: sys_get_timer().now, state: PowerState::A0, }); - _ = sequencer.set_state(PowerState::A0); + _ = sequencer.set_state( + PowerState::A0, + drv_cpu_seq_api::StateChangeReason::HostReboot, + ); *reboot_state = None; } } diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index 220a2fe2d..92ba23341 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -12,7 +12,7 @@ use crate::{ i2c_config::{devices, sensors}, }; pub use drv_cpu_seq_api::SeqError; -use drv_cpu_seq_api::{PowerState, Sequencer}; +use drv_cpu_seq_api::{PowerState, Sequencer, StateChangeReason}; use task_sensor_api::SensorId; use task_thermal_api::ThermalProperties; use userlib::{task_slot, units::Celsius, TaskId, UnwrapLite}; @@ -109,7 +109,8 @@ impl Bsp { } pub fn power_down(&self) -> Result<(), SeqError> { - self.seq.set_state(PowerState::A2) + self.seq + .set_state(PowerState::A2, StateChangeReason::Overheat) } pub fn power_mode(&self) -> PowerBitmask { From 7b43a728d0f2f28895767608c8820de1602ec882 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Dec 2024 11:50:38 -0800 Subject: [PATCH 02/14] gimlet-seq: Name `SetState` ringbuf entry fields Currently, the `Trace::SetState` ringbuf entry in the sequencer is a tuple-like enum variant. This entry includes two `PowerState` fields, one recording the previous power state and the other recording the new power state that has been set. IMHO, using a tuple-like variant to represent this is a bit unfortunate, as in Humility, we'll see two values of the same type and it's not immediately obvious which is the previous state and which is the new state. This must be determined based on the order of the fields in the ringbuf entry, which requires referencing the Hubris code to determine. I felt like it was nicer to just use a struct-like variant with named fields for this. That way, the semantic meaning of the two `PowerState`s is actually encoded in the debug info, and Humility can just indicate which is the previous state and which is the new state when displaying the ring buffer. I also think it's a bit nicer to name the timestamp field --- otherwise, it just looks like some arbitrary integer, and you need to look at the code to determine that it's the timestamp of the power state change. If this is controversial for some reason, I'm happy to land it in a separate PR, but I figured it was nice to do while I was messing with the sequencer ringbuf. --- drv/gimlet-seq-server/src/main.rs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drv/gimlet-seq-server/src/main.rs b/drv/gimlet-seq-server/src/main.rs index cca31aaf4..2e29cd620 100644 --- a/drv/gimlet-seq-server/src/main.rs +++ b/drv/gimlet-seq-server/src/main.rs @@ -90,12 +90,13 @@ enum Trace { RailsOn, UartEnabled, A0(u16), - SetState( - PowerState, - PowerState, - #[count(children)] StateChangeReason, - u64, - ), + SetState { + prev: PowerState, + next: PowerState, + #[count(children)] + why: StateChangeReason, + now: u64, + }, UpdateState(#[count(children)] PowerState), ClockConfigWrite, ClockConfigSuccess, @@ -674,12 +675,17 @@ impl ServerImpl { fn set_state_internal( &mut self, state: PowerState, - reason: StateChangeReason, + why: StateChangeReason, ) -> Result<(), SeqError> { let sys = sys_api::Sys::from(SYS.get_task_id()); let now = sys_get_timer().now; - ringbuf_entry!(Trace::SetState(self.state, state, reason, now)); + ringbuf_entry!(Trace::SetState { + prev: self.state, + next: state, + why, + now + }); ringbuf_entry_v3p3_sys_a0_vout(); From ee14af80332cab76899c04f8b854434d7e104216 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Dec 2024 12:13:12 -0800 Subject: [PATCH 03/14] gimletlet: fix mock seq server --- drv/mock-gimlet-seq-server/src/main.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drv/mock-gimlet-seq-server/src/main.rs b/drv/mock-gimlet-seq-server/src/main.rs index 64b0733bd..02023cbbb 100644 --- a/drv/mock-gimlet-seq-server/src/main.rs +++ b/drv/mock-gimlet-seq-server/src/main.rs @@ -7,7 +7,7 @@ #![no_std] #![no_main] -use drv_cpu_seq_api::{PowerState, SeqError}; +use drv_cpu_seq_api::{PowerState, SeqError, StateChangeReason}; use idol_runtime::{NotificationHandler, RequestError}; use task_jefe_api::Jefe; use userlib::{FromPrimitive, RecvMessage, UnwrapLite}; @@ -58,6 +58,7 @@ impl idl::InOrderSequencerImpl for ServerImpl { &mut self, _: &RecvMessage, state: PowerState, + _: StateChangeReason, ) -> Result<(), RequestError> { match (self.get_state_impl(), state) { (PowerState::A2, PowerState::A0) @@ -99,7 +100,7 @@ impl NotificationHandler for ServerImpl { } mod idl { - use super::SeqError; + use super::{SeqError, StateChangeReason}; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); } From 976a9ab46f965a96fb5042f1a76883ad15b493a1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Dec 2024 12:33:04 -0800 Subject: [PATCH 04/14] agh, of course there's also grapefruit... --- drv/grapefruit-seq-server/src/main.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drv/grapefruit-seq-server/src/main.rs b/drv/grapefruit-seq-server/src/main.rs index 176ac1570..c42a6af52 100644 --- a/drv/grapefruit-seq-server/src/main.rs +++ b/drv/grapefruit-seq-server/src/main.rs @@ -7,7 +7,7 @@ #![no_std] #![no_main] -use drv_cpu_seq_api::PowerState; +use drv_cpu_seq_api::{PowerState, StateChangeReason}; use drv_spi_api::{SpiDevice, SpiServer}; use drv_stm32xx_sys_api as sys_api; use idol_runtime::{NotificationHandler, RequestError}; @@ -285,6 +285,7 @@ impl idl::InOrderSequencerImpl for ServerImpl { &mut self, _: &RecvMessage, state: PowerState, + _: StateChangeReason, ) -> Result<(), RequestError> { match (self.get_state_impl(), state) { (PowerState::A2, PowerState::A0) @@ -327,7 +328,7 @@ impl NotificationHandler for ServerImpl { } mod idl { - use drv_cpu_seq_api::SeqError; + use drv_cpu_seq_api::{SeqError, StateChangeReason}; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); } From 63995e0cdd0e5087244bbd413db7d00b6654bfc5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Dec 2024 12:40:13 -0800 Subject: [PATCH 05/14] fix typo thanks @aapoalas! Co-authored-by: Aapo Alasuutari --- drv/cpu-seq-api/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drv/cpu-seq-api/src/lib.rs b/drv/cpu-seq-api/src/lib.rs index d95a8562b..2a35f4f12 100644 --- a/drv/cpu-seq-api/src/lib.rs +++ b/drv/cpu-seq-api/src/lib.rs @@ -35,7 +35,7 @@ pub enum SeqError { #[derive(Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, AsBytes, Count)] #[repr(u8)] pub enum StateChangeReason { - /// TThe system has just received power, so the sequencer has booted the + /// The system has just received power, so the sequencer has booted the /// host CPU. InitialPowerOn = 1, /// A power state change was requested by the control plane. From 417a03648346320d10ad155e932a19fc4fce77e0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Dec 2024 10:37:35 -0800 Subject: [PATCH 06/14] differentiate host panics --- drv/cpu-seq-api/src/lib.rs | 2 ++ drv/gimlet-seq-server/src/main.rs | 1 + task/host-sp-comms/src/main.rs | 23 +++++++++++++++++++---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drv/cpu-seq-api/src/lib.rs b/drv/cpu-seq-api/src/lib.rs index 2a35f4f12..9f01ed193 100644 --- a/drv/cpu-seq-api/src/lib.rs +++ b/drv/cpu-seq-api/src/lib.rs @@ -42,6 +42,8 @@ pub enum StateChangeReason { ControlPlane, /// The host OS requested that the system power off without rebooting. HostPowerOff, + /// The host OS panicked. + HostPanic, /// The host OS requested that the system reboot. HostReboot, /// The system powered off because a component has overheated. diff --git a/drv/gimlet-seq-server/src/main.rs b/drv/gimlet-seq-server/src/main.rs index 2e29cd620..5ba7f8577 100644 --- a/drv/gimlet-seq-server/src/main.rs +++ b/drv/gimlet-seq-server/src/main.rs @@ -828,6 +828,7 @@ impl ServerImpl { // // Now wait for the end of Group C. + // loop { let mut status = [0u8]; diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 78ac633d7..99a8dc045 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -12,7 +12,7 @@ use attest_data::messages::{ HostToRotCommand, RecvSprotError as AttestDataSprotError, RotToHost, MAX_DATA_LEN, }; -use drv_cpu_seq_api::{PowerState, SeqError, Sequencer}; +use drv_cpu_seq_api::{PowerState, SeqError, Sequencer, StateChangeReason}; use drv_hf_api::{HfDevSelect, HfMuxState, HostFlash}; use drv_sprot_api::SpRot; use drv_stm32xx_sys_api as sys_api; @@ -250,6 +250,11 @@ struct ServerImpl { reboot_state: Option, host_kv_storage: HostKeyValueStorage, hf_mux_state: Option, + /// Set to true when the host OS panics, and unset when the system reboots. + /// + /// This is used to determine whether a host-triggered power-off is due to a + /// kernel panic or was a normal power-off. + host_just_panicked: bool, } impl ServerImpl { @@ -317,6 +322,7 @@ impl ServerImpl { dtrace_conf_len: 0, }, hf_mux_state: None, + host_just_panicked: false, } } @@ -374,10 +380,15 @@ impl ServerImpl { // basically only ever succeed in our initial set_state() request, so I // don't know how we'd test it fn power_off_host(&mut self, reboot: bool) { - let reason = if reboot { - drv_cpu_seq_api::StateChangeReason::HostReboot + let reason = if self.host_just_panicked { + // Clear the panic flag, so that subsequent power state transitions + // are not marked as panics unless the host OS once again panics. + self.host_just_panicked = false; + StateChangeReason::HostPanic + } else if reboot { + StateChangeReason::HostReboot } else { - drv_cpu_seq_api::StateChangeReason::HostPowerOff + StateChangeReason::HostPowerOff }; loop { // Attempt to move to A2; given we only call this function in @@ -817,6 +828,10 @@ impl ServerImpl { Some(SpToHost::Ack) } HostToSp::HostPanic => { + // Indicate that a subsequent power-off request will be due to a + // panic. + self.host_just_panicked = true; + // TODO forward to MGS // // For now, copy it into a static var we can pull out via From 7e6b0d31f80a9821e6a72ec22821b8fe6611cbb3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Dec 2024 11:03:29 -0800 Subject: [PATCH 07/14] leave `Sequencer.set_state` as-is for Hiffy compat Since manufacturing and test automation currently uses the `Sequencer.set_state` IPC via Hiffy, let's avoid breaking it and instead introduce a separate `Sequencer.set_state_with_reason`. Now, calls to `set_state` without a reason will get `StateChangeReason::Other`. In practice, this means Hiffy, as all Hubris-internal callers now use `set_state_with_reason`. --- drv/cpu-seq-api/src/lib.rs | 9 ++++++++- drv/gimlet-seq-server/src/main.rs | 8 ++++++++ drv/grapefruit-seq-server/src/main.rs | 9 +++++++++ drv/mock-gimlet-seq-server/src/main.rs | 8 ++++++++ idl/cpu-seq.idol | 13 +++++++++++++ task/control-plane-agent/src/mgs_compute_sled.rs | 2 +- task/host-sp-comms/src/main.rs | 9 ++++++--- task/thermal/src/bsp/gimlet_bcdef.rs | 2 +- 8 files changed, 54 insertions(+), 6 deletions(-) diff --git a/drv/cpu-seq-api/src/lib.rs b/drv/cpu-seq-api/src/lib.rs index 9f01ed193..f909f3ba3 100644 --- a/drv/cpu-seq-api/src/lib.rs +++ b/drv/cpu-seq-api/src/lib.rs @@ -35,9 +35,16 @@ pub enum SeqError { #[derive(Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, AsBytes, Count)] #[repr(u8)] pub enum StateChangeReason { + /// No reason was provided. + /// + /// This indicates a legacy caller of `Sequencer.set_state`, rather than + /// `Sequencer.set_state_with_reason`. All Hubris-internal callers should + /// use `set_state_with_reason`, so this variant generally indicates that + /// the `Sequencer.set_state` IPC is being called via Hiffy. + Other = 1, /// The system has just received power, so the sequencer has booted the /// host CPU. - InitialPowerOn = 1, + InitialPowerOn, /// A power state change was requested by the control plane. ControlPlane, /// The host OS requested that the system power off without rebooting. diff --git a/drv/gimlet-seq-server/src/main.rs b/drv/gimlet-seq-server/src/main.rs index 5ba7f8577..8bd9a6756 100644 --- a/drv/gimlet-seq-server/src/main.rs +++ b/drv/gimlet-seq-server/src/main.rs @@ -1045,6 +1045,14 @@ impl idl::InOrderSequencerImpl for ServerImpl { } fn set_state( + &mut self, + msg: &RecvMessage, + state: PowerState, + ) -> Result<(), RequestError> { + self.set_state_with_reason(msg, state, StateChangeReason::Other) + } + + fn set_state_with_reason( &mut self, _: &RecvMessage, state: PowerState, diff --git a/drv/grapefruit-seq-server/src/main.rs b/drv/grapefruit-seq-server/src/main.rs index c42a6af52..75717f33e 100644 --- a/drv/grapefruit-seq-server/src/main.rs +++ b/drv/grapefruit-seq-server/src/main.rs @@ -29,6 +29,7 @@ enum Trace { ContinueBitstreamLoad(usize), WaitForDone, Programmed, + #[count(skip)] None, } @@ -282,6 +283,14 @@ impl idl::InOrderSequencerImpl for ServerImpl { } fn set_state( + &mut self, + msg: &RecvMessage, + state: PowerState, + ) -> Result<(), RequestError> { + self.set_state_with_reason(msg, state, StateChangeReason::Other) + } + + fn set_state_with_reason( &mut self, _: &RecvMessage, state: PowerState, diff --git a/drv/mock-gimlet-seq-server/src/main.rs b/drv/mock-gimlet-seq-server/src/main.rs index 02023cbbb..c520fd9ff 100644 --- a/drv/mock-gimlet-seq-server/src/main.rs +++ b/drv/mock-gimlet-seq-server/src/main.rs @@ -55,6 +55,14 @@ impl idl::InOrderSequencerImpl for ServerImpl { } fn set_state( + &mut self, + msg: &RecvMessage, + state: PowerState, + ) -> Result<(), ReqestError> { + self.set_state_with_reason(msg, state, StateChangeReason::Other) + } + + fn set_state_with_reason( &mut self, _: &RecvMessage, state: PowerState, diff --git a/idl/cpu-seq.idol b/idl/cpu-seq.idol index ba1909788..12015daa7 100644 --- a/idl/cpu-seq.idol +++ b/idl/cpu-seq.idol @@ -13,6 +13,19 @@ Interface( idempotent: true, ), "set_state": ( + doc: "Set the power state without providing a reason (legacy).", + args: { + "state": ( + type: "drv_cpu_power_state::PowerState", + recv: FromPrimitive("u8"), + ), + }, + reply: Result( + ok: "()", + err: CLike("SeqError"), + ), + ), + "set_state_with_reason": ( doc: "Set the power state", args: { "state": ( diff --git a/task/control-plane-agent/src/mgs_compute_sled.rs b/task/control-plane-agent/src/mgs_compute_sled.rs index 18d43f838..54c46855b 100644 --- a/task/control-plane-agent/src/mgs_compute_sled.rs +++ b/task/control-plane-agent/src/mgs_compute_sled.rs @@ -728,7 +728,7 @@ impl SpHandler for MgsHandler { }; self.sequencer - .set_state( + .set_state_with_reason( power_state, drv_cpu_seq_api::StateChangeReason::ControlPlane, ) diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 99a8dc045..6a5de2147 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -394,7 +394,10 @@ impl ServerImpl { // Attempt to move to A2; given we only call this function in // response to a host request, we expect we're currently in A0 and // this should work. - let err = match self.sequencer.set_state(PowerState::A2, reason) { + let err = match self + .sequencer + .set_state_with_reason(PowerState::A2, reason) + { Ok(()) => { ringbuf_entry!(Trace::SetState { now: sys_get_timer().now, @@ -1440,9 +1443,9 @@ fn handle_reboot_waiting_in_a2_timer( now: sys_get_timer().now, state: PowerState::A0, }); - _ = sequencer.set_state( + _ = sequencer.set_state_with_reason( PowerState::A0, - drv_cpu_seq_api::StateChangeReason::HostReboot, + StateChangeReason::HostReboot, ); *reboot_state = None; } diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index 92ba23341..ba08f04c4 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -110,7 +110,7 @@ impl Bsp { pub fn power_down(&self) -> Result<(), SeqError> { self.seq - .set_state(PowerState::A2, StateChangeReason::Overheat) + .set_state_with_reason(PowerState::A2, StateChangeReason::Overheat) } pub fn power_mode(&self) -> PowerBitmask { From 0be577451fec84f490399cf9e617f76a617ff147 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Dec 2024 11:45:03 -0800 Subject: [PATCH 08/14] argh i cant type --- drv/mock-gimlet-seq-server/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drv/mock-gimlet-seq-server/src/main.rs b/drv/mock-gimlet-seq-server/src/main.rs index c520fd9ff..ce7750db8 100644 --- a/drv/mock-gimlet-seq-server/src/main.rs +++ b/drv/mock-gimlet-seq-server/src/main.rs @@ -58,7 +58,7 @@ impl idl::InOrderSequencerImpl for ServerImpl { &mut self, msg: &RecvMessage, state: PowerState, - ) -> Result<(), ReqestError> { + ) -> Result<(), RequestError> { self.set_state_with_reason(msg, state, StateChangeReason::Other) } From 2650ced4a3bcfaf0ef1384ee64b1f0961e794e2b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Dec 2024 15:18:48 -0800 Subject: [PATCH 09/14] separate panics, boot failures, and resets in host-sp-comms --- drv/cpu-seq-api/src/lib.rs | 9 +++++-- task/host-sp-comms/src/main.rs | 49 +++++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/drv/cpu-seq-api/src/lib.rs b/drv/cpu-seq-api/src/lib.rs index f909f3ba3..4dc3ba20b 100644 --- a/drv/cpu-seq-api/src/lib.rs +++ b/drv/cpu-seq-api/src/lib.rs @@ -47,10 +47,15 @@ pub enum StateChangeReason { InitialPowerOn, /// A power state change was requested by the control plane. ControlPlane, - /// The host OS requested that the system power off without rebooting. - HostPowerOff, + /// The host CPU reset while in A0, so the system has powered off to clear + /// hidden core state. + CpuReset, + /// The host OS failed to boot, so the system has powered off. + HostBootFailure, /// The host OS panicked. HostPanic, + /// The host OS requested that the system power off without rebooting. + HostPowerOff, /// The host OS requested that the system reboot. HostReboot, /// The system powered off because a component has overheated. diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 6a5de2147..b693ee3ef 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -250,11 +250,19 @@ struct ServerImpl { reboot_state: Option, host_kv_storage: HostKeyValueStorage, hf_mux_state: Option, - /// Set to true when the host OS panics, and unset when the system reboots. + /// Set when the host OS fails to boot or panics, and unset when the system + /// reboots. /// /// This is used to determine whether a host-triggered power-off is due to a - /// kernel panic or was a normal power-off. - host_just_panicked: bool, + /// kernel panic, boot failure, or was a normal power-off. + last_power_off: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum PowerOffReason { + HostPanic, + HostBootFailure, + CpuReset, } impl ServerImpl { @@ -322,7 +330,7 @@ impl ServerImpl { dtrace_conf_len: 0, }, hf_mux_state: None, - host_just_panicked: false, + last_power_off: None, } } @@ -380,15 +388,14 @@ impl ServerImpl { // basically only ever succeed in our initial set_state() request, so I // don't know how we'd test it fn power_off_host(&mut self, reboot: bool) { - let reason = if self.host_just_panicked { - // Clear the panic flag, so that subsequent power state transitions - // are not marked as panics unless the host OS once again panics. - self.host_just_panicked = false; - StateChangeReason::HostPanic - } else if reboot { - StateChangeReason::HostReboot - } else { - StateChangeReason::HostPowerOff + let reason = match self.last_power_off { + None if reboot => StateChangeReason::HostReboot, + None => StateChangeReason::HostPowerOff, + Some(PowerOffReason::HostPanic) => StateChangeReason::HostPanic, + Some(PowerOffReason::HostBootFailure) => { + StateChangeReason::HostBootFailure + } + Some(PowerOffReason::CpuReset) => StateChangeReason::CpuReset, }; loop { // Attempt to move to A2; given we only call this function in @@ -463,6 +470,9 @@ impl ServerImpl { // move to A0. Otherwise, ignore this notification. match state { PowerState::A2 | PowerState::A2PlusFans => { + // Clear the last power-off, as we have now reached A2; + // subsequent power-offs will set a new reason. + self.last_power_off = None; // Were we waiting for a transition to A2? If so, start our // timer for going back to A0. if self.reboot_state == Some(RebootState::WaitingForA2) { @@ -482,6 +492,7 @@ impl ServerImpl { // we cannot let the SoC simply reset because the true state // of hidden cores is unknown: explicitly bounce to A2 // as if the host had requested it. + self.last_power_off = Some(PowerOffReason::CpuReset); self.power_off_host(true); } @@ -814,6 +825,9 @@ impl ServerImpl { Some(response) } HostToSp::HostBootFailure { reason } => { + // Indicate that the host boot failed, so that we can then tell + // sequencer why we are asking it to power off the system. + self.last_power_off = Some(PowerOffReason::HostPanic); // TODO forward to MGS // // For now, copy it into a static var we can pull out via @@ -832,8 +846,13 @@ impl ServerImpl { } HostToSp::HostPanic => { // Indicate that a subsequent power-off request will be due to a - // panic. - self.host_just_panicked = true; + // panic. Since a host panic message is also sent after a host + // boot failure, only set the last power-off reason if we + // haven't just seen a boot failure. + if self.last_power_off != Some(PowerOffReason::HostBootFailure) + { + self.last_power_off = Some(PowerOffReason::HostPanic); + } // TODO forward to MGS // From cc5a068b17bfc78e29d2523c13454b24f706926b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Dec 2024 09:56:06 -0800 Subject: [PATCH 10/14] actually record host boot failures --- task/host-sp-comms/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index b693ee3ef..e4d0fd1ad 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -827,7 +827,7 @@ impl ServerImpl { HostToSp::HostBootFailure { reason } => { // Indicate that the host boot failed, so that we can then tell // sequencer why we are asking it to power off the system. - self.last_power_off = Some(PowerOffReason::HostPanic); + self.last_power_off = Some(PowerOffReason::HostBootFailure); // TODO forward to MGS // // For now, copy it into a static var we can pull out via From 8e608881a711ff544f5ed4480109727ab1d589d4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Dec 2024 09:59:22 -0800 Subject: [PATCH 11/14] clear last power off only upon reaching A0 --- task/host-sp-comms/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index e4d0fd1ad..e5c9963e9 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -470,9 +470,6 @@ impl ServerImpl { // move to A0. Otherwise, ignore this notification. match state { PowerState::A2 | PowerState::A2PlusFans => { - // Clear the last power-off, as we have now reached A2; - // subsequent power-offs will set a new reason. - self.last_power_off = None; // Were we waiting for a transition to A2? If so, start our // timer for going back to A0. if self.reboot_state == Some(RebootState::WaitingForA2) { @@ -497,6 +494,9 @@ impl ServerImpl { } PowerState::A0 | PowerState::A0PlusHP | PowerState::A0Thermtrip => { + // Clear the last power-off, as we have now reached A0; + // subsequent power-offs will set a new reason. + self.last_power_off = None; // TODO should we clear self.reboot_state here? What if we // transitioned from one A0 state to another? For now, leave it // set, and we'll move back to A0 whenever we transition to From 108894718aec404173b363f5c6f77658e3af8dbb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Dec 2024 10:48:59 -0800 Subject: [PATCH 12/14] include the last reason in host-sp-comms reboots Instead of just setting it to `HostReboot` always, hang onto the last power off until reaching A0, so that the reason can be sent for the `set_state` call to reboot, as well as the power off. Also, just use `StateChangeReason` here instead of our own enum, and add it to the `host-sp-comms` ringbuf as well. --- task/host-sp-comms/src/main.rs | 68 +++++++++++++++------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index e5c9963e9..70c772956 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -106,6 +106,7 @@ enum Trace { now: u64, #[count(children)] state: PowerState, + why: StateChangeReason, }, HfMux { now: u64, @@ -255,14 +256,7 @@ struct ServerImpl { /// /// This is used to determine whether a host-triggered power-off is due to a /// kernel panic, boot failure, or was a normal power-off. - last_power_off: Option, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum PowerOffReason { - HostPanic, - HostBootFailure, - CpuReset, + last_power_off: Option, } impl ServerImpl { @@ -388,35 +382,31 @@ impl ServerImpl { // basically only ever succeed in our initial set_state() request, so I // don't know how we'd test it fn power_off_host(&mut self, reboot: bool) { - let reason = match self.last_power_off { + let why = match self.last_power_off { None if reboot => StateChangeReason::HostReboot, None => StateChangeReason::HostPowerOff, - Some(PowerOffReason::HostPanic) => StateChangeReason::HostPanic, - Some(PowerOffReason::HostBootFailure) => { - StateChangeReason::HostBootFailure - } - Some(PowerOffReason::CpuReset) => StateChangeReason::CpuReset, + Some(reason) => reason, }; loop { // Attempt to move to A2; given we only call this function in // response to a host request, we expect we're currently in A0 and // this should work. - let err = match self - .sequencer - .set_state_with_reason(PowerState::A2, reason) - { - Ok(()) => { - ringbuf_entry!(Trace::SetState { - now: sys_get_timer().now, - state: PowerState::A2, - }); - if reboot { - self.reboot_state = Some(RebootState::WaitingForA2); + let err = + match self.sequencer.set_state_with_reason(PowerState::A2, why) + { + Ok(()) => { + ringbuf_entry!(Trace::SetState { + now: sys_get_timer().now, + why, + state: PowerState::A2, + }); + if reboot { + self.reboot_state = Some(RebootState::WaitingForA2); + } + return; } - return; - } - Err(err) => err, - }; + Err(err) => err, + }; // The only error we should see from `set_state()` is an illegal // transition, if we're not currently in A0. @@ -489,7 +479,7 @@ impl ServerImpl { // we cannot let the SoC simply reset because the true state // of hidden cores is unknown: explicitly bounce to A2 // as if the host had requested it. - self.last_power_off = Some(PowerOffReason::CpuReset); + self.last_power_off = Some(StateChangeReason::CpuReset); self.power_off_host(true); } @@ -827,7 +817,7 @@ impl ServerImpl { HostToSp::HostBootFailure { reason } => { // Indicate that the host boot failed, so that we can then tell // sequencer why we are asking it to power off the system. - self.last_power_off = Some(PowerOffReason::HostBootFailure); + self.last_power_off = Some(StateChangeReason::HostBootFailure); // TODO forward to MGS // // For now, copy it into a static var we can pull out via @@ -849,9 +839,8 @@ impl ServerImpl { // panic. Since a host panic message is also sent after a host // boot failure, only set the last power-off reason if we // haven't just seen a boot failure. - if self.last_power_off != Some(PowerOffReason::HostBootFailure) - { - self.last_power_off = Some(PowerOffReason::HostPanic); + if self.last_power_off.is_none() { + self.last_power_off = Some(StateChangeReason::HostPanic); } // TODO forward to MGS @@ -1396,6 +1385,10 @@ impl NotificationHandler for ServerImpl { handle_reboot_waiting_in_a2_timer( &self.sequencer, &mut self.reboot_state, + // If the last power-off did not set a reason, it must + // be a host-requested reboot. + self.last_power_off + .unwrap_or(StateChangeReason::HostReboot), ); } Timers::TxPeriodicZeroByte => { @@ -1449,6 +1442,7 @@ fn parse_received_message( fn handle_reboot_waiting_in_a2_timer( sequencer: &Sequencer, reboot_state: &mut Option, + why: StateChangeReason, ) { // If we're past the deadline for transitioning to A0, attempt to do so. if let Some(RebootState::WaitingInA2RebootDelay) = reboot_state { @@ -1460,12 +1454,10 @@ fn handle_reboot_waiting_in_a2_timer( // we've done what we can to reboot, so clear out `reboot_state`. ringbuf_entry!(Trace::SetState { now: sys_get_timer().now, + why, state: PowerState::A0, }); - _ = sequencer.set_state_with_reason( - PowerState::A0, - StateChangeReason::HostReboot, - ); + _ = sequencer.set_state_with_reason(PowerState::A0, why); *reboot_state = None; } } From 007470bdac2b56c18e959144b5566205691d81f8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 18 Dec 2024 09:54:55 -0800 Subject: [PATCH 13/14] don't have IPC methods call each other --- drv/gimlet-seq-server/src/main.rs | 5 ++-- drv/grapefruit-seq-server/src/main.rs | 35 ++++++++++++++------------ drv/mock-gimlet-seq-server/src/main.rs | 31 +++++++++++++---------- 3 files changed, 40 insertions(+), 31 deletions(-) diff --git a/drv/gimlet-seq-server/src/main.rs b/drv/gimlet-seq-server/src/main.rs index 8bd9a6756..2e66c25b5 100644 --- a/drv/gimlet-seq-server/src/main.rs +++ b/drv/gimlet-seq-server/src/main.rs @@ -1046,10 +1046,11 @@ impl idl::InOrderSequencerImpl for ServerImpl { fn set_state( &mut self, - msg: &RecvMessage, + _: &RecvMessage, state: PowerState, ) -> Result<(), RequestError> { - self.set_state_with_reason(msg, state, StateChangeReason::Other) + self.set_state_internal(state, StateChangeReason::Other) + .map_err(RequestError::from) } fn set_state_with_reason( diff --git a/drv/grapefruit-seq-server/src/main.rs b/drv/grapefruit-seq-server/src/main.rs index 75717f33e..0045b582b 100644 --- a/drv/grapefruit-seq-server/src/main.rs +++ b/drv/grapefruit-seq-server/src/main.rs @@ -269,6 +269,17 @@ impl ServerImpl { fn set_state_impl(&self, state: PowerState) { self.jefe.set_state(state as u32); } + + fn validate_state_change(&self, state: PowerState) -> Result<(), SeqError> { + match (self.get_state_impl(), state) { + (PowerState::A2, PowerState::A0) + | (PowerState::A0, PowerState::A2) + | (PowerState::A0PlusHP, PowerState::A2) + | (PowerState::A0Thermtrip, PowerState::A2) => Ok(()), + + _ => Err(SeqError::IllegalTransition), + } + } } // The `Sequencer` implementation for Grapefruit is copied from @@ -286,8 +297,10 @@ impl idl::InOrderSequencerImpl for ServerImpl { &mut self, msg: &RecvMessage, state: PowerState, - ) -> Result<(), RequestError> { - self.set_state_with_reason(msg, state, StateChangeReason::Other) + ) -> Result<(), RequestError> { + self.validate_state_change(state)?; + self.set_state_impl(state); + Ok(()) } fn set_state_with_reason( @@ -295,20 +308,10 @@ impl idl::InOrderSequencerImpl for ServerImpl { _: &RecvMessage, state: PowerState, _: StateChangeReason, - ) -> Result<(), RequestError> { - match (self.get_state_impl(), state) { - (PowerState::A2, PowerState::A0) - | (PowerState::A0, PowerState::A2) - | (PowerState::A0PlusHP, PowerState::A2) - | (PowerState::A0Thermtrip, PowerState::A2) => { - self.set_state_impl(state); - Ok(()) - } - - _ => Err(RequestError::Runtime( - drv_cpu_seq_api::SeqError::IllegalTransition, - )), - } + ) -> Result<(), RequestError> { + self.validate_state_change(state)?; + self.set_state_impl(state); + Ok(()) } fn send_hardware_nmi( diff --git a/drv/mock-gimlet-seq-server/src/main.rs b/drv/mock-gimlet-seq-server/src/main.rs index ce7750db8..46d9f48c4 100644 --- a/drv/mock-gimlet-seq-server/src/main.rs +++ b/drv/mock-gimlet-seq-server/src/main.rs @@ -44,6 +44,17 @@ impl ServerImpl { fn set_state_impl(&self, state: PowerState) { self.jefe.set_state(state as u32); } + + fn validate_state_change(&self, state: PowerState) -> Result<(), SeqError> { + match (self.get_state_impl(), state) { + (PowerState::A2, PowerState::A0) + | (PowerState::A0, PowerState::A2) + | (PowerState::A0PlusHP, PowerState::A2) + | (PowerState::A0Thermtrip, PowerState::A2) => Ok(()), + + _ => Err(SeqError::IllegalTransition), + } + } } impl idl::InOrderSequencerImpl for ServerImpl { @@ -56,10 +67,12 @@ impl idl::InOrderSequencerImpl for ServerImpl { fn set_state( &mut self, - msg: &RecvMessage, + _: &RecvMessage, state: PowerState, ) -> Result<(), RequestError> { - self.set_state_with_reason(msg, state, StateChangeReason::Other) + self.validate_state_change(state)?; + self.set_state_impl(state); + Ok(()) } fn set_state_with_reason( @@ -68,17 +81,9 @@ impl idl::InOrderSequencerImpl for ServerImpl { state: PowerState, _: StateChangeReason, ) -> Result<(), RequestError> { - match (self.get_state_impl(), state) { - (PowerState::A2, PowerState::A0) - | (PowerState::A0, PowerState::A2) - | (PowerState::A0PlusHP, PowerState::A2) - | (PowerState::A0Thermtrip, PowerState::A2) => { - self.set_state_impl(state); - Ok(()) - } - - _ => Err(RequestError::Runtime(SeqError::IllegalTransition)), - } + self.validate_state_change(state)?; + self.set_state_impl(state); + Ok(()) } fn send_hardware_nmi( From 4e838fbd381d737dc4728578ab5b59b319977868 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 18 Dec 2024 15:00:18 -0800 Subject: [PATCH 14/14] unbreak grapefruit again --- drv/grapefruit-seq-server/src/main.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drv/grapefruit-seq-server/src/main.rs b/drv/grapefruit-seq-server/src/main.rs index 0045b582b..5787e7cec 100644 --- a/drv/grapefruit-seq-server/src/main.rs +++ b/drv/grapefruit-seq-server/src/main.rs @@ -270,14 +270,17 @@ impl ServerImpl { self.jefe.set_state(state as u32); } - fn validate_state_change(&self, state: PowerState) -> Result<(), SeqError> { + fn validate_state_change( + &self, + state: PowerState, + ) -> Result<(), drv_cpu_seq_api::SeqError> { match (self.get_state_impl(), state) { (PowerState::A2, PowerState::A0) | (PowerState::A0, PowerState::A2) | (PowerState::A0PlusHP, PowerState::A2) | (PowerState::A0Thermtrip, PowerState::A2) => Ok(()), - _ => Err(SeqError::IllegalTransition), + _ => Err(drv_cpu_seq_api::SeqError::IllegalTransition), } } } @@ -295,9 +298,9 @@ impl idl::InOrderSequencerImpl for ServerImpl { fn set_state( &mut self, - msg: &RecvMessage, + _: &RecvMessage, state: PowerState, - ) -> Result<(), RequestError> { + ) -> Result<(), RequestError> { self.validate_state_change(state)?; self.set_state_impl(state); Ok(()) @@ -308,7 +311,7 @@ impl idl::InOrderSequencerImpl for ServerImpl { _: &RecvMessage, state: PowerState, _: StateChangeReason, - ) -> Result<(), RequestError> { + ) -> Result<(), RequestError> { self.validate_state_change(state)?; self.set_state_impl(state); Ok(())