Authority overload monitor (MystenLabs#16050)

Roll forward MystenLabs#15981 Previously, the PR was causing CI failure because `max_txn_age_in_queue` was used in the config. Therefore, although `overload_threshold_config` has a default, since `max_txn_age_in_queue` is specified, serde expects the rest of config to show up in the config file. Because we don't have a serde default set for the rest of the field, it crashes. This PR added serde default for all the field in OverloadThresholdConfig in addition to the original MystenLabs#15981 . I manually verified that it can parse the config correctly when only partial config is set.
tnowacki · Feb 2, 2024 · 4a22454 · 4a22454
1 parent 9898f10
commit 4a22454
Show file tree

Hide file tree

Showing 11 changed files with 485 additions and 11 deletions.
diff --git a/crates/sui-config/src/node.rs b/crates/sui-config/src/node.rs
@@ -685,16 +685,69 @@ pub struct TransactionKeyValueStoreWriteConfig {
 /// stop processing new transactions and/or certificates until the congestion
 /// resolves.
 #[derive(Clone, Debug, Deserialize, Serialize)]
+#[serde(rename_all = "kebab-case")]
 pub struct OverloadThresholdConfig {
+    #[serde(default = "default_max_txn_age_in_queue")]
     pub max_txn_age_in_queue: Duration,
+
+    // The interval of checking overload signal.
+    #[serde(default = "default_overload_monitor_interval")]
+    pub overload_monitor_interval: Duration,
+
+    // The execution queueing latency when entering load shedding mode.
+    #[serde(default = "default_execution_queue_latency_soft_limit")]
+    pub execution_queue_latency_soft_limit: Duration,
+
+    // The execution queueing latency when entering aggressive load shedding mode.
+    #[serde(default = "default_execution_queue_latency_hard_limit")]
+    pub execution_queue_latency_hard_limit: Duration,
+
+    // The maximum percentage of transactions to shed in load shedding mode.
+    #[serde(default = "default_max_load_shedding_percentage")]
+    pub max_load_shedding_percentage: u32,
+
+    // When in aggressive load shedding mode, the the minimum percentage of
+    // transactions to shed.
+    #[serde(default = "default_min_load_shedding_percentage_above_hard_limit")]
+    pub min_load_shedding_percentage_above_hard_limit: u32,
     // TODO: Move other thresholds here as well, including `MAX_TM_QUEUE_LENGTH`
     // and `MAX_PER_OBJECT_QUEUE_LENGTH`.
 }
 
+fn default_max_txn_age_in_queue() -> Duration {
+    Duration::from_secs(1)
+}
+
+fn default_overload_monitor_interval() -> Duration {
+    Duration::from_secs(10)
+}
+
+fn default_execution_queue_latency_soft_limit() -> Duration {
+    Duration::from_secs(1)
+}
+
+fn default_execution_queue_latency_hard_limit() -> Duration {
+    Duration::from_secs(10)
+}
+
+fn default_max_load_shedding_percentage() -> u32 {
+    95
+}
+
+fn default_min_load_shedding_percentage_above_hard_limit() -> u32 {
+    50
+}
+
 impl Default for OverloadThresholdConfig {
     fn default() -> Self {
         Self {
-            max_txn_age_in_queue: Duration::from_secs(1), // 1 second
+            max_txn_age_in_queue: default_max_txn_age_in_queue(),
+            overload_monitor_interval: default_overload_monitor_interval(),
+            execution_queue_latency_soft_limit: default_execution_queue_latency_soft_limit(),
+            execution_queue_latency_hard_limit: default_execution_queue_latency_hard_limit(),
+            max_load_shedding_percentage: default_max_load_shedding_percentage(),
+            min_load_shedding_percentage_above_hard_limit:
+                default_min_load_shedding_percentage_above_hard_limit(),
         }
     }
 }

diff --git a/crates/sui-core/src/authority.rs b/crates/sui-core/src/authority.rs
@@ -140,7 +140,9 @@ use crate::epoch::committee_store::CommitteeStore;
 use crate::execution_driver::execution_process;
 use crate::in_mem_execution_cache::{ExecutionCache, ExecutionCacheRead, ExecutionCacheWrite};
 use crate::metrics::LatencyObserver;
+use crate::metrics::RateTracker;
 use crate::module_cache_metrics::ResolverMetrics;
+use crate::overload_monitor::{overload_monitor, AuthorityOverloadInfo};
 use crate::stake_aggregator::StakeAggregator;
 use crate::state_accumulator::{StateAccumulator, WrappedObject};
 use crate::subscription_handler::SubscriptionHandler;
@@ -237,6 +239,9 @@ pub struct AuthorityMetrics {
     pub(crate) skipped_consensus_txns: IntCounter,
     pub(crate) skipped_consensus_txns_cache_hit: IntCounter,
 
+    pub(crate) authority_overload_status: IntGauge,
+    pub(crate) authority_load_shedding_percentage: IntGauge,
+
     /// Post processing metrics
     post_processing_total_events_emitted: IntCounter,
     post_processing_total_tx_indexed: IntCounter,
@@ -269,6 +274,18 @@ pub struct AuthorityMetrics {
     // Tracks recent average txn queueing delay between when it is ready for execution
     // until it starts executing.
     pub execution_queueing_latency: LatencyObserver,
+
+    // Tracks the rate of transactions become ready for execution in transaction manager.
+    // The need for the Mutex is that the tracker is updated in transaction manager and read
+    // in the overload_monitor. There should be low mutex contention because
+    // transaction manager is single threaded and the read rate in overload_monitor is
+    // low. In the case where transaction manager becomes multi-threaded, we can
+    // create one rate tracker per thread.
+    pub txn_ready_rate_tracker: Arc<Mutex<RateTracker>>,
+
+    // Tracks the rate of transactions starts execution in execution driver.
+    // Similar reason for using a Mutex here as to `txn_ready_rate_tracker`.
+    pub execution_rate_tracker: Arc<Mutex<RateTracker>>,
 }
 
 // Override default Prom buckets for positive numbers in 0-50k range
@@ -453,6 +470,16 @@ impl AuthorityMetrics {
                 registry,
             )
             .unwrap(),
+            authority_overload_status: register_int_gauge_with_registry!(
+                "authority_overload_status",
+                "Whether authority is current experiencing overload and enters load shedding mode.",
+                registry)
+            .unwrap(),
+            authority_load_shedding_percentage: register_int_gauge_with_registry!(
+                "authority_load_shedding_percentage",
+                "The percentage of transactions is shed when the authority is in load shedding mode.",
+                registry)
+            .unwrap(),
             transaction_manager_object_cache_misses: register_int_counter_with_registry!(
                 "transaction_manager_object_cache_misses",
                 "Number of object-availability cache misses in TransactionManager",
@@ -618,6 +645,8 @@ impl AuthorityMetrics {
                 registry
             ).unwrap(),
             execution_queueing_latency: LatencyObserver::new(),
+            txn_ready_rate_tracker: Arc::new(Mutex::new(RateTracker::new(Duration::from_secs(10)))),
+            execution_rate_tracker: Arc::new(Mutex::new(RateTracker::new(Duration::from_secs(10)))),
         }
     }
 }
@@ -684,6 +713,9 @@ pub struct AuthorityState {
 
     /// Config for when we consider the node overloaded.
     overload_threshold_config: OverloadThresholdConfig,
+
+    /// Current overload status in this authority. Updated periodically.
+    pub overload_info: AuthorityOverloadInfo,
 }
 
 /// The authority state encapsulates all state, drives execution, and ensures safety.
@@ -2452,17 +2484,21 @@ impl AuthorityState {
             transaction_deny_config,
             certificate_deny_config,
             debug_dump_config,
-            overload_threshold_config,
+            overload_threshold_config: overload_threshold_config.clone(),
+            overload_info: AuthorityOverloadInfo::default(),
         });
 
         // Start a task to execute ready certificates.
         let authority_state = Arc::downgrade(&state);
         spawn_monitored_task!(execution_process(
             authority_state,
             rx_ready_certificates,
-            rx_execution_shutdown
+            rx_execution_shutdown,
         ));
 
+        let authority_state = Arc::downgrade(&state);
+        spawn_monitored_task!(overload_monitor(authority_state, overload_threshold_config));
+
         // TODO: This doesn't belong to the constructor of AuthorityState.
         state
             .create_owner_index_if_empty(genesis_objects, &epoch_store)

diff --git a/crates/sui-core/src/consensus_throughput_calculator.rs b/crates/sui-core/src/consensus_throughput_calculator.rs
@@ -469,6 +469,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg_attr(msim, ignore)]
     pub fn test_consensus_throughput_calculator() {
         let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
         let max_observation_points: NonZeroU64 = NonZeroU64::new(3).unwrap();
@@ -513,6 +514,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg_attr(msim, ignore)]
     pub fn test_throughput_calculator_same_timestamp_observations() {
         let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
         let max_observation_points: NonZeroU64 = NonZeroU64::new(2).unwrap();
@@ -543,6 +545,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg_attr(msim, ignore)]
     pub fn test_consensus_throughput_profiler() {
         let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
         let throughput_profile_update_interval: TimestampSecs = 5;
@@ -610,6 +613,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg_attr(msim, ignore)]
     pub fn test_consensus_throughput_profiler_update_interval() {
         let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
         let throughput_profile_update_interval: TimestampSecs = 5;
@@ -662,6 +666,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg_attr(msim, ignore)]
     pub fn test_consensus_throughput_profiler_cool_down() {
         let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
         let throughput_profile_update_window: TimestampSecs = 3;

diff --git a/crates/sui-core/src/execution_driver.rs b/crates/sui-core/src/execution_driver.rs
@@ -104,6 +104,8 @@ pub async fn execution_process(
             }
         }
 
+        authority.metrics.execution_rate_tracker.lock().record();
+
         // Certificate execution can take significant time, so run it in a separate task.
         spawn_monitored_task!(async move {
             let _scope = monitored_scope("ExecutionDriver::task");

diff --git a/crates/sui-core/src/lib.rs b/crates/sui-core/src/lib.rs
@@ -22,6 +22,7 @@ pub mod in_mem_execution_cache;
 pub mod metrics;
 pub mod module_cache_metrics;
 pub mod mysticeti_adapter;
+mod overload_monitor;
 pub(crate) mod post_consensus_tx_reorder;
 pub mod quorum_driver;
 pub mod safe_client;