Skip to content

Commit

Permalink
add metrics for system overload (MystenLabs#14573)
Browse files Browse the repository at this point in the history
## Description 

Add some metrics for:
- How long transactions wait in the execution queue.
- Rejections at the signing and executing stages due to system overload.

---
If your changes are not user-facing and not a breaking change, you can
skip the following section. Otherwise, please indicate what changed, and
then add to the Release Notes section as highlighted during the release
process.

### Type of Change (Check all that apply)

- [ ] protocol change
- [ ] user-visible impact
- [ ] breaking change for client SDKs
- [ ] breaking change for FNs (FN binary must upgrade)
- [ ] breaking change for validators or node operators (must upgrade
binaries)
- [ ] breaking change for on-chain data layout
- [ ] necessitates either a data wipe or a data migration

### Release notes
  • Loading branch information
emmazzz authored Nov 1, 2023
1 parent adbfd98 commit 8c09795
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 6 deletions.
8 changes: 8 additions & 0 deletions crates/sui-core/src/authority.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ pub struct AuthorityMetrics {
pub(crate) transaction_manager_package_cache_hits: IntCounter,
pub(crate) transaction_manager_package_cache_misses: IntCounter,
pub(crate) transaction_manager_package_cache_evictions: IntCounter,
pub(crate) transaction_manager_transaction_queue_age_s: Histogram,

pub(crate) execution_driver_executed_transactions: IntCounter,
pub(crate) execution_driver_dispatch_queue: IntGauge,
Expand Down Expand Up @@ -469,6 +470,13 @@ impl AuthorityMetrics {
registry,
)
.unwrap(),
transaction_manager_transaction_queue_age_s: register_histogram_with_registry!(
"transaction_manager_transaction_queue_age_s",
"Time spent in waiting for transaction in the queue",
LATENCY_SEC_BUCKETS.to_vec(),
registry,
)
.unwrap(),
execution_driver_executed_transactions: register_int_counter_with_registry!(
"execution_driver_executed_transactions",
"Cumulative number of transaction executed by execution driver",
Expand Down
41 changes: 38 additions & 3 deletions crates/sui-core/src/authority_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ use anyhow::Result;
use async_trait::async_trait;
use mysten_metrics::histogram::Histogram as MystenHistogram;
use mysten_metrics::spawn_monitored_task;
use prometheus::{register_int_counter_with_registry, IntCounter, Registry};
use prometheus::{
register_int_counter_vec_with_registry, register_int_counter_with_registry, IntCounter,
IntCounterVec, Registry,
};
use std::{io, sync::Arc};
use sui_network::{
api::{Validator, ValidatorServer},
Expand Down Expand Up @@ -148,6 +151,8 @@ pub struct ValidatorServiceMetrics {

num_rejected_tx_in_epoch_boundary: IntCounter,
num_rejected_cert_in_epoch_boundary: IntCounter,
num_rejected_tx_during_overload: IntCounterVec,
num_rejected_cert_during_overload: IntCounterVec,
}

impl ValidatorServiceMetrics {
Expand Down Expand Up @@ -206,6 +211,20 @@ impl ValidatorServiceMetrics {
registry,
)
.unwrap(),
num_rejected_tx_during_overload: register_int_counter_vec_with_registry!(
"validator_service_num_rejected_tx_during_overload",
"Number of rejected transaction due to system overload",
&["error_type"],
registry,
)
.unwrap(),
num_rejected_cert_during_overload: register_int_counter_vec_with_registry!(
"validator_service_num_rejected_cert_during_overload",
"Number of rejected transaction certificate due to system overload",
&["error_type"],
registry,
)
.unwrap(),
}
}

Expand Down Expand Up @@ -307,7 +326,15 @@ impl ValidatorService {
}
.into()
);
state.check_system_overload(&consensus_adapter, transaction.data())?;
let overload_check_res =
state.check_system_overload(&consensus_adapter, transaction.data());
if let Err(error) = overload_check_res {
metrics
.num_rejected_tx_during_overload
.with_label_values(&[error.as_ref()])
.inc();
return Err(error.into());
}
let _handle_tx_metrics_guard = metrics.handle_transaction_latency.start_timer();

let tx_verif_metrics_guard = metrics.tx_verification_latency.start_timer();
Expand Down Expand Up @@ -395,7 +422,15 @@ impl ValidatorService {
);

// Check system overload
state.check_system_overload(&consensus_adapter, certificate.data())?;
let overload_check_res =
state.check_system_overload(&consensus_adapter, certificate.data());
if let Err(error) = overload_check_res {
metrics
.num_rejected_cert_during_overload
.with_label_values(&[error.as_ref()])
.inc();
return Err(error.into());
}

// code block within reconfiguration lock
let certificate = {
Expand Down
11 changes: 8 additions & 3 deletions crates/sui-core/src/transaction_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ impl Inner {
&mut self,
input_key: InputKey,
update_cache: bool,
metrics: &Arc<AuthorityMetrics>,
) -> Vec<PendingCertificate> {
if update_cache {
self.available_objects_cache.insert(&input_key);
Expand Down Expand Up @@ -327,8 +328,12 @@ impl Inner {
)
});
for digest in digests.iter() {
let age_opt = input_txns.shift_remove(digest);
// The digest of the transaction must be inside the map.
assert!(input_txns.shift_remove(digest).is_some());
assert!(age_opt.is_some());
metrics
.transaction_manager_transaction_queue_age_s
.observe(age_opt.unwrap().elapsed().as_secs_f64());
}

if input_txns.is_empty() {
Expand Down Expand Up @@ -754,7 +759,7 @@ impl TransactionManager {

for input_key in input_keys {
trace!(?input_key, "object available");
for ready_cert in inner.try_acquire_lock(input_key, update_cache) {
for ready_cert in inner.try_acquire_lock(input_key, update_cache, &self.metrics) {
self.certificate_ready(inner, ready_cert);
}
}
Expand Down Expand Up @@ -805,7 +810,7 @@ impl TransactionManager {
"Certificate {:?} not found among readonly lock holders",
digest
);
for ready_cert in inner.try_acquire_lock(key, true) {
for ready_cert in inner.try_acquire_lock(key, true, &self.metrics) {
self.certificate_ready(&mut inner, ready_cert);
}
}
Expand Down

0 comments on commit 8c09795

Please sign in to comment.