From 49ad3009c32e14dac925d64055abfacd90528cce Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Wed, 9 Oct 2024 16:36:38 -0700 Subject: [PATCH 01/10] add log to verify state() hangs --- chain-signatures/node/src/mesh/connection.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/chain-signatures/node/src/mesh/connection.rs b/chain-signatures/node/src/mesh/connection.rs index ef6029ac3..4096b72a2 100644 --- a/chain-signatures/node/src/mesh/connection.rs +++ b/chain-signatures/node/src/mesh/connection.rs @@ -49,7 +49,17 @@ impl Pool { continue; }; + tracing::debug!( + "going to send /state call to {:?} url {} /state", + participant, + info.url + ); let Ok(resp) = self.http.get(url.clone()).send().await else { + tracing::debug!( + "finish send /state call to {:?} url {} /state", + participant, + info.url + ); tracing::warn!( "Pool.ping resp err participant {:?} url {}", participant, @@ -58,6 +68,12 @@ impl Pool { continue; }; + tracing::debug!( + "finish send /state call to {:?} url {} /state", + participant, + info.url + ); + let Ok(state): Result = resp.json().await else { tracing::warn!( "Pool.ping state view err participant {:?} url {}", From 7ae09922627f1e7b0868f8f919ca33993998998e Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Wed, 9 Oct 2024 17:18:35 -0700 Subject: [PATCH 02/10] more logs --- chain-signatures/node/src/protocol/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/chain-signatures/node/src/protocol/mod.rs b/chain-signatures/node/src/protocol/mod.rs index 73cea146e..954f6d2e6 100644 --- a/chain-signatures/node/src/protocol/mod.rs +++ b/chain-signatures/node/src/protocol/mod.rs @@ -284,10 +284,13 @@ impl MpcSignProtocol { } if last_pinged.elapsed() > Duration::from_millis(300) { + tracing::debug!("doing ping now"); self.ctx.mesh.ping().await; last_pinged = Instant::now(); } + tracing::debug!("finish ping now"); + let state = { let guard = self.state.read().await; guard.clone() From 0799a1e07f10804540b73045dc059c3cbe9484ea Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Wed, 9 Oct 2024 17:19:50 -0700 Subject: [PATCH 03/10] more logs --- chain-signatures/node/src/protocol/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/chain-signatures/node/src/protocol/mod.rs b/chain-signatures/node/src/protocol/mod.rs index 954f6d2e6..34b134f51 100644 --- a/chain-signatures/node/src/protocol/mod.rs +++ b/chain-signatures/node/src/protocol/mod.rs @@ -296,6 +296,7 @@ impl MpcSignProtocol { guard.clone() }; + tracing::debug!("finish reading state now"); let crypto_time = Instant::now(); let mut state = match state.progress(&mut self).await { Ok(state) => { From 75186f7575a99b8e73c30b2537cab3fbfbf2f244 Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Wed, 9 Oct 2024 17:24:00 -0700 Subject: [PATCH 04/10] log ping latency --- chain-signatures/node/src/protocol/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/chain-signatures/node/src/protocol/mod.rs b/chain-signatures/node/src/protocol/mod.rs index 34b134f51..ea1cd5455 100644 --- a/chain-signatures/node/src/protocol/mod.rs +++ b/chain-signatures/node/src/protocol/mod.rs @@ -285,7 +285,10 @@ impl MpcSignProtocol { if last_pinged.elapsed() > Duration::from_millis(300) { tracing::debug!("doing ping now"); + let start = Instant::now(); self.ctx.mesh.ping().await; + let ping_latency = start.elapsed().as_secs(); + tracing::debug!("ping latency was {ping_latency} seconds"); last_pinged = Instant::now(); } From e1739f521bf14cdc53be76b3c5583fac0386b234 Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Wed, 9 Oct 2024 17:51:25 -0700 Subject: [PATCH 05/10] add timeout to requests --- chain-signatures/node/src/cli.rs | 2 ++ chain-signatures/node/src/http_client.rs | 1 + chain-signatures/node/src/mesh/connection.rs | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/chain-signatures/node/src/cli.rs b/chain-signatures/node/src/cli.rs index c47d86604..c74f2b23c 100644 --- a/chain-signatures/node/src/cli.rs +++ b/chain-signatures/node/src/cli.rs @@ -8,6 +8,7 @@ use local_ip_address::local_ip; use near_account_id::AccountId; use near_crypto::{InMemorySigner, SecretKey}; use std::sync::Arc; +use std::time::Duration; use tokio::sync::{mpsc, RwLock}; use tracing_stackdriver::layer as stackdriver_layer; use tracing_subscriber::{layer::SubscriberExt, EnvFilter, Registry}; @@ -135,6 +136,7 @@ fn is_running_on_gcp() -> bool { let resp = reqwest::blocking::Client::new() .get("http://metadata.google.internal/computeMetadata/v1/instance/id") .header("Metadata-Flavor", "Google") + .timeout(Duration::from_secs(1)) .send(); match resp { diff --git a/chain-signatures/node/src/http_client.rs b/chain-signatures/node/src/http_client.rs index 40a80a5a9..17d7253db 100644 --- a/chain-signatures/node/src/http_client.rs +++ b/chain-signatures/node/src/http_client.rs @@ -46,6 +46,7 @@ async fn send_encrypted( .post(url.clone()) .header("content-type", "application/json") .json(&message) + .timeout(Duration::from_secs(1)) .send() .await .map_err(SendError::ReqwestClientError)?; diff --git a/chain-signatures/node/src/mesh/connection.rs b/chain-signatures/node/src/mesh/connection.rs index 4096b72a2..17c751d76 100644 --- a/chain-signatures/node/src/mesh/connection.rs +++ b/chain-signatures/node/src/mesh/connection.rs @@ -54,7 +54,7 @@ impl Pool { participant, info.url ); - let Ok(resp) = self.http.get(url.clone()).send().await else { + let Ok(resp) = self.http.get(url.clone()).timeout(Duration::from_secs(1)).send().await else { tracing::debug!( "finish send /state call to {:?} url {} /state", participant, @@ -109,7 +109,7 @@ impl Pool { continue; }; - let Ok(resp) = self.http.get(url).send().await else { + let Ok(resp) = self.http.get(url).timeout(Duration::from_secs(1)).send().await else { continue; }; From 7fe23d1e67da0aec8955aa6339deebef1cb6c62f Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Wed, 9 Oct 2024 17:52:29 -0700 Subject: [PATCH 06/10] increase timeout --- chain-signatures/node/src/http_client.rs | 2 +- chain-signatures/node/src/mesh/connection.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/chain-signatures/node/src/http_client.rs b/chain-signatures/node/src/http_client.rs index 17d7253db..406db32f9 100644 --- a/chain-signatures/node/src/http_client.rs +++ b/chain-signatures/node/src/http_client.rs @@ -46,7 +46,7 @@ async fn send_encrypted( .post(url.clone()) .header("content-type", "application/json") .json(&message) - .timeout(Duration::from_secs(1)) + .timeout(Duration::from_secs(5)) .send() .await .map_err(SendError::ReqwestClientError)?; diff --git a/chain-signatures/node/src/mesh/connection.rs b/chain-signatures/node/src/mesh/connection.rs index 17c751d76..20a578eb9 100644 --- a/chain-signatures/node/src/mesh/connection.rs +++ b/chain-signatures/node/src/mesh/connection.rs @@ -54,7 +54,7 @@ impl Pool { participant, info.url ); - let Ok(resp) = self.http.get(url.clone()).timeout(Duration::from_secs(1)).send().await else { + let Ok(resp) = self.http.get(url.clone()).timeout(Duration::from_secs(2)).send().await else { tracing::debug!( "finish send /state call to {:?} url {} /state", participant, @@ -109,7 +109,7 @@ impl Pool { continue; }; - let Ok(resp) = self.http.get(url).timeout(Duration::from_secs(1)).send().await else { + let Ok(resp) = self.http.get(url).timeout(Duration::from_secs(2)).send().await else { continue; }; From 2eb8859d699146dc8259b223130d4f2bdf3361d8 Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Wed, 9 Oct 2024 18:45:57 -0700 Subject: [PATCH 07/10] add counter of iter at start of loop --- chain-signatures/node/src/metrics.rs | 9 +++++++++ chain-signatures/node/src/protocol/mod.rs | 3 +++ 2 files changed, 12 insertions(+) diff --git a/chain-signatures/node/src/metrics.rs b/chain-signatures/node/src/metrics.rs index ed36d4547..ea7f14f90 100644 --- a/chain-signatures/node/src/metrics.rs +++ b/chain-signatures/node/src/metrics.rs @@ -389,6 +389,15 @@ pub(crate) static SIGNATURE_PUBLISH_FAILURES: Lazy = Lazy::new(|| { .unwrap() }); +pub(crate) static PROTOCOL_LATENCY_ITER_CNT: Lazy = Lazy::new(|| { + try_create_counter_vec( + "multichain_protocol_iter_count", + "Count of multichain protocol iter", + &["node_account_id"], + ) + .unwrap() +}); + pub(crate) static SIGNATURE_PUBLISH_RESPONSE_ERRORS: Lazy = Lazy::new(|| { try_create_counter_vec( "multichain_signature_publish_response_errors", diff --git a/chain-signatures/node/src/protocol/mod.rs b/chain-signatures/node/src/protocol/mod.rs index ea1cd5455..5ff7f8826 100644 --- a/chain-signatures/node/src/protocol/mod.rs +++ b/chain-signatures/node/src/protocol/mod.rs @@ -227,6 +227,9 @@ impl MpcSignProtocol { loop { let protocol_time = Instant::now(); tracing::debug!("trying to advance chain signatures protocol"); + crate::metrics::PROTOCOL_LATENCY_ITER_CNT + .with_label_values(&[my_account_id.as_str()]) + .inc(); loop { let msg_result = self.receiver.try_recv(); match msg_result { From 0bece6223b0cee648b1dba014a0ed69f03753d45 Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Wed, 9 Oct 2024 18:50:40 -0700 Subject: [PATCH 08/10] remove logging and add metric --- chain-signatures/node/src/mesh/connection.rs | 32 +++++++++----------- chain-signatures/node/src/metrics.rs | 12 ++++---- chain-signatures/node/src/protocol/mod.rs | 8 +---- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/chain-signatures/node/src/mesh/connection.rs b/chain-signatures/node/src/mesh/connection.rs index 20a578eb9..12098eeb8 100644 --- a/chain-signatures/node/src/mesh/connection.rs +++ b/chain-signatures/node/src/mesh/connection.rs @@ -49,17 +49,13 @@ impl Pool { continue; }; - tracing::debug!( - "going to send /state call to {:?} url {} /state", - participant, - info.url - ); - let Ok(resp) = self.http.get(url.clone()).timeout(Duration::from_secs(2)).send().await else { - tracing::debug!( - "finish send /state call to {:?} url {} /state", - participant, - info.url - ); + let Ok(resp) = self + .http + .get(url.clone()) + .timeout(Duration::from_secs(2)) + .send() + .await + else { tracing::warn!( "Pool.ping resp err participant {:?} url {}", participant, @@ -68,12 +64,6 @@ impl Pool { continue; }; - tracing::debug!( - "finish send /state call to {:?} url {} /state", - participant, - info.url - ); - let Ok(state): Result = resp.json().await else { tracing::warn!( "Pool.ping state view err participant {:?} url {}", @@ -109,7 +99,13 @@ impl Pool { continue; }; - let Ok(resp) = self.http.get(url).timeout(Duration::from_secs(2)).send().await else { + let Ok(resp) = self + .http + .get(url) + .timeout(Duration::from_secs(2)) + .send() + .await + else { continue; }; diff --git a/chain-signatures/node/src/metrics.rs b/chain-signatures/node/src/metrics.rs index ea7f14f90..57e71afce 100644 --- a/chain-signatures/node/src/metrics.rs +++ b/chain-signatures/node/src/metrics.rs @@ -389,19 +389,19 @@ pub(crate) static SIGNATURE_PUBLISH_FAILURES: Lazy = Lazy::new(|| { .unwrap() }); -pub(crate) static PROTOCOL_LATENCY_ITER_CNT: Lazy = Lazy::new(|| { +pub(crate) static SIGNATURE_PUBLISH_RESPONSE_ERRORS: Lazy = Lazy::new(|| { try_create_counter_vec( - "multichain_protocol_iter_count", - "Count of multichain protocol iter", + "multichain_signature_publish_response_errors", + "number of respond calls with response that cannot be converted to json", &["node_account_id"], ) .unwrap() }); -pub(crate) static SIGNATURE_PUBLISH_RESPONSE_ERRORS: Lazy = Lazy::new(|| { +pub(crate) static PROTOCOL_LATENCY_ITER_CNT: Lazy = Lazy::new(|| { try_create_counter_vec( - "multichain_signature_publish_response_errors", - "number of respond calls with response that cannot be converted to json", + "multichain_protocol_iter_count", + "Count of multichain protocol iter", &["node_account_id"], ) .unwrap() diff --git a/chain-signatures/node/src/protocol/mod.rs b/chain-signatures/node/src/protocol/mod.rs index 5ff7f8826..33fb58c50 100644 --- a/chain-signatures/node/src/protocol/mod.rs +++ b/chain-signatures/node/src/protocol/mod.rs @@ -230,6 +230,7 @@ impl MpcSignProtocol { crate::metrics::PROTOCOL_LATENCY_ITER_CNT .with_label_values(&[my_account_id.as_str()]) .inc(); + loop { let msg_result = self.receiver.try_recv(); match msg_result { @@ -287,22 +288,15 @@ impl MpcSignProtocol { } if last_pinged.elapsed() > Duration::from_millis(300) { - tracing::debug!("doing ping now"); - let start = Instant::now(); self.ctx.mesh.ping().await; - let ping_latency = start.elapsed().as_secs(); - tracing::debug!("ping latency was {ping_latency} seconds"); last_pinged = Instant::now(); } - tracing::debug!("finish ping now"); - let state = { let guard = self.state.read().await; guard.clone() }; - tracing::debug!("finish reading state now"); let crypto_time = Instant::now(); let mut state = match state.progress(&mut self).await { Ok(state) => { From b70a537ee84ccc656bf08b04d680edbc87498b3c Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Thu, 10 Oct 2024 10:17:32 -0700 Subject: [PATCH 09/10] change name --- chain-signatures/node/src/metrics.rs | 2 +- chain-signatures/node/src/protocol/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/chain-signatures/node/src/metrics.rs b/chain-signatures/node/src/metrics.rs index 57e71afce..b5f388171 100644 --- a/chain-signatures/node/src/metrics.rs +++ b/chain-signatures/node/src/metrics.rs @@ -398,7 +398,7 @@ pub(crate) static SIGNATURE_PUBLISH_RESPONSE_ERRORS: Lazy = Lazy::ne .unwrap() }); -pub(crate) static PROTOCOL_LATENCY_ITER_CNT: Lazy = Lazy::new(|| { +pub(crate) static PROTOCOL_ITER_CNT: Lazy = Lazy::new(|| { try_create_counter_vec( "multichain_protocol_iter_count", "Count of multichain protocol iter", diff --git a/chain-signatures/node/src/protocol/mod.rs b/chain-signatures/node/src/protocol/mod.rs index 33fb58c50..6960364c3 100644 --- a/chain-signatures/node/src/protocol/mod.rs +++ b/chain-signatures/node/src/protocol/mod.rs @@ -227,7 +227,7 @@ impl MpcSignProtocol { loop { let protocol_time = Instant::now(); tracing::debug!("trying to advance chain signatures protocol"); - crate::metrics::PROTOCOL_LATENCY_ITER_CNT + crate::metrics::PROTOCOL_ITER_CNT .with_label_values(&[my_account_id.as_str()]) .inc(); From d798f96f70a2800f7ff3f43901075236100a5ca2 Mon Sep 17 00:00:00 2001 From: Xiangyi Zheng Date: Thu, 10 Oct 2024 10:32:02 -0700 Subject: [PATCH 10/10] lower timeout --- chain-signatures/node/src/http_client.rs | 2 +- chain-signatures/node/src/mesh/connection.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/chain-signatures/node/src/http_client.rs b/chain-signatures/node/src/http_client.rs index 406db32f9..147392bc8 100644 --- a/chain-signatures/node/src/http_client.rs +++ b/chain-signatures/node/src/http_client.rs @@ -46,7 +46,7 @@ async fn send_encrypted( .post(url.clone()) .header("content-type", "application/json") .json(&message) - .timeout(Duration::from_secs(5)) + .timeout(Duration::from_secs(2)) .send() .await .map_err(SendError::ReqwestClientError)?; diff --git a/chain-signatures/node/src/mesh/connection.rs b/chain-signatures/node/src/mesh/connection.rs index 12098eeb8..4367102f1 100644 --- a/chain-signatures/node/src/mesh/connection.rs +++ b/chain-signatures/node/src/mesh/connection.rs @@ -52,7 +52,7 @@ impl Pool { let Ok(resp) = self .http .get(url.clone()) - .timeout(Duration::from_secs(2)) + .timeout(Duration::from_secs(1)) .send() .await else { @@ -102,7 +102,7 @@ impl Pool { let Ok(resp) = self .http .get(url) - .timeout(Duration::from_secs(2)) + .timeout(Duration::from_secs(1)) .send() .await else {