Skip to content

Commit

Permalink
FEAT: hardware metrics (#876)
Browse files Browse the repository at this point in the history
* added 2 dev nodes and cleaned up some actions workflow files

* added 12th node and updated some workflows

* pretending to be a rust dev

* adding account ID to metric labels

* added a 5 sec timer to metrics refresh, added total disk space

* added logging config to partner terraform

* Made label adjustments, recalculated CPU usage and disk space

* removed Total memory metric and changed testnet watchtower delay

* added environment flag to workflow file for run restrictions

* name metrics better

* fix tests

* added loop metric

* formatting

* deleted redundant line
  • Loading branch information
kmaus-near authored Oct 11, 2024
1 parent 3715df1 commit 9b495c5
Show file tree
Hide file tree
Showing 7 changed files with 243 additions and 5 deletions.
1 change: 1 addition & 0 deletions .github/workflows/multichain-update-prod-nodes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ on:
jobs:
build-mpc-recovery:
runs-on: ubuntu-latest
environment: prod
steps:
- uses: actions/checkout@v3
name: "Checkout mpc-recovery"
Expand Down
122 changes: 119 additions & 3 deletions chain-signatures/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions chain-signatures/node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,4 @@ itertools = "0.12.0"
http = "1.1.0"
prometheus = { version = "0.13.3" }
once_cell = "1.13.1"
sysinfo = "0.32.0"
50 changes: 50 additions & 0 deletions chain-signatures/node/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,56 @@ pub(crate) static SIGNATURE_PUBLISH_FAILURES: Lazy<CounterVec> = Lazy::new(|| {
.unwrap()
});

/// Integer gauge vector registered as `multichain_cpu_usage_percentage`
/// (help text: "CPU Usage Percentage").
///
/// Label names, in order: `global`, `node_account_id`. The `unwrap` makes the
/// first access panic if `try_create_int_gauge_vec` does not succeed.
pub(crate) static CPU_USAGE_PERCENTAGE: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["global", "node_account_id"];
    let created = try_create_int_gauge_vec(
        "multichain_cpu_usage_percentage",
        "CPU Usage Percentage",
        &label_names,
    );
    created.unwrap()
});

/// Integer gauge vector registered as `multichain_available_memory_bytes`
/// (help text: "Available Memory in Bytes").
///
/// Label names, in order: `available_mem`, `node_account_id`. The `unwrap`
/// makes the first access panic if `try_create_int_gauge_vec` does not succeed.
pub(crate) static AVAILABLE_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["available_mem", "node_account_id"];
    let created = try_create_int_gauge_vec(
        "multichain_available_memory_bytes",
        "Available Memory in Bytes",
        &label_names,
    );
    created.unwrap()
});

/// Integer gauge vector registered as `multichain_used_memory_bytes`
/// (help text: "Used Memory in Bytes").
///
/// Label names, in order: `used`, `node_account_id`. The `unwrap` makes the
/// first access panic if `try_create_int_gauge_vec` does not succeed.
pub(crate) static USED_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["used", "node_account_id"];
    let created = try_create_int_gauge_vec(
        "multichain_used_memory_bytes",
        "Used Memory in Bytes",
        &label_names,
    );
    created.unwrap()
});

/// Integer gauge vector registered as `multichain_available_disk_space_bytes`
/// (help text: "Available Disk Space in Bytes").
///
/// Label names, in order: `available_disk`, `node_account_id`. The `unwrap`
/// makes the first access panic if `try_create_int_gauge_vec` does not succeed.
pub(crate) static AVAILABLE_DISK_SPACE_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["available_disk", "node_account_id"];
    let created = try_create_int_gauge_vec(
        "multichain_available_disk_space_bytes",
        "Available Disk Space in Bytes",
        &label_names,
    );
    created.unwrap()
});

/// Integer gauge vector registered as `multichain_total_disk_space_bytes`
/// (help text: "Total Disk Space in Bytes").
///
/// Label names, in order: `total_disk`, `node_account_id`. The `unwrap` makes
/// the first access panic if `try_create_int_gauge_vec` does not succeed.
pub(crate) static TOTAL_DISK_SPACE_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["total_disk", "node_account_id"];
    let created = try_create_int_gauge_vec(
        "multichain_total_disk_space_bytes",
        "Total Disk Space in Bytes",
        &label_names,
    );
    created.unwrap()
});

pub(crate) static SIGNATURE_PUBLISH_RESPONSE_ERRORS: Lazy<CounterVec> = Lazy::new(|| {
try_create_counter_vec(
"multichain_signature_publish_response_errors",
Expand Down
62 changes: 61 additions & 1 deletion chain-signatures/node/src/protocol/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ pub use message::MpcMessage;
pub use signature::SignQueue;
pub use signature::SignRequest;
pub use state::NodeState;
pub use sysinfo::{Components, CpuRefreshKind, Disks, RefreshKind, System};

use self::consensus::ConsensusCtx;
use self::cryptography::CryptographicCtx;
Expand All @@ -34,6 +35,7 @@ use cait_sith::protocol::Participant;
use near_account_id::AccountId;
use near_crypto::InMemorySigner;
use reqwest::IntoUrl;
use std::path::Path;
use std::time::Instant;
use std::{sync::Arc, time::Duration};
use tokio::sync::mpsc::{self, error::TryRecvError};
Expand Down Expand Up @@ -212,6 +214,7 @@ impl MpcSignProtocol {
let mut queue = MpcMessageQueue::default();
let mut last_state_update = Instant::now();
let mut last_config_update = Instant::now();
let last_hardware_pull = Instant::now();
let mut last_pinged = Instant::now();

// Sets the latest configurations from the contract:
Expand All @@ -227,10 +230,14 @@ impl MpcSignProtocol {
loop {
let protocol_time = Instant::now();
tracing::debug!("trying to advance chain signatures protocol");
// Hardware metric refresh
if last_hardware_pull.elapsed() > Duration::from_secs(5) {
update_system_metrics(&my_account_id);
}

crate::metrics::PROTOCOL_ITER_CNT
.with_label_values(&[my_account_id.as_str()])
.inc();

loop {
let msg_result = self.receiver.try_recv();
match msg_result {
Expand Down Expand Up @@ -385,3 +392,56 @@ fn node_version() -> i64 {
};
(rc_num + version.patch * 1000 + version.minor * 1000000 + version.major * 1000000000) as i64
}

fn update_system_metrics(node_account_id: &str) {
let mut system = System::new_all();

// Refresh only the necessary components
system.refresh_all();

let mut s =
System::new_with_specifics(RefreshKind::new().with_cpu(CpuRefreshKind::everything()));
// Wait a bit because CPU usage is based on diff.
std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL);
// Refresh CPUs again to get actual value.
s.refresh_cpu_specifics(CpuRefreshKind::everything());

// Update CPU usage metric
let cpu_usage = s.global_cpu_usage() as i64;
crate::metrics::CPU_USAGE_PERCENTAGE
.with_label_values(&["global", node_account_id])
.set(cpu_usage);

// Update available memory metric
let available_memory = system.available_memory() as i64;
crate::metrics::AVAILABLE_MEMORY_BYTES
.with_label_values(&["available_mem", node_account_id])
.set(available_memory);

// Update used memory metric
let used_memory = system.used_memory() as i64;
crate::metrics::USED_MEMORY_BYTES
.with_label_values(&["used", node_account_id])
.set(used_memory);

let root_mount_point = Path::new("/");
// Update available disk space metric
let available_disk_space = Disks::new_with_refreshed_list()
.iter()
.find(|d| d.mount_point() == root_mount_point)
.expect("No disk found mounted at '/'")
.available_space() as i64;
crate::metrics::AVAILABLE_DISK_SPACE_BYTES
.with_label_values(&["available_disk", node_account_id])
.set(available_disk_space);

// Update total disk space metric
let total_disk_space = Disks::new_with_refreshed_list()
.iter()
.find(|d| d.mount_point() == root_mount_point)
.expect("No disk found mounted at '/'")
.total_space() as i64;
crate::metrics::TOTAL_DISK_SPACE_BYTES
.with_label_values(&["total_disk", node_account_id])
.set(total_disk_space);
}
5 changes: 5 additions & 0 deletions infra/partner-mainnet/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ provider "google" {
provider "google-beta" {
project = var.project_id
}

# Sets the project-level Compute Engine metadata item "google-logging-enabled" to "true".
# NOTE(review): presumably this enables log collection on the node VMs in this
# project; confirm against the GCP documentation for this metadata key.
resource "google_compute_project_metadata_item" "project_logging" {
key = "google-logging-enabled"
value = "true"
}
module "gce-container" {
count = length(var.node_configs)
source = "terraform-google-modules/container-vm/google"
Expand Down
7 changes: 6 additions & 1 deletion infra/partner-testnet/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ provider "google" {
provider "google-beta" {
project = var.project_id
}

# Sets the project-level Compute Engine metadata item "google-logging-enabled" to "true".
# NOTE(review): presumably this enables log collection on the node VMs in this
# project; confirm against the GCP documentation for this metadata key.
resource "google_compute_project_metadata_item" "project_logging" {
key = "google-logging-enabled"
value = "true"
}
module "gce-container" {
count = length(var.node_configs)
source = "terraform-google-modules/container-vm/google"
Expand Down Expand Up @@ -103,7 +108,7 @@ module "ig_template" {
source_image_project = "cos-cloud"
machine_type = "n2d-standard-2"

startup_script = "docker rm watchtower ; docker run -d --name watchtower -v /var/run/docker.sock:/var/run/docker.sock containrrr/watchtower --debug --interval 3600"
startup_script = "docker rm watchtower ; docker run -d --name watchtower -v /var/run/docker.sock:/var/run/docker.sock containrrr/watchtower --debug --interval 30"

source_image = reverse(split("/", module.gce-container[count.index].source_image))[0]
metadata = merge(var.additional_metadata, { "gce-container-declaration" = module.gce-container["${count.index}"].metadata_value })
Expand Down

0 comments on commit 9b495c5

Please sign in to comment.