diff --git a/docs/source/procedures/datadog/datadog.rules.yml b/docs/source/procedures/datadog/datadog.rules.yml index 3e85b60ab..7bdc8b2c5 100644 --- a/docs/source/procedures/datadog/datadog.rules.yml +++ b/docs/source/procedures/datadog/datadog.rules.yml @@ -5,247 +5,206 @@ groups: expr: sum(scylla_storage_proxy_coordinator_read_latency_count{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_coordinator_read_count expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_count{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_coordinator_read_count expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_count{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name) labels: by: "dc" - level: "1" dd: "1" - record: scylla_coordinator_read_count expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_count{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, scheduling_group_name) labels: by: "instance" - level: "1" dd: "1" - record: scylla_total_requests_total expr: sum(scylla_transport_requests_served{}) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_requests expr: sum(rate(scylla_transport_requests_served{}[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_requests expr: sum(rate(scylla_transport_requests_served{}[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_total_requests expr: sum(rate(scylla_transport_requests_served{}[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_coordinator_write_count_total expr: sum(scylla_storage_proxy_coordinator_write_latency_count{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_coordinator_write_count expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_count{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_coordinator_write_count expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_count{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name) labels: by: "dc" - level: "1" dd: "1" - record: scylla_coordinator_write_count expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_count{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, scheduling_group_name) labels: by: "instance" - level: "1" dd: "1" - record: scylla_ag_cache_row_hits expr: sum(rate(scylla_cache_row_hits{}[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_ag_cache_row_hits expr: sum(rate(scylla_cache_row_hits{}[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_ag_cache_row_hits expr: sum(rate(scylla_cache_row_hits{}[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_ag_cache_row_misses expr: sum(rate(scylla_cache_row_misses{}[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_ag_cache_row_misses expr: sum(rate(scylla_cache_row_misses{}[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_ag_cache_row_misses expr: sum(rate(scylla_cache_row_misses{}[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_node_filesystem_avail_bytes expr: avg(node_filesystem_avail_bytes) by (cluster, mountpoint) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_filesystem_total_avail_bytes expr: sum(node_filesystem_avail_bytes) by (cluster, mountpoint) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_filesystem_avail_bytes expr: avg(node_filesystem_avail_bytes) by (cluster, mountpoint, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_node_filesystem_avail_bytes expr: avg(node_filesystem_avail_bytes) by (cluster, mountpoint, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_node_filesystem_size_bytes expr: avg(node_filesystem_size_bytes) by (cluster, mountpoint) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_filesystem_total_size_bytes expr: sum(node_filesystem_size_bytes) by (cluster, mountpoint) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_filesystem_out_of_space expr: count(min(node_filesystem_avail_bytes/node_filesystem_size_bytes) by (cluster, instance) < 0.1) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_filesystem_size_bytes expr: avg(node_filesystem_size_bytes) by (cluster, mountpoint, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_node_filesystem_size_bytes expr: avg(node_filesystem_size_bytes) by (cluster, mountpoint, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_node_network_receive_bytes_total expr: sum(node_network_receive_bytes_total) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_network_transmit_bytes_total expr: sum(node_network_transmit_bytes_total) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_disk_read_bytes expr: sum(rate(node_disk_read_bytes_total[2m])) by (cluster, device) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_disk_read_bytes_total expr: sum(node_disk_read_bytes_total) by (cluster, device) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_disk_written_bytes expr: sum(rate(node_disk_written_bytes_total[2m])) by (cluster, device) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_disk_written_bytes_total expr: sum(node_disk_written_bytes_total) by (cluster, device) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_scylladb_current_version_total expr: count(scylla_scylladb_current_version) by (cluster, version) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_ag_cache_bytes_used expr: avg(rate(scylla_cache_bytes_used{}[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_ag_cache_bytes_used expr: avg(rate(scylla_cache_bytes_used{}[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_ag_cache_bytes_used expr: avg(rate(scylla_cache_bytes_used{}[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_read_timeouts_ag expr: sum(rate(scylla_storage_proxy_coordinator_read_timeouts{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (scheduling_group_name, cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_read_timeouts_ag expr: sum(rate(scylla_storage_proxy_coordinator_read_timeouts{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (scheduling_group_name, cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_read_timeouts_total expr: sum(scylla_storage_proxy_coordinator_read_timeouts{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_reactor_utilization_ag expr: avg(scylla_reactor_utilization{}) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_reactor_utilization_ag expr: avg(scylla_reactor_utilization{}) by (cluster) @@ -256,313 +215,261 @@ groups: expr: sum(rate(scylla_storage_proxy_coordinator_read_unavailable{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (scheduling_group_name, cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_read_unavailable_ag expr: sum(rate(scylla_storage_proxy_coordinator_read_unavailable{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (scheduling_group_name, cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_read_unavailable_total expr: sum(scylla_storage_proxy_coordinator_read_unavailable{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_write_timeouts_ag expr: sum(rate(scylla_storage_proxy_coordinator_write_timeouts{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (scheduling_group_name, cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_write_timeouts_ag expr: sum(rate(scylla_storage_proxy_coordinator_write_timeouts{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (scheduling_group_name, cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_write_timeouts_total expr: sum(scylla_storage_proxy_coordinator_write_timeouts{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_write_unavailable_ag expr: sum(rate(scylla_storage_proxy_coordinator_write_unavailable{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_write_unavailable_ag expr: sum(rate(scylla_storage_proxy_coordinator_write_unavailable{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_write_unavailable_total expr: sum(scylla_storage_proxy_coordinator_write_unavailable{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: node_network_receive_packets expr: sum(rate(node_network_receive_packets_total{}[2m])) by (device,cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: node_network_receive_packets expr: sum(rate(node_network_receive_packets_total{}[2m])) by (device,cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: node_network_receive_packets expr: sum(rate(node_network_receive_packets_total{}[2m])) by (device,cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_node_network_receive_packets_total expr: sum(node_network_receive_packets_total) by (device,cluster) labels: by: "cluster" - level: "1" dd: "1" - record: node_network_transmit_packets expr: sum(rate(node_network_transmit_packets_total{}[2m])) by (device,cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: node_network_transmit_packets expr: sum(rate(node_network_transmit_packets_total{}[2m])) by (device,cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_node_network_transmit_packets_total expr: sum(node_network_transmit_packets_total) by (device,cluster) labels: by: "cluster" - level: "1" dd: "1" - record: node_network_receive_bytes expr: sum(rate(node_network_receive_bytes_total{}[2m])) by (device,cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: node_network_receive_bytes expr: sum(rate(node_network_receive_bytes_total{}[2m])) by (device,cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: node_network_receive_bytes expr: sum(rate(node_network_receive_bytes_total{}[2m])) by (device,cluster) labels: by: "cluster" - level: "1" dd: "1" - record: node_network_transmit_bytes expr: sum(rate(node_network_transmit_bytes_total{}[2m])) by (device,cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: node_network_transmit_bytes expr: sum(rate(node_network_transmit_bytes_total{}[2m])) by (device,cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: node_network_transmit_bytes expr: sum(rate(node_network_transmit_bytes_total{}[2m])) by (device,cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_connection expr: sum(scylla_transport_current_connections) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_nodes expr: count(scylla_scylladb_current_version{job="scylla"}) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_unreachable_nodes expr: count(scrape_samples_scraped{job="scylla"}==0) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_joining_nodes expr: count(scylla_node_operation_mode<3) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_leaving_nodes expr: count(scylla_node_operation_mode>3) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_manager_tasks expr: sum(scylla_manager_task_active_count{type=~"repair|backup"}) by (cluster, type) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_compactios expr: sum(scylla_compaction_manager_completed_compactions) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_background_writes_ag expr: sum(scylla_storage_proxy_coordinator_background_writes{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster, scheduling_group_name) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_background_writes_ag expr: sum(scylla_storage_proxy_coordinator_background_writes{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster, dc, scheduling_group_name) labels: by: "dc" - level: "1" dd: "1" - record: scylla_storage_proxy_coordinator_background_writes_ag expr: sum(scylla_storage_proxy_coordinator_background_writes{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}) by (cluster, dc, instance, scheduling_group_name) labels: by: "instance" - level: "1" dd: "1" - record: scylla_hints_manager_written_ag expr: sum(rate(scylla_hints_manager_written[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_hints_manager_written_ag expr: sum(rate(scylla_hints_manager_written[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_hints_manager_written_ag expr: sum(rate(scylla_hints_manager_written[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_hints_manager_sent_ag expr: sum(rate(scylla_hints_manager_sent[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_hints_manager_sent_ag expr: sum(rate(scylla_hints_manager_sent[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_hints_manager_sent_ag expr: sum(rate(scylla_hints_manager_sent[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_database_total_writes_failed_ag expr: sum(rate(scylla_database_total_writes_failed[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_database_total_writes_failed_ag expr: sum(rate(scylla_database_total_writes_failed[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_database_total_writes_failed_ag expr: sum(rate(scylla_database_total_writes_failed[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_database_total_writes_timedout_ag expr: sum(rate(scylla_database_total_writes_timedout[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_database_total_writes_timedout_ag expr: sum(rate(scylla_database_total_writes_timedout[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_database_total_writes_timedout_ag expr: sum(rate(scylla_database_total_writes_timedout[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_database_total_reads_failed_ag expr: sum(rate(scylla_database_total_reads_failed{class="user"}[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_database_total_reads_failed_ag expr: sum(rate(scylla_database_total_reads_failed{class="user"}[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_database_total_reads_failed_ag expr: sum(rate(scylla_database_total_reads_failed{class="user"}[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_database_total_reads_rate_limited_ag expr: sum(rate(scylla_database_total_reads_rate_limited[60s])) by (cluster, dc, instance) labels: by: "instance" - level: "1" dd: "1" - record: scylla_database_total_reads_rate_limited_ag expr: sum(rate(scylla_database_total_reads_rate_limited[60s])) by (cluster, dc) labels: by: "dc" - level: "1" dd: "1" - record: scylla_database_total_reads_rate_limited_ag expr: sum(rate(scylla_database_total_reads_rate_limited[60s])) by (cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_manager_last_success_ts expr: max(scylla_manager_scheduler_last_success{}) by(cluster, type) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_nodes_uptime_ts expr: min(node_boot_time_seconds) by(cluster) labels: by: "cluster" - level: "1" dd: "1" - record: scylla_total_cores expr: count(scylla_reactor_utilization{}) by (cluster) diff --git a/prometheus/prom_rules/prometheus.latency.rules.yml b/prometheus/prom_rules/prometheus.latency.rules.yml index 62b3f7032..0224037ea 100644 --- a/prometheus/prom_rules/prometheus.latency.rules.yml +++ b/prometheus/prom_rules/prometheus.latency.rules.yml @@ -48,7 +48,6 @@ groups: - record: scylla_manager_repair_progress expr: sum(manager:repair_progress) by (cluster) labels: - level: "1" dd: "1" by: "cluster" - record: manager:backup_progress @@ -56,7 +55,6 @@ groups: - record: scylla_manager_backup_progress expr: sum(manager:backup_progress) by (cluster) labels: - level: "1" dd: "1" by: "cluster" - record: wlatencyp99 @@ -69,19 +67,16 @@ groups: expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) labels: by: "instance" - level: "1" dd: "1" - record: wlatencyp99 expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name, le)) labels: by: "dc" - level: "1" dd: "1" - record: wlatencyp99 expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name, le)) labels: by: "cluster" - level: "1" dd: "1" - record: rlatencyp99 expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{shard=~".+", scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) @@ -93,19 +88,16 @@ groups: expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) labels: by: "instance" - level: "1" dd: "1" - record: rlatencyp99 expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name, le)) labels: by: "dc" - level: "1" dd: "1" - record: rlatencyp99 expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name, le)) labels: by: "cluster" - level: "1" dd: "1" - record: wlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{shard=~".+", scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) @@ -117,19 +109,16 @@ groups: expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) labels: by: "instance" - level: "1" dd: "1" - record: wlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name, le)) labels: by: "dc" - level: "1" dd: "1" - record: wlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name, le)) labels: by: "cluster" - level: "1" dd: "1" - record: rlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{shard=~".+", scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) @@ -141,19 +130,16 @@ groups: expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) labels: by: "instance" - level: "1" dd: "1" - record: rlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name, le)) labels: by: "dc" - level: "1" dd: "1" - record: rlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name, le)) labels: by: "cluster" - level: "1" dd: "1" - record: wlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{shard=~".+", scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) @@ -165,19 +151,16 @@ groups: expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) labels: by: "instance" - level: "1" dd: "1" - record: wlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name, le)) labels: by: "dc" - level: "1" dd: "1" - record: wlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name, le)) labels: by: "cluster" - level: "1" dd: "1" - record: rlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{shard=~".+", scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) @@ -189,19 +172,16 @@ groups: expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) labels: by: "instance" - level: "1" dd: "1" - record: rlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, scheduling_group_name, le)) labels: by: "dc" - level: "1" dd: "1" - record: rlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, scheduling_group_name, le)) labels: by: "cluster" - level: "1" dd: "1" - record: wlatencyp99 expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{shard=~".+", scheduling_group_name=~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) @@ -310,19 +290,16 @@ groups: expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, le, scheduling_group_name)) labels: by: "instance" - level: "1" dd: "1" - record: casrlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, le, scheduling_group_name)) labels: by: "dc" - level: "1" dd: "1" - record: casrlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, le, scheduling_group_name)) labels: by: "cluster" - level: "1" dd: "1" - record: caswlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{shard=~".+", scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, le, scheduling_group_name)) @@ -334,19 +311,16 @@ groups: expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, le, scheduling_group_name)) labels: by: "instance" - level: "1" dd: "1" - record: caswlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, le, scheduling_group_name)) labels: by: "dc" - level: "1" dd: "1" - record: caswlatencyp95 expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, le, scheduling_group_name)) labels: by: "cluster" - level: "1" dd: "1" - record: casrlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{shard=~".+", scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, le, scheduling_group_name)) @@ -358,19 +332,16 @@ groups: expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, le, scheduling_group_name)) labels: by: "instance" - level: "1" dd: "1" - record: casrlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, le, scheduling_group_name)) labels: by: "dc" - level: "1" dd: "1" - record: casrlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, le, scheduling_group_name)) labels: by: "cluster" - level: "1" dd: "1" - record: caswlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{shard=~".+", scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, shard, le, scheduling_group_name)) @@ -382,19 +353,16 @@ groups: expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, instance, le, scheduling_group_name)) labels: by: "instance" - level: "1" dd: "1" - record: caswlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, dc, le, scheduling_group_name)) labels: by: "dc" - level: "1" dd: "1" - record: caswlatencya expr: histogram_quantile(0.5, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{scheduling_group_name!~"atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache"}[60s])) by (cluster, le, scheduling_group_name)) labels: by: "cluster" - level: "1" dd: "1" - record: all_scheduling_group expr: sum by (cluster, scheduling_group_name) (scylla_storage_proxy_coordinator_write_latency_count + scylla_storage_proxy_coordinator_read_latency_count) > 0 diff --git a/prometheus/prometheus.yml.template b/prometheus/prometheus.yml.template index 2b077c1dc..d8ada941e 100644 --- a/prometheus/prometheus.yml.template +++ b/prometheus/prometheus.yml.template @@ -124,26 +124,14 @@ scrape_configs: regex: '(.latency..?.?|cas.latency..?.?)' target_label: by replacement: 'instance,shard' - - source_labels: [__name__] - regex: '(scylla_storage_proxy_coordinator_read_timeouts|scylla_reactor_utilization|scylla_storage_proxy_coordinator_read_timeouts|scylla_storage_proxy_coordinator_read_unavailable|scylla_storage_proxy_coordinator_write_timeouts|scylla_storage_proxy_coordinator_write_unavailable|.latency..?.?)' - replacement: '2' - target_label: level - source_labels: [__name__] regex: '(scylla_storage_proxy_coordinator_read_timeouts|scylla_reactor_utilization|scylla_storage_proxy_coordinator_read_timeouts|scylla_storage_proxy_coordinator_read_unavailable|scylla_storage_proxy_coordinator_write_timeouts|scylla_storage_proxy_coordinator_write_unavailable|.latency..?.?)' replacement: '2' target_label: dd - - source_labels: [__name__] - regex: '(scylla_node_operation_mode)' - replacement: '1' - target_label: level - source_labels: [__name__] regex: '(scylla_node_operation_mode)' replacement: '1' target_label: dd - - source_labels: [scheduling_group_name] - regex: '(atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache)' - replacement: '' - target_label: level - source_labels: [scheduling_group_name] regex: '(atexit|gossip|mem_compaction|memtable|streaming|background_reclaim|compaction|main|memtable_to_cache)' replacement: '' @@ -172,7 +160,7 @@ scrape_configs: metric_relabel_configs: - source_labels: [__name__] regex: '(node_filesystem_size_bytes|node_filesystem_avail_bytes|node_network_receive_packets_total|node_network_transmit_packets_total|node_network_receive_bytes_total|node_network_transmit_bytes_total)' - target_label: level + target_label: dd replacement: '1' - job_name: manager_agent @@ -205,7 +193,7 @@ scrape_configs: target_label: instance - source_labels: [__name__] regex: '(scylla_manager_scheduler_last_duration)' - target_label: level + target_label: dd replacement: '1' - source_labels: [__name__] regex: '(scylla_manager_scheduler_last_duration)'