Skip to content

Commit

Permalink
feature(sct-runner): collect runner metrics using node_exporter
Browse files Browse the repository at this point in the history
* create a new version of sct-runner with node exporter baked in
* set it up on the monitoring stack

This would help us collect data about what's going on with the
sct-runner, and help us pinpoint issues with the test code
or the need to pick bigger instances/disks for some tests.
  • Loading branch information
fruch committed Jan 17, 2024
1 parent 63d11c1 commit d240eb0
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 5 deletions.
2 changes: 1 addition & 1 deletion sct.py
Original file line number Diff line number Diff line change
Expand Up @@ -1535,7 +1535,7 @@ def configure_aws_peering(regions):
@click.option("-r", "--region", required=True, type=CloudRegion(), help="Cloud region")
@click.option("-z", "--availability-zone", default="", type=str, help="Name of availability zone, ex. 'a'")
def create_runner_image(cloud_provider, region, availability_zone):
if cloud_provider == "aws" and availability_zone != "":
if cloud_provider == "aws":
assert len(availability_zone) == 1, f"Invalid AZ: {availability_zone}, availability-zone is one-letter a-z."
add_file_logger()
sct_runner = get_sct_runner(cloud_provider=cloud_provider, region_name=region, availability_zone=availability_zone)
Expand Down
2 changes: 2 additions & 0 deletions sdcm/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
ComparableScyllaVersion,
SCYLLA_VERSION_RE,
)
from sdcm.utils.net import get_my_ip
from sdcm.utils.node import build_node_api_command
from sdcm.sct_events import Severity
from sdcm.sct_events.base import LogEvent, add_severity_limit_rules, max_severity
Expand Down Expand Up @@ -5398,6 +5399,7 @@ def reconfigure_scylla_monitoring(self):

with node._remote_yaml(f'{self.monitoring_conf_dir}/node_exporter_servers.yml') as exporter_yaml: # pylint: disable=protected-access
exporter_yaml[0]['targets'] += [f'{normalize_ipv6_url(node.private_ip_address)}:9100']
exporter_yaml[0]['targets'] += [f'{normalize_ipv6_url(get_my_ip())}:9100']
exporter_yaml[0]['targets'] = list(set(exporter_yaml[0]['targets'])) # remove duplicates

if self.params.get("cloud_prom_bearer_token"):
Expand Down
9 changes: 6 additions & 3 deletions sdcm/node_exporter_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@

class NodeExporterSetup: # pylint: disable=too-few-public-methods
@staticmethod
def install(node):
node.install_package('wget')
node.remoter.sudo(shell_script_cmd(f"""
def install(node: "BaseNode | None" = None, remoter: "Remoter | None" = None):
assert node or remoter, "node or remoter much be pass to this function"
if node:
node.install_package('wget')
remoter = node.remoter
remoter.sudo(shell_script_cmd(f"""
if ! id node_exporter > /dev/null 2>&1; then
useradd -rs /bin/false node_exporter
fi
Expand Down
7 changes: 6 additions & 1 deletion sdcm/sct_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
from sdcm.utils.azure_region import AzureOsState, AzureRegion, region_name_to_location
from sdcm.utils.context_managers import environment
from sdcm.test_config import TestConfig
from sdcm.node_exporter_setup import NodeExporterSetup

if TYPE_CHECKING:
# pylint: disable=ungrouped-imports
Expand Down Expand Up @@ -129,7 +130,7 @@ def terminate(self) -> None:

class SctRunner(ABC):
"""Provision and configure the SCT runner."""
VERSION = "1.6" # Version of the Image
VERSION = "1.7" # Version of the Image
NODE_TYPE = "sct-runner"
RUNNER_NAME = "SCT-Runner"
LOGIN_USER = "ubuntu"
Expand Down Expand Up @@ -242,6 +243,10 @@ def install_prereqs(self, public_ip: str, connect_timeout: Optional[int] = None)
# Jenkins pipelines run /bin/sh for some reason.
ln -sf /bin/bash /bin/sh
"""), ignore_status=True)

node_exporter_setup = NodeExporterSetup()
node_exporter_setup.install(remoter=remoter)

remoter.stop()
if result.ok:
LOGGER.info("All packages successfully installed.")
Expand Down

0 comments on commit d240eb0

Please sign in to comment.