Skip to content

Commit

Permalink
feature(nemesis): introduce lock for target selection
Browse files Browse the repository at this point in the history
We ran into multiple cases, when running nemeses in parallel,
where more than one nemesis was using the same node at the same time.

We are introducing a lock around the selection of target nodes,
so that the same node can't be picked by more than one nemesis at a time.

Fixes: #6553
  • Loading branch information
fruch committed Dec 26, 2023
1 parent 751e689 commit 7954f29
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 28 deletions.
3 changes: 1 addition & 2 deletions sdcm/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -4335,8 +4335,7 @@ def _rotate_kms_key(kms_key_alias_name, kms_key_rotation_interval, db_cluster):
message=f"Failed to rotate AWS KMS key for the '{kms_key_alias_name}' alias",
traceback=traceback.format_exc()).publish()
try:
target_node = [node for node in db_cluster.nodes if not node.running_nemesis][0]
with run_nemesis(node=target_node, nemesis_name="KMS encryption check"):
with run_nemesis(nodes=db_cluster.nodes, nemesis_name="KMS encryption check") as target_node:
ks_cf = db_cluster.get_non_system_ks_cf_list(db_node=target_node, filter_out_mv=True)[0]
sstable_util = SstableUtils(db_node=target_node, ks_cf=ks_cf)
encryption_results = sstable_util.is_sstable_encrypted()
Expand Down
49 changes: 26 additions & 23 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@
"disrupt_terminate_kubernetes_host_then_decommission_and_add_scylla_node",
)

NEMESIS_TARGET_SELECTION_LOCK = Lock()


class DefaultValue: # pylint: disable=too-few-public-methods
"""
Expand Down Expand Up @@ -318,12 +320,14 @@ def publish_event(self, disrupt, status=True, data=None):
DisruptionEvent(nemesis_name=disrupt, severity=severity, **data).publish()

def set_current_running_nemesis(self, node):
node.running_nemesis = self.current_disruption
with NEMESIS_TARGET_SELECTION_LOCK:
node.running_nemesis = self.current_disruption

@staticmethod
def unset_current_running_nemesis(node):
if node is not None:
node.running_nemesis = None
with NEMESIS_TARGET_SELECTION_LOCK:
node.running_nemesis = None

def _get_target_nodes(
self,
Expand Down Expand Up @@ -363,21 +367,22 @@ def set_target_node(self, dc_idx: Optional[int] = None, rack: Optional[int] = No
if is_seed is DefaultValue - if self.filter_seed is True it act as if is_seed=False,
otherwise it will act as if is_seed is None
"""
self.unset_current_running_nemesis(self.target_node)
nodes = self._get_target_nodes(is_seed=is_seed, dc_idx=dc_idx, rack=rack)
if not nodes:
dc_str = '' if dc_idx is None else f'dc {dc_idx} '
rack_str = '' if rack is None else f'rack {rack} '
raise UnsupportedNemesis(
f"Can't allocate node from {dc_str}{rack_str}to run nemesis on")
if allow_only_last_node_in_rack:
self.target_node = nodes[-1]
else:
self.target_node = random.choice(nodes)
with NEMESIS_TARGET_SELECTION_LOCK:
self.unset_current_running_nemesis(self.target_node)
nodes = self._get_target_nodes(is_seed=is_seed, dc_idx=dc_idx, rack=rack)
if not nodes:
dc_str = '' if dc_idx is None else f'dc {dc_idx} '
rack_str = '' if rack is None else f'rack {rack} '
raise UnsupportedNemesis(
f"Can't allocate node from {dc_str}{rack_str}to run nemesis on")
if allow_only_last_node_in_rack:
self.target_node = nodes[-1]
else:
self.target_node = random.choice(nodes)

self.set_current_running_nemesis(node=self.target_node)
self.log.info('Current Target: %s with running nemesis: %s',
self.target_node, self.target_node.running_nemesis)
self.target_node.running_nemesis = self.current_disruption
self.log.info('Current Target: %s with running nemesis: %s',
self.target_node, self.target_node.running_nemesis)

@raise_event_on_failure
def run(self, interval=None, cycles_count: int = -1):
Expand Down Expand Up @@ -1107,11 +1112,7 @@ def get_disrupt_name(self):
def get_class_name(self):
return self.__class__.__name__.replace('Monkey', '')

def _set_current_disruption(self, label=None, node=None):
self.target_node = node if node else self.target_node

if not label:
label = "%s on target node %s" % (self.__class__.__name__, self.target_node)
def set_current_disruption(self, label=None):
self.log.debug('Set current_disruption -> %s', label)
self.current_disruption = label

Expand Down Expand Up @@ -5030,8 +5031,10 @@ def wrapper(*args, **kwargs): # pylint: disable=too-many-statements
# NOTE: exclusive nemesis will wait before the end of all other ones
time.sleep(10)

args[0].set_target_node()
args[0].current_disruption = "".join(p.capitalize() for p in method_name.replace("disrupt_", "").split("_"))
args[0].set_current_disruption(f"{args[0].current_disruption}")
args[0].set_target_node()

args[0].cluster.check_cluster_health()
num_nodes_before = len(args[0].cluster.nodes)
start_time = time.time()
Expand All @@ -5042,7 +5045,7 @@ def wrapper(*args, **kwargs): # pylint: disable=too-many-statements
result = None
status = True
# pylint: disable=protected-access
args[0]._set_current_disruption(f"{args[0].current_disruption} {args[0].target_node}")

args[0].set_current_running_nemesis(node=args[0].target_node)
log_info = {
'operation': args[0].current_disruption,
Expand Down
11 changes: 8 additions & 3 deletions sdcm/utils/context_managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import os
from contextlib import contextmanager

from sdcm.nemesis import NEMESIS_TARGET_SELECTION_LOCK


@contextmanager
def environment(**kwargs):
Expand Down Expand Up @@ -46,9 +48,12 @@ def nodetool_context(node, start_command, end_command):


@contextmanager
def run_nemesis(node: 'BaseNode', nemesis_name: str):
node.running_nemesis = nemesis_name
def run_nemesis(nodes: list['BaseNode'], nemesis_name: str):
with NEMESIS_TARGET_SELECTION_LOCK:
node = [node for node in nodes if not node.running_nemesis][0]
node.running_nemesis = nemesis_name
try:
yield node
finally:
node.running_nemesis = None
with NEMESIS_TARGET_SELECTION_LOCK:
node.running_nemesis = None

0 comments on commit 7954f29

Please sign in to comment.