Skip to content

Commit

Permalink
fix(RAFT): fix wrong raft.is_enabled usage
Browse files Browse the repository at this point in the history
the whole idea of putting RAFT into separate module is to remove
hundreds of spaghetti lines like:
    if not current_node.raft.is_enabled

and put Raft helper functions under
node.raft.<HELPER> to avoid hundreds of helpers imports

it can be exceptions in case of using raft.is_enabled as part of a complex 'if'
when code logic depends not only on the Raft feature
  • Loading branch information
temichus authored and soyacz committed Oct 25, 2024
1 parent ee40cd4 commit 6a3ca71
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 25 deletions.
23 changes: 1 addition & 22 deletions sdcm/utils/health_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,25 +268,4 @@ def check_schema_agreement_in_gossip_and_peers(node, retries: int = CHECK_NODE_H
def check_group0_tokenring_consistency(group0_members: list[dict[str, str]],
tokenring_members: list[dict[str, str]],
current_node) -> HealthEventsGenerator:
if not current_node.raft.is_enabled:
LOGGER.debug("Raft feature is disabled on node %s (host_id=%s)", current_node.name, current_node.host_id)
return
LOGGER.debug("Check group0 and token ring consistency on node %s (host_id=%s)...",
current_node.name, current_node.host_id)
token_ring_node_ids = [member["host_id"] for member in tokenring_members]
for member in group0_members:
if member["voter"] and member["host_id"] in token_ring_node_ids:
continue
error_message = f"Node {current_node.name} has group0 member with host_id {member['host_id']} with " \
f"can_vote {member['voter']} and " \
f"presents in token ring {member['host_id'] in token_ring_node_ids}. " \
f"Inconsistency between group0: {group0_members} " \
f"and tokenring: {tokenring_members}"
LOGGER.error(error_message)
yield ClusterHealthValidatorEvent.Group0TokenRingInconsistency(
severity=Severity.ERROR,
node=current_node.name,
error=error_message,
)
LOGGER.debug("Group0 and token-ring are consistent on node %s (host_id=%s)...",
current_node.name, current_node.host_id)
return current_node.raft.check_group0_tokenring_consistency(group0_members, tokenring_members)
41 changes: 41 additions & 0 deletions sdcm/utils/raft/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@

from sdcm.sct_events.database import DatabaseLogEvent
from sdcm.sct_events.filters import EventsSeverityChangerFilter
from sdcm.sct_events.health import ClusterHealthValidatorEvent
from sdcm.sct_events import Severity
from sdcm.utils.features import is_consistent_topology_changes_feature_enabled, is_consistent_cluster_management_feature_enabled
from sdcm.utils.health_checker import HealthEventsGenerator
from sdcm.wait import wait_for

LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -80,6 +82,12 @@ class RaftFeatureOperations(ABC):
@property
@abstractmethod
def is_enabled(self) -> bool:
"""
Please do not use this method in simple statements like:
if not current_node.raft.is_enabled
move logic into separate methods of RaftFeature/NoRaft
classes instead
"""
...

@property
Expand Down Expand Up @@ -107,6 +115,11 @@ def get_group0_non_voters(self) -> list[dict[str, str]]:
def clean_group0_garbage(self, raise_exception: bool = False) -> None:
...

def check_group0_tokenring_consistency(
self, group0_members: list[dict[str, str]],
tokenring_members: list[dict[str, str]]) -> [HealthEventsGenerator | None]:
...

def get_message_waiting_timeout(self, message_position: MessagePosition) -> MessageTimeout:
backend = self._node.parent_cluster.params["cluster_backend"]
if backend not in BACKEND_TIMEOUTS:
Expand Down Expand Up @@ -311,6 +324,29 @@ def is_cluster_topology_consistent(self) -> bool:

return not diff and not non_voters_ids and len(group0_ids) == len(token_ring_ids) == num_of_nodes

def check_group0_tokenring_consistency(
self, group0_members: list[dict[str, str]],
tokenring_members: list[dict[str, str]]) -> HealthEventsGenerator:
LOGGER.debug("Check group0 and token ring consistency on node %s (host_id=%s)...",
self._node.name, self._node.host_id)
token_ring_node_ids = [member["host_id"] for member in tokenring_members]
for member in group0_members:
if member["voter"] and member["host_id"] in token_ring_node_ids:
continue
error_message = f"Node {self._node.name} has group0 member with host_id {member['host_id']} with " \
f"can_vote {member['voter']} and " \
f"presents in token ring {member['host_id'] in token_ring_node_ids}. " \
f"Inconsistency between group0: {group0_members} " \
f"and tokenring: {tokenring_members}"
LOGGER.error(error_message)
yield ClusterHealthValidatorEvent.Group0TokenRingInconsistency(
severity=Severity.ERROR,
node=self._node.name,
error=error_message,
)
LOGGER.debug("Group0 and token-ring are consistent on node %s (host_id=%s)...",
self._node.name, self._node.host_id)


class NoRaft(RaftFeatureOperations):
TOPOLOGY_OPERATION_LOG_PATTERNS = {
Expand Down Expand Up @@ -363,6 +399,11 @@ def is_cluster_topology_consistent(self) -> bool:

return len(token_ring_ids) == num_of_nodes

def check_group0_tokenring_consistency(
self, group0_members: list[dict[str, str]],
tokenring_members: list[dict[str, str]]) -> None:
LOGGER.debug("Raft feature is disabled on node %s (host_id=%s)", self._node.name, self._node.host_id)


def get_raft_mode(node) -> RaftFeature | NoRaft:
with node.parent_cluster.cql_connection_patient(node) as session:
Expand Down
3 changes: 0 additions & 3 deletions sdcm/utils/tablets/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@ def wait_for_tablets_balanced(node):
"currently a small time window after adding nodes and before load balancing starts during which
topology may appear as quiesced because the state machine goes through an idle state before it enters load balancing state"
"""
if not node.raft.is_enabled:
LOGGER.info("Raft is disabled, skipping wait for balance")
return
with node.parent_cluster.cql_connection_patient(node=node) as session:
if not is_tablets_feature_enabled(session):
LOGGER.info("Tablets are disabled, skipping wait for balance")
Expand Down

0 comments on commit 6a3ca71

Please sign in to comment.