diff --git a/sdcm/utils/health_checker.py b/sdcm/utils/health_checker.py index b0c8f89b6d..d2e1d7d263 100644 --- a/sdcm/utils/health_checker.py +++ b/sdcm/utils/health_checker.py @@ -268,25 +268,4 @@ def check_schema_agreement_in_gossip_and_peers(node, retries: int = CHECK_NODE_H def check_group0_tokenring_consistency(group0_members: list[dict[str, str]], tokenring_members: list[dict[str, str]], current_node) -> HealthEventsGenerator: - if not current_node.raft.is_enabled: - LOGGER.debug("Raft feature is disabled on node %s (host_id=%s)", current_node.name, current_node.host_id) - return - LOGGER.debug("Check group0 and token ring consistency on node %s (host_id=%s)...", - current_node.name, current_node.host_id) - token_ring_node_ids = [member["host_id"] for member in tokenring_members] - for member in group0_members: - if member["voter"] and member["host_id"] in token_ring_node_ids: - continue - error_message = f"Node {current_node.name} has group0 member with host_id {member['host_id']} with " \ - f"can_vote {member['voter']} and " \ - f"presents in token ring {member['host_id'] in token_ring_node_ids}. " \ - f"Inconsistency between group0: {group0_members} " \ - f"and tokenring: {tokenring_members}" - LOGGER.error(error_message) - yield ClusterHealthValidatorEvent.Group0TokenRingInconsistency( - severity=Severity.ERROR, - node=current_node.name, - error=error_message, - ) - LOGGER.debug("Group0 and token-ring are consistent on node %s (host_id=%s)...", - current_node.name, current_node.host_id) + return current_node.raft.check_group0_tokenring_consistency(group0_members, tokenring_members) diff --git a/sdcm/utils/raft/__init__.py b/sdcm/utils/raft/__init__.py index 4041411a41..8a60765684 100644 --- a/sdcm/utils/raft/__init__.py +++ b/sdcm/utils/raft/__init__.py @@ -8,8 +8,10 @@ from sdcm.sct_events.database import DatabaseLogEvent from sdcm.sct_events.filters import EventsSeverityChangerFilter +from sdcm.sct_events.health import ClusterHealthValidatorEvent from sdcm.sct_events import Severity from sdcm.utils.features import is_consistent_topology_changes_feature_enabled, is_consistent_cluster_management_feature_enabled +from sdcm.utils.health_checker import HealthEventsGenerator from sdcm.wait import wait_for LOGGER = logging.getLogger(__name__) @@ -80,6 +82,12 @@ class RaftFeatureOperations(ABC): @property @abstractmethod def is_enabled(self) -> bool: + """ + Please do not use this method in simple statements like: + if not current_node.raft.is_enabled + move logic into separate methods of RaftFeature/NoRaft + classes instead + """ ... @property @@ -107,6 +115,11 @@ def get_group0_non_voters(self) -> list[dict[str, str]]: def clean_group0_garbage(self, raise_exception: bool = False) -> None: ... + def check_group0_tokenring_consistency( + self, group0_members: list[dict[str, str]], + tokenring_members: list[dict[str, str]]) -> [HealthEventsGenerator | None]: + ... + def get_message_waiting_timeout(self, message_position: MessagePosition) -> MessageTimeout: backend = self._node.parent_cluster.params["cluster_backend"] if backend not in BACKEND_TIMEOUTS: @@ -311,6 +324,29 @@ def is_cluster_topology_consistent(self) -> bool: return not diff and not non_voters_ids and len(group0_ids) == len(token_ring_ids) == num_of_nodes + def check_group0_tokenring_consistency( + self, group0_members: list[dict[str, str]], + tokenring_members: list[dict[str, str]]) -> HealthEventsGenerator: + LOGGER.debug("Check group0 and token ring consistency on node %s (host_id=%s)...", + self._node.name, self._node.host_id) + token_ring_node_ids = [member["host_id"] for member in tokenring_members] + for member in group0_members: + if member["voter"] and member["host_id"] in token_ring_node_ids: + continue + error_message = f"Node {self._node.name} has group0 member with host_id {member['host_id']} with " \ + f"can_vote {member['voter']} and " \ + f"presents in token ring {member['host_id'] in token_ring_node_ids}. " \ + f"Inconsistency between group0: {group0_members} " \ + f"and tokenring: {tokenring_members}" + LOGGER.error(error_message) + yield ClusterHealthValidatorEvent.Group0TokenRingInconsistency( + severity=Severity.ERROR, + node=self._node.name, + error=error_message, + ) + LOGGER.debug("Group0 and token-ring are consistent on node %s (host_id=%s)...", + self._node.name, self._node.host_id) + class NoRaft(RaftFeatureOperations): TOPOLOGY_OPERATION_LOG_PATTERNS = { @@ -363,6 +399,11 @@ def is_cluster_topology_consistent(self) -> bool: return len(token_ring_ids) == num_of_nodes + def check_group0_tokenring_consistency( + self, group0_members: list[dict[str, str]], + tokenring_members: list[dict[str, str]]) -> None: + LOGGER.debug("Raft feature is disabled on node %s (host_id=%s)", self._node.name, self._node.host_id) + def get_raft_mode(node) -> RaftFeature | NoRaft: with node.parent_cluster.cql_connection_patient(node) as session: diff --git a/sdcm/utils/tablets/common.py b/sdcm/utils/tablets/common.py index e108bc48c0..778f0b5cdc 100644 --- a/sdcm/utils/tablets/common.py +++ b/sdcm/utils/tablets/common.py @@ -31,9 +31,6 @@ def wait_for_tablets_balanced(node): "currently a small time window after adding nodes and before load balancing starts during which topology may appear as quiesced because the state machine goes through an idle state before it enters load balancing state" """ - if not node.raft.is_enabled: - LOGGER.info("Raft is disabled, skipping wait for balance") - return with node.parent_cluster.cql_connection_patient(node=node) as session: if not is_tablets_feature_enabled(session): LOGGER.info("Tablets are disabled, skipping wait for balance")