From 558925b49a60a15952f3cbb0f692a73e50e7e058 Mon Sep 17 00:00:00 2001 From: Valerii Ponomarov Date: Wed, 20 Sep 2023 19:31:39 +0300 Subject: [PATCH] fix(k8s): add class providing key based locks and use it for K8S The 'test_add_new_node_and_check_old_nodes_are_cleaned_up' K8S functional test has race condition where it may fail with the following error: Command: 'kubectl cp /tmp/tmpj8_823ao.yaml \ scylla/sct-cluster-dc-1-kind-0:/tmp/tmpj8_823ao.yaml -c scylla' Exit code: 2 Stderr: tar: tmpj8_823ao.yaml: Cannot open: File exists tar: Exiting with failure status due to previous errors command terminated with exit code 2 It is caused by the concurrent cqlsh cmd calls to the same Scylla pods. In this test there are about 7 concurrent such calls equal to the number of keyspaces in Scylla. So, fix it by using lock mechanism per each pod. --- sdcm/remote/kubernetes_cmd_runner.py | 7 +++++-- sdcm/utils/common.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/sdcm/remote/kubernetes_cmd_runner.py b/sdcm/remote/kubernetes_cmd_runner.py index 941c2549a4..85aef40a53 100644 --- a/sdcm/remote/kubernetes_cmd_runner.py +++ b/sdcm/remote/kubernetes_cmd_runner.py @@ -33,6 +33,7 @@ from sdcm.utils.common import ( deprecation, generate_random_string, + KeyBasedLock, ) from sdcm.utils.decorators import retrying from sdcm.wait import wait_for @@ -41,6 +42,7 @@ from .remote_base import RemoteCmdRunnerBase, StreamWatcher LOGGER = logging.getLogger(__name__) +KEY_BASED_LOCKS = KeyBasedLock() def is_scylla_bench_command(command): @@ -213,8 +215,9 @@ def receive_files(self, src, dst, delete_dst=False, preserve_perm=True, preserve # pylint: disable=too-many-arguments,unused-argument @retrying(n=3, sleep_time=5, allowed_exceptions=(RetryableNetworkException, )) def send_files(self, src, dst, delete_dst=False, preserve_symlinks=False, verbose=False): - KubernetesOps.copy_file(self.kluster, src, f"{self.namespace}/{self.pod_name}:{dst}", - container=self.container, timeout=300) + with KEY_BASED_LOCKS.get_lock(f"k8s--{self.kluster.name}--{self.namespace}--{self.pod_name}"): + KubernetesOps.copy_file(self.kluster, src, f"{self.namespace}/{self.pod_name}:{dst}", + container=self.container, timeout=300) return True def _run_on_retryable_exception(self, exc: Exception, new_session: bool) -> bool: diff --git a/sdcm/utils/common.py b/sdcm/utils/common.py index 2e3502115e..df043f13a5 100644 --- a/sdcm/utils/common.py +++ b/sdcm/utils/common.py @@ -98,6 +98,20 @@ SCYLLA_GCE_IMAGES_PROJECT = "scylla-images" +class KeyBasedLock(): # pylint: disable=too-few-public-methods + """Class designed for creating locks based on hashable keys.""" + + def __init__(self): + self.key_lock_mapping = {} + self.handler_lock = threading.Lock() + + def get_lock(self, hashable_key): + with self.handler_lock: + if hashable_key not in self.key_lock_mapping: + self.key_lock_mapping[hashable_key] = threading.Lock() + return self.key_lock_mapping[hashable_key] + + def deprecation(message): warnings.warn(message, DeprecationWarning, stacklevel=3)