Skip to content

Commit

Permalink
fix(k8s): add class providing key based locks and use it for K8S
Browse files Browse the repository at this point in the history
The 'test_add_new_node_and_check_old_nodes_are_cleaned_up' K8S
functional test has a race condition that may make it fail with the
following error:

  Command: 'kubectl cp /tmp/tmpj8_823ao.yaml \
    scylla/sct-cluster-dc-1-kind-0:/tmp/tmpj8_823ao.yaml -c scylla'
  Exit code: 2
  Stderr:
  tar: tmpj8_823ao.yaml: Cannot open: File exists
  tar: Exiting with failure status due to previous errors
  command terminated with exit code 2

It is caused by concurrent cqlsh cmd calls to the same Scylla pods.
In this test there are about 7 such concurrent calls, one per keyspace
in Scylla.

So, fix it by using a separate lock for each pod.
  • Loading branch information
vponomaryov committed Sep 22, 2023
1 parent 730bd7a commit 558925b
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 2 deletions.
7 changes: 5 additions & 2 deletions sdcm/remote/kubernetes_cmd_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from sdcm.utils.common import (
deprecation,
generate_random_string,
KeyBasedLock,
)
from sdcm.utils.decorators import retrying
from sdcm.wait import wait_for
Expand All @@ -41,6 +42,7 @@
from .remote_base import RemoteCmdRunnerBase, StreamWatcher

LOGGER = logging.getLogger(__name__)
# Module-wide registry of per-key locks. send_files() asks it for a lock keyed
# by cluster name, namespace and pod name before copying a file into a pod.
KEY_BASED_LOCKS = KeyBasedLock()


def is_scylla_bench_command(command):
Expand Down Expand Up @@ -213,8 +215,9 @@ def receive_files(self, src, dst, delete_dst=False, preserve_perm=True, preserve
# pylint: disable=too-many-arguments,unused-argument
@retrying(n=3, sleep_time=5, allowed_exceptions=(RetryableNetworkException, ))
def send_files(self, src, dst, delete_dst=False, preserve_symlinks=False, verbose=False):
    """Copy ``src`` to ``dst`` inside this runner's pod.

    The copy is performed by ``KubernetesOps.copy_file`` with a 300 second
    timeout and is serialized per pod: a lock keyed by cluster name,
    namespace and pod name is held for the duration of the copy, so two
    callers targeting the same pod never copy at the same time. Concurrent
    copies of the same file into one pod were observed to fail with
    ``tar: ...: Cannot open: File exists``.

    The copy must happen only once and only while the lock is held; an
    additional unlocked call would reintroduce the race.

    :param src: source path to copy from.
    :param dst: destination path inside the pod's container.
    :param delete_dst: accepted for interface compatibility; not used here.
    :param preserve_symlinks: accepted for interface compatibility; not used here.
    :param verbose: accepted for interface compatibility; not used here.
    :return: always ``True`` when the copy call returns without raising.
    """
    pod_lock_key = f"k8s--{self.kluster.name}--{self.namespace}--{self.pod_name}"
    with KEY_BASED_LOCKS.get_lock(pod_lock_key):
        KubernetesOps.copy_file(self.kluster, src, f"{self.namespace}/{self.pod_name}:{dst}",
                                container=self.container, timeout=300)
    return True

def _run_on_retryable_exception(self, exc: Exception, new_session: bool) -> bool:
Expand Down
14 changes: 14 additions & 0 deletions sdcm/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@
# NOTE(review): presumably the GCE project that hosts Scylla images — confirm against usages.
SCYLLA_GCE_IMAGES_PROJECT = "scylla-images"


class KeyBasedLock:  # pylint: disable=too-few-public-methods
    """Hand out one ``threading.Lock`` per hashable key.

    Every call to :meth:`get_lock` with an equal key returns the same lock
    object, so independent callers that agree on a key (for example a pod
    identifier) can serialize their work on it without sharing any other
    state. Locks are created lazily on first request and are kept in
    ``key_lock_mapping`` for the lifetime of the instance.
    """

    def __init__(self):
        # Maps each requested key to the lock created for it; entries are
        # never removed.
        self.key_lock_mapping = {}
        # Guards the check-then-insert in get_lock() so that two threads
        # asking for a new key at the same time end up with the same lock.
        self.handler_lock = threading.Lock()

    def get_lock(self, hashable_key):
        """Return the lock associated with ``hashable_key``.

        The lock is created on the first request for a given key. It is
        returned unacquired by this method; the caller is expected to
        acquire it, typically with a ``with`` statement.

        :param hashable_key: any hashable value identifying the resource.
        :return: the ``threading.Lock`` shared by all callers using this key.
        """
        with self.handler_lock:
            if hashable_key not in self.key_lock_mapping:
                self.key_lock_mapping[hashable_key] = threading.Lock()
            return self.key_lock_mapping[hashable_key]


def deprecation(message):
    """Emit ``message`` as a ``DeprecationWarning``.

    ``stacklevel=3`` attributes the warning to the caller of the function
    that invoked this helper, rather than to this helper or to the
    deprecated function itself.

    :param message: text of the deprecation warning.
    """
    warnings.warn(message, DeprecationWarning, stacklevel=3)

Expand Down

0 comments on commit 558925b

Please sign in to comment.