Skip to content

Commit

Permalink
controller: fix frequent sidecar restarts (#12)
Browse files Browse the repository at this point in the history
# Description
Containers in the controller pod would frequently restart.

Upon closer inspection, we found that only the sidecar containers would
restart, with the same errors at roughly the same time. Judging by the apiserver logs, it looks like etcd throttled the requests
because they were too frequent, which led the sidecars to restart. To
fix this, we double the lease durations, renew deadlines, and retry
periods for the leader elections of all sidecars.
We also add some missing RBAC rules.
  • Loading branch information
sauterp authored Feb 20, 2024
1 parent e526506 commit 78eaf5e
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 3 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## Unreleased

### Bug fixes

* controller: fix frequent sidecar restarts #12

## 0.29.1

### Improvements
Expand Down
10 changes: 8 additions & 2 deletions deployment/controller-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ metadata:
rules:
- apiGroups: [""]
resources: ["persistentvolumes"]
verbs: ["get", "list", "watch"]
verbs: ["get", "list", "watch", "delete"]
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "list", "watch"]
Expand All @@ -35,7 +35,7 @@ rules:
verbs: ["get", "list", "watch", "update", "create", "patch"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "watch", "list", "delete", "update", "create"]
verbs: ["get", "watch", "list", "create", "update", "patch", "delete"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
Expand Down Expand Up @@ -98,6 +98,9 @@ rules:
- apiGroups: ["apiextensions.k8s.io"]
resources: ["customresourcedefinitions", "leases"]
verbs: ["get", "watch", "list", "create", "update", "patch", "delete"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "watch", "list", "create", "update", "patch", "delete"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
Expand All @@ -123,6 +126,9 @@ rules:
- apiGroups: [""]
resources: ["pods", "events"]
verbs: ["get", "watch", "list"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "watch", "list", "create", "update", "patch", "delete"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
15 changes: 15 additions & 0 deletions deployment/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ spec:
- "--v=5"
- "--csi-address=$(CSI_ADDRESS)"
- "--leader-election"
- "--leader-election-lease-duration=30s"
- "--leader-election-renew-deadline=20s"
- "--leader-election-retry-period=10s"
- "--feature-gates=Topology=true"
- "--default-fstype=ext4"
env:
Expand All @@ -87,6 +90,9 @@ spec:
- "--v=5"
- "--csi-address=$(CSI_ADDRESS)"
- "--leader-election"
- "--leader-election-lease-duration=30s"
- "--leader-election-renew-deadline=20s"
- "--leader-election-retry-period=10s"
env:
- name: CSI_ADDRESS
value: /var/lib/csi/sockets/pluginproxy/csi.sock
Expand All @@ -106,6 +112,9 @@ spec:
- "--v=5"
- "--csi-address=$(CSI_ADDRESS)"
- "--leader-election"
- "--leader-election-lease-duration=30s"
- "--leader-election-renew-deadline=20s"
- "--leader-election-retry-period=10s"
env:
- name: CSI_ADDRESS
value: /var/lib/csi/sockets/pluginproxy/csi.sock
Expand All @@ -124,6 +133,9 @@ spec:
args:
- "--v=5"
- "--leader-election"
- "--leader-election-lease-duration=30s"
- "--leader-election-renew-deadline=20s"
- "--leader-election-retry-period=10s"
resources:
limits:
cpu: 400m
Expand All @@ -137,6 +149,9 @@ spec:
- "--v=5"
- "--csi-address=$(CSI_ADDRESS)"
- "--leader-election"
- "--leader-election-lease-duration=30s"
- "--leader-election-renew-deadline=20s"
- "--leader-election-retry-period=10s"
env:
- name: CSI_ADDRESS
value: /var/lib/csi/sockets/pluginproxy/csi.sock
Expand Down
5 changes: 5 additions & 0 deletions internal/integ/cluster/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"log/slog"
"os"
"time"

"github.com/exoscale/exoscale/csi-driver/internal/integ/flags"

Expand Down Expand Up @@ -150,5 +151,9 @@ func Setup() error {
}
}

// give the CSI some time to become ready
// TODO find a more appropriate way to do this.
time.Sleep(30 * time.Second)

return nil
}
2 changes: 1 addition & 1 deletion internal/integ/integ_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func awaitExpectation(t *testing.T, expected interface{}, get getFunc) {
for i := 0; i < 10; i++ {
actual = get()

time.Sleep(5 * time.Second)
time.Sleep(10 * time.Second)

if assert.ObjectsAreEqualValues(expected, actual) {
break
Expand Down

0 comments on commit 78eaf5e

Please sign in to comment.