Skip to content

Commit

Permalink
Allow parallel restart of all already bootstrapped nodes (#673)
Browse files Browse the repository at this point in the history
* If node is not ready yet, do not add it to the NodeStatuses

* Add a new feature to quickly restart all the pods that have previously bootstrapped. This feature is behind a feature gate, enabled with the annotation allow-parallel-starts

* Downgrade controller-tools for this run

* Add Stopped/Resumed test case in unit test and e2e

* Only take the fastpath if seedCount is currently > 0
  • Loading branch information
burmanm authored Jul 10, 2024
1 parent ddda8e1 commit 14ce1a9
Show file tree
Hide file tree
Showing 5 changed files with 396 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti
## unreleased

* [ENHANCEMENT] [#648](https://github.com/k8ssandra/cass-operator/issues/648) Make MinReadySeconds configurable value in the Spec.
* [FEATURE] [#646](https://github.com/k8ssandra/cass-operator/issues/646) Allow starting multiple pods in parallel if they have previously bootstrapped and are not planned for replacement. Set annotation ``cassandra.datastax.com/allow-parallel-starts: true`` to enable this feature.

## v1.21.1

Expand Down
3 changes: 3 additions & 0 deletions apis/cassandra/v1beta1/cassandradatacenter_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ const (
// UpdateAllowedAnnotation marks the Datacenter to allow upgrades to StatefulSets Spec even if CassandraDatacenter object was not modified. Allowed values are "once" and "always"
UpdateAllowedAnnotation = "cassandra.datastax.com/autoupdate-spec"

// AllowParallelStartsAnnotations allows the operator to start multiple server nodes at the same time if they have already bootstrapped.
AllowParallelStartsAnnotations = "cassandra.datastax.com/allow-parallel-starts"

AllowUpdateAlways AllowUpdateType = "always"
AllowUpdateOnce AllowUpdateType = "once"

Expand Down
53 changes: 47 additions & 6 deletions pkg/reconciliation/reconcile_racks.go
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,19 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta
return result.Error(err)
}

// step 0 - fastpath
if seedCount > 0 && metav1.HasAnnotation(rc.Datacenter.ObjectMeta, api.AllowParallelStartsAnnotations) && rc.Datacenter.Annotations[api.AllowParallelStartsAnnotations] == "true" {
notReadyPods, err := rc.startBootstrappedNodes(endpointData)
if err != nil {
return result.Error(err)
}

// Technically this is checked in the next part, but there could be cache issues
if notReadyPods {
return result.RequeueSoon(2)
}
}

// step 1 - see if any nodes are already coming up

nodeIsStarting, _, err := rc.findStartingNodes()
Expand Down Expand Up @@ -928,13 +941,16 @@ func (rc *ReconciliationContext) CreateUsers() result.ReconcileResult {
return result.Continue()
}

func findHostIdForIpFromEndpointsData(endpointsData []httphelper.EndpointState, ip string) string {
func findHostIdForIpFromEndpointsData(endpointsData []httphelper.EndpointState, ip string) (bool, string) {
for _, data := range endpointsData {
if net.ParseIP(data.GetRpcAddress()).Equal(net.ParseIP(ip)) {
return data.HostID
if data.HasStatus(httphelper.StatusNormal) {
return true, data.HostID
}
return false, data.HostID
}
}
return ""
return false, ""
}

func getRpcAddress(dc *api.CassandraDatacenter, pod *corev1.Pod) string {
Expand Down Expand Up @@ -984,11 +1000,14 @@ func (rc *ReconciliationContext) UpdateCassandraNodeStatus(force bool) error {
if force || nodeStatus.HostID == "" {
endpointsResponse, err := rc.NodeMgmtClient.CallMetadataEndpointsEndpoint(pod)
if err == nil {
nodeStatus.HostID = findHostIdForIpFromEndpointsData(
ready, hostId := findHostIdForIpFromEndpointsData(
endpointsResponse.Entity, ip)
if nodeStatus.HostID == "" {
logger.Info("Failed to find host ID", "pod", pod.Name)
}
if ready {
nodeStatus.HostID = hostId
}
}
}
}
Expand Down Expand Up @@ -1753,6 +1772,29 @@ func (rc *ReconciliationContext) findStartingNodes() (bool, bool, error) {
return false, false, nil
}

// startBootstrappedNodes walks rc.dcPods and calls startNode (without seed
// labeling) for every pod that has an entry in the datacenter's
// Status.NodeStatuses and is not listed in Status.NodeReplacements.
//
// It returns true if any startNode call returned true; the caller treats
// that as "some pods are not ready yet". On the first startNode error the
// loop stops and the flag accumulated so far is returned with that error.
func (rc *ReconciliationContext) startBootstrappedNodes(endpointData httphelper.CassMetadataEndpoints) (bool, error) {
	rc.ReqLogger.Info("reconcile_racks::startBootstrappedNodes")

	anyNotReady := false

	for _, pod := range rc.dcPods {
		// Only pods that already have a recorded node status are eligible.
		if _, known := rc.Datacenter.Status.NodeStatuses[pod.Name]; !known {
			continue
		}

		// Pods scheduled for replacement must not be started here.
		if utils.IndexOfString(rc.Datacenter.Status.NodeReplacements, pod.Name) >= 0 {
			continue
		}

		notReady, err := rc.startNode(pod, false, endpointData)
		if err != nil {
			return anyNotReady, err
		}
		if notReady {
			anyNotReady = true
		}
	}

	return anyNotReady, nil
}

func (rc *ReconciliationContext) findStartedNotReadyNodes() (bool, error) {
rc.ReqLogger.Info("reconcile_racks::findStartedNotReadyNodes")

Expand Down Expand Up @@ -1922,8 +1964,7 @@ func (rc *ReconciliationContext) startNode(pod *corev1.Pod, labelSeedBeforeStart
"Labeled pod a seed node %s", pod.Name)
}

err := rc.startCassandra(endpointData, pod)
if err != nil {
if err := rc.startCassandra(endpointData, pod); err != nil {
return true, err
}
}
Expand Down
Loading

0 comments on commit 14ce1a9

Please sign in to comment.