Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow parallel restart of all already bootstrapped nodes #673

Merged
merged 5 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti
## unreleased

* [ENHANCEMENT] [#648](https://github.com/k8ssandra/cass-operator/issues/648) Make MinReadySeconds configurable value in the Spec.
* [FEATURE] [#646](https://github.com/k8ssandra/cass-operator/issues/646) Allow starting multiple pods in parallel if they have already bootstrapped and are not planned for replacement. Set the annotation ``cassandra.datastax.com/allow-parallel-starts: true`` to enable this feature.

## v1.21.1

Expand Down
3 changes: 3 additions & 0 deletions apis/cassandra/v1beta1/cassandradatacenter_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ const (
// UpdateAllowedAnnotation marks the Datacenter to allow upgrades to StatefulSets Spec even if CassandraDatacenter object was not modified. Allowed values are "once" and "always"
UpdateAllowedAnnotation = "cassandra.datastax.com/autoupdate-spec"

// AllowParallelStartsAnnotations allows the operator to start multiple server nodes at the same time if they have already bootstrapped.
AllowParallelStartsAnnotations = "cassandra.datastax.com/allow-parallel-starts"

AllowUpdateAlways AllowUpdateType = "always"
AllowUpdateOnce AllowUpdateType = "once"

Expand Down
53 changes: 47 additions & 6 deletions pkg/reconciliation/reconcile_racks.go
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,19 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta
return result.Error(err)
}

// step 0 - fastpath
if seedCount > 0 && metav1.HasAnnotation(rc.Datacenter.ObjectMeta, api.AllowParallelStartsAnnotations) && rc.Datacenter.Annotations[api.AllowParallelStartsAnnotations] == "true" {
notReadyPods, err := rc.startBootstrappedNodes(endpointData)
if err != nil {
return result.Error(err)
}

// Technically this is checked in the next part, but there could be cache issues
if notReadyPods {
return result.RequeueSoon(2)
}
}

// step 1 - see if any nodes are already coming up

nodeIsStarting, _, err := rc.findStartingNodes()
Expand Down Expand Up @@ -928,13 +941,16 @@ func (rc *ReconciliationContext) CreateUsers() result.ReconcileResult {
return result.Continue()
}

func findHostIdForIpFromEndpointsData(endpointsData []httphelper.EndpointState, ip string) string {
func findHostIdForIpFromEndpointsData(endpointsData []httphelper.EndpointState, ip string) (bool, string) {
for _, data := range endpointsData {
if net.ParseIP(data.GetRpcAddress()).Equal(net.ParseIP(ip)) {
return data.HostID
if data.HasStatus(httphelper.StatusNormal) {
return true, data.HostID
}
return false, data.HostID
}
}
return ""
return false, ""
}

func getRpcAddress(dc *api.CassandraDatacenter, pod *corev1.Pod) string {
Expand Down Expand Up @@ -984,11 +1000,14 @@ func (rc *ReconciliationContext) UpdateCassandraNodeStatus(force bool) error {
if force || nodeStatus.HostID == "" {
endpointsResponse, err := rc.NodeMgmtClient.CallMetadataEndpointsEndpoint(pod)
if err == nil {
nodeStatus.HostID = findHostIdForIpFromEndpointsData(
ready, hostId := findHostIdForIpFromEndpointsData(
endpointsResponse.Entity, ip)
if nodeStatus.HostID == "" {
logger.Info("Failed to find host ID", "pod", pod.Name)
}
if ready {
nodeStatus.HostID = hostId
}
}
}
}
Expand Down Expand Up @@ -1753,6 +1772,29 @@ func (rc *ReconciliationContext) findStartingNodes() (bool, bool, error) {
return false, false, nil
}

// startBootstrappedNodes calls startNode for every pod in rc.dcPods that has an
// entry in Datacenter.Status.NodeStatuses and is not listed in
// Datacenter.Status.NodeReplacements. Unlike a one-at-a-time start, it does not
// stop after the first pod: every eligible pod is passed to startNode in the
// same pass.
//
// It returns true if any startNode call reported true (named notReady here;
// presumably meaning the node was just started or is not ready yet — confirm
// against startNode). On the first startNode error the loop stops and that
// error is returned together with the flag accumulated so far.
func (rc *ReconciliationContext) startBootstrappedNodes(endpointData httphelper.CassMetadataEndpoints) (bool, error) {
rc.ReqLogger.Info("reconcile_racks::startBootstrappedNodes")

// Becomes true once any startNode call reports true.
startingNodes := false

for _, pod := range rc.dcPods {
// Only pods with a recorded node status are considered; the caller treats
// these as previously bootstrapped nodes.
if _, ok := rc.Datacenter.Status.NodeStatuses[pod.Name]; ok {
// Verify pod is not going to be replaced
if utils.IndexOfString(rc.Datacenter.Status.NodeReplacements, pod.Name) > -1 {
continue
}
// The second argument (labelSeedBeforeStart) is false: the pod is not
// labeled as a seed before starting.
notReady, err := rc.startNode(pod, false, endpointData)
burmanm marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return startingNodes, err
}

startingNodes = startingNodes || notReady
}
}

return startingNodes, nil
}

func (rc *ReconciliationContext) findStartedNotReadyNodes() (bool, error) {
rc.ReqLogger.Info("reconcile_racks::findStartedNotReadyNodes")

Expand Down Expand Up @@ -1922,8 +1964,7 @@ func (rc *ReconciliationContext) startNode(pod *corev1.Pod, labelSeedBeforeStart
"Labeled pod a seed node %s", pod.Name)
}

err := rc.startCassandra(endpointData, pod)
if err != nil {
if err := rc.startCassandra(endpointData, pod); err != nil {
return true, err
}
}
Expand Down
Loading
Loading