Skip to content

Commit

Permalink
additionalRequestors reconcile flow
Browse files Browse the repository at this point in the history
nodemaintenance controller modifications to support
additionalRequestors.

- check that no additional requestors are present
  before proceeding with object removal flow when
  object is in ready state
- update controller tests

Signed-off-by: adrianc <[email protected]>
  • Loading branch information
adrianchiris committed Aug 22, 2024
1 parent a6006b6 commit 0f5f558
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 41 deletions.
76 changes: 36 additions & 40 deletions internal/controller/nodemaintenance_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@ import (
)

var (
defaultMaxNodeMaintenanceTime = 1600 * time.Second
waitPodCompletionRequeueTime = 10 * time.Second
drainReqeueTime = 10 * time.Second
defaultMaxNodeMaintenanceTime = 1600 * time.Second
waitPodCompletionRequeueTime = 10 * time.Second
drainReqeueTime = 10 * time.Second
additionalRequestorsRequeueTime = 10 * time.Second
)

const (
Expand Down Expand Up @@ -198,7 +199,7 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
reqLog.Error(err, "failed to handle drain state for NodeMaintenance object")
}
case maintenancev1.ConditionReasonReady:
err = r.handleReadyState(ctx, reqLog, nm, node)
res, err = r.handleReadyState(ctx, reqLog, nm, node)
if err != nil {
reqLog.Error(err, "failed to handle Ready state for NodeMaintenance object")
}
Expand All @@ -207,6 +208,16 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return res, err
}

// handleFinalizerRemoval handles the removal of node maintenance finalizer
func (r *NodeMaintenanceReconciler) handleFinalizerRemoval(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error {
reqLog.Info("removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err := k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
return err
}

// handleUninitiaizedState handles NodeMaintenance in ConditionReasonUninitialized state
// it eventually sets NodeMaintenance Ready condition Reason to ConditionReasonPending
func (r *NodeMaintenanceReconciler) handleUninitiaizedState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error {
Expand All @@ -230,8 +241,8 @@ func (r *NodeMaintenanceReconciler) handleUninitiaizedState(ctx context.Context,
// it eventually sets NodeMaintenance Ready condition Reason to ConditionReasonWaitForPodCompletion
func (r *NodeMaintenanceReconciler) handleScheduledState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error {
reqLog.Info("Handle Scheduled NodeMaintenance")
// handle finalizers
var err error

if nm.GetDeletionTimestamp().IsZero() {
// conditionally add finalizer
err = k8sutils.AddFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
Expand All @@ -241,12 +252,8 @@ func (r *NodeMaintenanceReconciler) handleScheduledState(ctx context.Context, re
}
} else {
// object is being deleted, remove finalizer if exists and return
reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
return err
reqLog.Info("NodeMaintenance object is deleting", "namespace", nm.Namespace, "name", nm.Name)
return r.handleFinalizerRemoval(ctx, reqLog, nm)
}

// TODO(adrianc): in openshift, we should pause MCP here
Expand All @@ -271,6 +278,7 @@ func (r *NodeMaintenanceReconciler) handleCordonState(ctx context.Context, reqLo

if !nm.GetDeletionTimestamp().IsZero() {
reqLog.Info("NodeMaintenance object is deleting")

if nm.Spec.Cordon {
reqLog.Info("handle uncordon of node, ", "node", node.Name)
err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node)
Expand All @@ -280,13 +288,7 @@ func (r *NodeMaintenanceReconciler) handleCordonState(ctx context.Context, reqLo
}

// TODO(adrianc): unpause MCP in OCP when support is added.

reqLog.Info("remove maintenance finalizer for node maintenance", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
return err
return r.handleFinalizerRemoval(ctx, reqLog, nm)
}

if nm.Spec.Cordon {
Expand Down Expand Up @@ -317,6 +319,7 @@ func (r *NodeMaintenanceReconciler) handleWaitPodCompletionState(ctx context.Con
if !nm.GetDeletionTimestamp().IsZero() {
// object is being deleted, handle cleanup.
reqLog.Info("NodeMaintenance object is deleting")

if nm.Spec.Cordon {
reqLog.Info("handle uncordon of node, ", "node", node.Name)
err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node)
Expand All @@ -326,13 +329,7 @@ func (r *NodeMaintenanceReconciler) handleWaitPodCompletionState(ctx context.Con
}

// TODO(adrianc): unpause MCP in OCP when support is added.

// remove finalizer if exists and return
reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
err = r.handleFinalizerRemoval(ctx, reqLog, nm)
return res, err
}

Expand Down Expand Up @@ -394,10 +391,7 @@ func (r *NodeMaintenanceReconciler) handleDrainState(ctx context.Context, reqLog

// remove finalizer if exists and return
reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
err = r.handleFinalizerRemoval(ctx, reqLog, nm)
return res, err
}

Expand Down Expand Up @@ -512,31 +506,33 @@ func (r *NodeMaintenanceReconciler) updateDrainStatus(ctx context.Context, nm *m
return nil
}

func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) error {
func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) {
reqLog.Info("Handle Ready NodeMaintenance")
// handle finalizers
var err error
var res ctrl.Result

if !nm.GetDeletionTimestamp().IsZero() {
// object is being deleted, handle cleanup.
reqLog.Info("NodeMaintenance object is deleting")

if len(nm.Spec.AdditionalRequestors) > 0 {
reqLog.Info("additional requestors for node maintenance. waiting for list to clear, requeue request", "additionalRequestors", nm.Spec.AdditionalRequestors)
return ctrl.Result{Requeue: true, RequeueAfter: additionalRequestorsRequeueTime}, nil
}

if nm.Spec.Cordon {
reqLog.Info("handle uncordon of node, ", "node", node.Name)
err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node)
if err != nil {
return err
return res, err
}
}

// TODO(adrianc): unpause MCP in OCP when support is added.

// remove finalizer if exists and return
reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
return err
err = r.handleFinalizerRemoval(ctx, reqLog, nm)
return res, err
}

// set ready-time annotation
Expand All @@ -545,11 +541,11 @@ func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339))
err := r.Update(ctx, nm)
if err != nil {
return err
return res, err
}
}

return nil
return res, nil
}

// SetupWithManager sets up the controller with the Manager.
Expand Down
25 changes: 24 additions & 1 deletion internal/controller/nodemaintenance_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,11 @@ var _ = Describe("NodeMaintenance Controller", func() {
Within(time.Second).WithPolling(100 * time.Millisecond).
Should(Equal(maintenancev1.ConditionReasonWaitForPodCompletion))

By("Setting Additional requestor for node maintenance")
Expect(k8sClient.Get(testCtx, types.NamespacedName{Namespace: "default", Name: "test-nm"}, nm)).ToNot(HaveOccurred())
nm.Spec.AdditionalRequestors = append(nm.Spec.AdditionalRequestors, "another-requestor.nvidia.com")
Expect(k8sClient.Update(testCtx, nm)).ToNot(HaveOccurred())

By("After deleting wait for completion pod, NodeMaintenance is Draining")
// NOTE(adrianc): for pods we must provide DeleteOptions as below else apiserver will not delete pod object
var grace int64
Expand Down Expand Up @@ -244,8 +249,21 @@ var _ = Describe("NodeMaintenance Controller", func() {
maintenancev1.ConditionReasonWaitForPodCompletion, maintenancev1.ConditionReasonDraining,
maintenancev1.ConditionReasonReady))

By("Should Uncordon node after NodeMaintenance is deleted")
By("Object is not removed when deleted due to having AdditionalRequestors")
Expect(k8sClient.Delete(testCtx, nm)).ToNot(HaveOccurred())
Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)).
Within(time.Second).WithPolling(100 * time.Millisecond).
Should(Succeed())

By("Node remains cordoned")
Expect(k8sClient.Get(testCtx, client.ObjectKeyFromObject(node), node)).ToNot(HaveOccurred())
Expect(node.Spec.Unschedulable).To(BeTrue())

By("Remove AdditionalRequestros")
nm.Spec.AdditionalRequestors = nil
Expect(k8sClient.Update(testCtx, nm)).ToNot(HaveOccurred())

By("Should Uncordon node after NodeMaintenance is deleted")
Eventually(func() bool {
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -255,6 +273,11 @@ var _ = Describe("NodeMaintenance Controller", func() {
Expect(k8sClient.Get(testCtx, client.ObjectKeyFromObject(node), node)).ToNot(HaveOccurred())
return node.Spec.Unschedulable
}).WithTimeout(10 * time.Second).WithPolling(1 * time.Second).Should(BeFalse())

By("Node maintenance no longer exists")
err := k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)
Expect(err).To(HaveOccurred())
Expect(k8serrors.IsNotFound(err)).To(BeTrue())
})
})

Expand Down

0 comments on commit 0f5f558

Please sign in to comment.