From ec0c56629348d65c872623fea9472e73e9a4c555 Mon Sep 17 00:00:00 2001 From: adrianc Date: Tue, 20 Aug 2024 17:02:27 +0300 Subject: [PATCH] additionalRequestors reconcile flow nodemaintenance controller modifications to support additionalRequestors. - check that no additional requestors are present before proceeding with object removal flow when object is in ready state - update controller tests Signed-off-by: adrianc --- .../controller/nodemaintenance_controller.go | 76 +++++++++---------- .../nodemaintenance_controller_test.go | 25 +++++- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/internal/controller/nodemaintenance_controller.go b/internal/controller/nodemaintenance_controller.go index fed27e4..a499c3e 100644 --- a/internal/controller/nodemaintenance_controller.go +++ b/internal/controller/nodemaintenance_controller.go @@ -50,9 +50,10 @@ import ( ) var ( - defaultMaxNodeMaintenanceTime = 1600 * time.Second - waitPodCompletionRequeueTime = 10 * time.Second - drainReqeueTime = 10 * time.Second + defaultMaxNodeMaintenanceTime = 1600 * time.Second + waitPodCompletionRequeueTime = 10 * time.Second + drainReqeueTime = 10 * time.Second + additionalRequestorsRequeueTime = 10 * time.Second ) const ( @@ -198,7 +199,7 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ reqLog.Error(err, "failed to handle drain state for NodeMaintenance object") } case maintenancev1.ConditionReasonReady: - err = r.handleReadyState(ctx, reqLog, nm, node) + res, err = r.handleReadyState(ctx, reqLog, nm, node) if err != nil { reqLog.Error(err, "failed to handle Ready state for NodeMaintenance object") } @@ -207,6 +208,16 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ return res, err } +// handleFinalizerRemoval handles the removal of node maintenance finalizer +func (r *NodeMaintenanceReconciler) handleFinalizerRemoval(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error { + reqLog.Info("removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name) + err := k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName) + if err != nil { + reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name) + } + return err +} + // handleUninitiaizedState handles NodeMaintenance in ConditionReasonUninitialized state // it eventually sets NodeMaintenance Ready condition Reason to ConditionReasonPending func (r *NodeMaintenanceReconciler) handleUninitiaizedState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error { @@ -230,8 +241,8 @@ func (r *NodeMaintenanceReconciler) handleUninitiaizedState(ctx context.Context, // it eventually sets NodeMaintenance Ready condition Reason to ConditionReasonWaitForPodCompletion func (r *NodeMaintenanceReconciler) handleScheduledState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error { reqLog.Info("Handle Scheduled NodeMaintenance") - // handle finalizers var err error + if nm.GetDeletionTimestamp().IsZero() { // conditionally add finalizer err = k8sutils.AddFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName) @@ -241,12 +252,8 @@ func (r *NodeMaintenanceReconciler) handleScheduledState(ctx context.Context, re } } else { // object is being deleted, remove finalizer if exists and return - reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name) - err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName) - if err != nil { - reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name) - } - return err + reqLog.Info("NodeMaintenance object is deleting", "namespace", nm.Namespace, "name", nm.Name) + return r.handleFinalizerRemoval(ctx, reqLog, nm) } // TODO(adrianc): in openshift, we should pause MCP here @@ -271,6 +278,7 @@ func (r *NodeMaintenanceReconciler) handleCordonState(ctx context.Context, reqLo if !nm.GetDeletionTimestamp().IsZero() { reqLog.Info("NodeMaintenance object is deleting") + if nm.Spec.Cordon { reqLog.Info("handle uncordon of node, ", "node", node.Name) err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node) @@ -280,13 +288,7 @@ func (r *NodeMaintenanceReconciler) handleCordonState(ctx context.Context, reqLo } // TODO(adrianc): unpause MCP in OCP when support is added. - - reqLog.Info("remove maintenance finalizer for node maintenance", "namespace", nm.Namespace, "name", nm.Name) - err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName) - if err != nil { - reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name) - } - return err + return r.handleFinalizerRemoval(ctx, reqLog, nm) } if nm.Spec.Cordon { @@ -317,6 +319,7 @@ func (r *NodeMaintenanceReconciler) handleWaitPodCompletionState(ctx context.Con if !nm.GetDeletionTimestamp().IsZero() { // object is being deleted, handle cleanup. reqLog.Info("NodeMaintenance object is deleting") + if nm.Spec.Cordon { reqLog.Info("handle uncordon of node, ", "node", node.Name) err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node) @@ -326,13 +329,7 @@ func (r *NodeMaintenanceReconciler) handleWaitPodCompletionState(ctx context.Con } // TODO(adrianc): unpause MCP in OCP when support is added. - - // remove finalizer if exists and return - reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name) - err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName) - if err != nil { - reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name) - } + err = r.handleFinalizerRemoval(ctx, reqLog, nm) return res, err } @@ -394,10 +391,7 @@ func (r *NodeMaintenanceReconciler) handleDrainState(ctx context.Context, reqLog // remove finalizer if exists and return reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name) - err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName) - if err != nil { - reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name) - } + err = r.handleFinalizerRemoval(ctx, reqLog, nm) return res, err } @@ -508,31 +502,33 @@ func (r *NodeMaintenanceReconciler) updateDrainStatus(ctx context.Context, nm *m return nil } -func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) error { +func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) { reqLog.Info("Handle Ready NodeMaintenance") - // handle finalizers var err error + var res ctrl.Result if !nm.GetDeletionTimestamp().IsZero() { // object is being deleted, handle cleanup. reqLog.Info("NodeMaintenance object is deleting") + + if len(nm.Spec.AdditionalRequestors) > 0 { + reqLog.Info("additional requestors for node maintenance. waiting for list to clear, requeue request", "additionalRequestors", nm.Spec.AdditionalRequestors) + return ctrl.Result{Requeue: true, RequeueAfter: additionalRequestorsRequeueTime}, nil + } + if nm.Spec.Cordon { reqLog.Info("handle uncordon of node, ", "node", node.Name) err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node) if err != nil { - return err + return res, err } } // TODO(adrianc): unpause MCP in OCP when support is added. // remove finalizer if exists and return - reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name) - err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName) - if err != nil { - reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name) - } - return err + err = r.handleFinalizerRemoval(ctx, reqLog, nm) + return res, err } // set ready-time annotation @@ -541,11 +537,11 @@ func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339)) err := r.Update(ctx, nm) if err != nil { - return err + return res, err } } - return nil + return res, nil } // SetupWithManager sets up the controller with the Manager. diff --git a/internal/controller/nodemaintenance_controller_test.go b/internal/controller/nodemaintenance_controller_test.go index a35257b..059b266 100644 --- a/internal/controller/nodemaintenance_controller_test.go +++ b/internal/controller/nodemaintenance_controller_test.go @@ -205,6 +205,11 @@ var _ = Describe("NodeMaintenance Controller", func() { Within(time.Second).WithPolling(100 * time.Millisecond). Should(Equal(maintenancev1.ConditionReasonWaitForPodCompletion)) + By("Setting Additional requestor for node maintenance") + Expect(k8sClient.Get(testCtx, types.NamespacedName{Namespace: "default", Name: "test-nm"}, nm)).ToNot(HaveOccurred()) + nm.Spec.AdditionalRequestors = append(nm.Spec.AdditionalRequestors, "another-requestor.nvidia.com") + Expect(k8sClient.Update(testCtx, nm)).ToNot(HaveOccurred()) + By("After deleting wait for completion pod, NodeMaintenance is Draining") // NOTE(adrianc): for pods we must provide DeleteOptions as below else apiserver will not delete pod object var grace int64 @@ -244,8 +249,21 @@ var _ = Describe("NodeMaintenance Controller", func() { maintenancev1.ConditionReasonWaitForPodCompletion, maintenancev1.ConditionReasonDraining, maintenancev1.ConditionReasonReady)) - By("Should Uncordon node after NodeMaintenance is deleted") + By("Object is not removed when deleted due to having AdditionalRequestors") Expect(k8sClient.Delete(testCtx, nm)).ToNot(HaveOccurred()) + Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)). + Within(time.Second).WithPolling(100 * time.Millisecond). + Should(Succeed()) + + By("Node remains cordoned") + Expect(k8sClient.Get(testCtx, client.ObjectKeyFromObject(node), node)).ToNot(HaveOccurred()) + Expect(node.Spec.Unschedulable).To(BeTrue()) + + By("Remove AdditionalRequestros") + nm.Spec.AdditionalRequestors = nil + Expect(k8sClient.Update(testCtx, nm)).ToNot(HaveOccurred()) + + By("Should Uncordon node after NodeMaintenance is deleted") Eventually(func() bool { node := &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -255,6 +273,11 @@ var _ = Describe("NodeMaintenance Controller", func() { Expect(k8sClient.Get(testCtx, client.ObjectKeyFromObject(node), node)).ToNot(HaveOccurred()) return node.Spec.Unschedulable }).WithTimeout(10 * time.Second).WithPolling(1 * time.Second).Should(BeFalse()) + + By("Node maintenance no longer exists") + err := k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm) + Expect(err).To(HaveOccurred()) + Expect(k8serrors.IsNotFound(err)).To(BeTrue()) }) })