Add additional requestors #11

Merged (2 commits) on Aug 22, 2024
11 changes: 8 additions & 3 deletions api/v1alpha1/nodemaintenance_types.go
@@ -66,6 +66,13 @@ type NodeMaintenanceSpec struct {
// +kubebuilder:validation:MinLength=2
RequestorID string `json:"requestorID"`

// AdditionalRequestors is a set of additional requestor IDs which are using the same NodeMaintenance
// request. Addition or removal of requestor IDs to this list MUST be made with an update operation (and retry on failure),
// which will replace the entire list.
// +kubebuilder:validation:Optional
// +listType=set
AdditionalRequestors []string `json:"additionalRequestors,omitempty"`

// NodeName is the name of the node that maintenance operation will be performed on
// creation fails if node obj does not exist (webhook)
NodeName string `json:"nodeName"`
@@ -143,11 +150,9 @@ type PodEvictionFiterEntry struct {
type NodeMaintenanceStatus struct {
// Conditions represents observations of NodeMaintenance current state
// +kubebuilder:validation:Optional
// +patchMergeKey=type
// +patchStrategy=merge
// +listType=map
// +listMapKey=type
Conditions []metav1.Condition `json:"conditions,omitempty" patchMergeKey:"type" patchStrategy:"merge" protobuf:"bytes,1,rep,name=conditions"`
Conditions []metav1.Condition `json:"conditions,omitempty"`

// WaitForCompletion is the list of namespaced pod names that we wait to complete
WaitForCompletion []string `json:"waitForCompletion,omitempty"`
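
Because the list is replaced wholesale on every write, a consumer that wants to join an existing NodeMaintenance must read the latest object, append its ID, and retry the update on conflict. Below is a minimal sketch of that flow, assuming the operator's module path and a pre-configured controller-runtime client; the package and helper names are illustrative, not part of this PR.

package consumer

import (
	"context"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/util/retry"
	"sigs.k8s.io/controller-runtime/pkg/client"

	// Assumed module path; adjust to wherever api/v1alpha1 actually lives.
	maintenancev1 "github.com/Mellanox/maintenance-operator/api/v1alpha1"
)

// addRequestor appends id to Spec.AdditionalRequestors following the contract
// the field documents: fetch the latest object, replace the entire list via
// Update, and retry on conflict.
func addRequestor(ctx context.Context, c client.Client, key types.NamespacedName, id string) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		nm := &maintenancev1.NodeMaintenance{}
		if err := c.Get(ctx, key, nm); err != nil {
			return err
		}
		for _, r := range nm.Spec.AdditionalRequestors {
			if r == id {
				return nil // already present; the list has set semantics
			}
		}
		nm.Spec.AdditionalRequestors = append(nm.Spec.AdditionalRequestors, id)
		// Update replaces the whole list, as the API comment requires.
		return c.Update(ctx, nm)
	})
}

Since the field is declared with +listType=set, the API server also enforces uniqueness of entries on write.
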
5 changes: 5 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

9 changes: 9 additions & 0 deletions config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml
@@ -52,6 +52,15 @@ spec:
spec:
description: NodeMaintenanceSpec defines the desired state of NodeMaintenance
properties:
additionalRequestors:
description: |-
AdditionalRequestors is a set of additional requestor IDs which are using the same NodeMaintenance
request. Addition or removal of requestor IDs to this list MUST be made with an update operation (and retry on failure),
which will replace the entire list.
items:
type: string
type: array
x-kubernetes-list-type: set
cordon:
default: true
description: Cordon if set, marks node as unschedulable during maintenance
76 changes: 36 additions & 40 deletions internal/controller/nodemaintenance_controller.go
@@ -50,9 +50,10 @@ import (
)

var (
defaultMaxNodeMaintenanceTime = 1600 * time.Second
waitPodCompletionRequeueTime = 10 * time.Second
drainReqeueTime = 10 * time.Second
defaultMaxNodeMaintenanceTime = 1600 * time.Second
Review comment (Collaborator): nit: Time > Seconds? It is ok for me to keep the "time" postfix if you prefer so.

Reply (Author): let's keep as is :)

waitPodCompletionRequeueTime = 10 * time.Second
drainReqeueTime = 10 * time.Second
additionalRequestorsRequeueTime = 10 * time.Second
)

const (
@@ -198,7 +199,7 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
reqLog.Error(err, "failed to handle drain state for NodeMaintenance object")
}
case maintenancev1.ConditionReasonReady:
err = r.handleReadyState(ctx, reqLog, nm, node)
res, err = r.handleReadyState(ctx, reqLog, nm, node)
if err != nil {
reqLog.Error(err, "failed to handle Ready state for NodeMaintenance object")
}
@@ -207,6 +208,16 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return res, err
}

// handleFinalizerRemoval handles the removal of node maintenance finalizer
func (r *NodeMaintenanceReconciler) handleFinalizerRemoval(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error {
reqLog.Info("removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err := k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
return err
}

// handleUninitiaizedState handles NodeMaintenance in ConditionReasonUninitialized state
// it eventually sets NodeMaintenance Ready condition Reason to ConditionReasonPending
func (r *NodeMaintenanceReconciler) handleUninitiaizedState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error {
@@ -230,8 +241,8 @@ func (r *NodeMaintenanceReconciler) handleUninitiaizedState(ctx context.Context,
// it eventually sets NodeMaintenance Ready condition Reason to ConditionReasonWaitForPodCompletion
func (r *NodeMaintenanceReconciler) handleScheduledState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error {
reqLog.Info("Handle Scheduled NodeMaintenance")
// handle finalizers
var err error

if nm.GetDeletionTimestamp().IsZero() {
// conditionally add finalizer
err = k8sutils.AddFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
@@ -241,12 +252,8 @@ }
}
} else {
// object is being deleted, remove finalizer if exists and return
reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
return err
reqLog.Info("NodeMaintenance object is deleting", "namespace", nm.Namespace, "name", nm.Name)
return r.handleFinalizerRemoval(ctx, reqLog, nm)
}

// TODO(adrianc): in openshift, we should pause MCP here
@@ -271,6 +278,7 @@ func (r *NodeMaintenanceReconciler) handleCordonState(ctx context.Context, reqLo

if !nm.GetDeletionTimestamp().IsZero() {
reqLog.Info("NodeMaintenance object is deleting")

if nm.Spec.Cordon {
reqLog.Info("handle uncordon of node, ", "node", node.Name)
err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node)
Expand All @@ -280,13 +288,7 @@ func (r *NodeMaintenanceReconciler) handleCordonState(ctx context.Context, reqLo
}

// TODO(adrianc): unpause MCP in OCP when support is added.

reqLog.Info("remove maintenance finalizer for node maintenance", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
return err
return r.handleFinalizerRemoval(ctx, reqLog, nm)
}

if nm.Spec.Cordon {
@@ -317,6 +319,7 @@ func (r *NodeMaintenanceReconciler) handleWaitPodCompletionState(ctx context.Con
if !nm.GetDeletionTimestamp().IsZero() {
// object is being deleted, handle cleanup.
reqLog.Info("NodeMaintenance object is deleting")

if nm.Spec.Cordon {
reqLog.Info("handle uncordon of node, ", "node", node.Name)
err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node)
Expand All @@ -326,13 +329,7 @@ func (r *NodeMaintenanceReconciler) handleWaitPodCompletionState(ctx context.Con
}

// TODO(adrianc): unpause MCP in OCP when support is added.

// remove finalizer if exists and return
reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
err = r.handleFinalizerRemoval(ctx, reqLog, nm)
return res, err
}

@@ -394,10 +391,7 @@ func (r *NodeMaintenanceReconciler) handleDrainState(ctx context.Context, reqLog

// remove finalizer if exists and return
reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
err = r.handleFinalizerRemoval(ctx, reqLog, nm)
return res, err
}

@@ -512,31 +506,33 @@ func (r *NodeMaintenanceReconciler) updateDrainStatus(ctx context.Context, nm *m
return nil
}

func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) error {
func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) {
reqLog.Info("Handle Ready NodeMaintenance")
// handle finalizers
var err error
var res ctrl.Result

if !nm.GetDeletionTimestamp().IsZero() {
// object is being deleted, handle cleanup.
reqLog.Info("NodeMaintenance object is deleting")

if len(nm.Spec.AdditionalRequestors) > 0 {
reqLog.Info("additional requestors for node maintenance. waiting for list to clear, requeue request", "additionalRequestors", nm.Spec.AdditionalRequestors)
return ctrl.Result{Requeue: true, RequeueAfter: additionalRequestorsRequeueTime}, nil
}

if nm.Spec.Cordon {
reqLog.Info("handle uncordon of node, ", "node", node.Name)
err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node)
if err != nil {
return err
return res, err
}
}

// TODO(adrianc): unpause MCP in OCP when support is added.

// remove finalizer if exists and return
reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer", "namespace", nm.Namespace, "name", nm.Name)
err = k8sutils.RemoveFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName)
if err != nil {
reqLog.Error(err, "failed to remove finalizer for NodeMaintenance", "namespace", nm.Namespace, "name", nm.Name)
}
return err
err = r.handleFinalizerRemoval(ctx, reqLog, nm)
return res, err
}

// set ready-time annotation
Expand All @@ -545,11 +541,11 @@ func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339))
err := r.Update(ctx, nm)
if err != nil {
return err
return res, err
}
}

return nil
return res, nil
}

// SetupWithManager sets up the controller with the Manager.
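
The new branch in handleReadyState means deletion is now gated on this list: a deleting NodeMaintenance is requeued every additionalRequestorsRequeueTime (10 seconds) until Spec.AdditionalRequestors empties, and only then is the node uncordoned and the finalizer removed, as the new test below exercises. The release path for a remaining requestor mirrors the add flow; a sketch under the same assumptions (assumed module path, illustrative helper name):

package consumer

import (
	"context"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/util/retry"
	"sigs.k8s.io/controller-runtime/pkg/client"

	// Assumed module path, as in the earlier sketch.
	maintenancev1 "github.com/Mellanox/maintenance-operator/api/v1alpha1"
)

// removeRequestor drops id from Spec.AdditionalRequestors. Once the list is
// empty, the controller's next requeue of a deleting object proceeds to
// uncordon the node and remove the finalizer, letting deletion complete.
func removeRequestor(ctx context.Context, c client.Client, key types.NamespacedName, id string) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		nm := &maintenancev1.NodeMaintenance{}
		if err := c.Get(ctx, key, nm); err != nil {
			return err
		}
		var kept []string
		for _, r := range nm.Spec.AdditionalRequestors {
			if r != id {
				kept = append(kept, r)
			}
		}
		nm.Spec.AdditionalRequestors = kept // nil when id was the last entry
		return c.Update(ctx, nm)            // full-list replace, retried on conflict
	})
}
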
25 changes: 24 additions & 1 deletion internal/controller/nodemaintenance_controller_test.go
@@ -205,6 +205,11 @@ var _ = Describe("NodeMaintenance Controller", func() {
Within(time.Second).WithPolling(100 * time.Millisecond).
Should(Equal(maintenancev1.ConditionReasonWaitForPodCompletion))

By("Setting Additional requestor for node maintenance")
Expect(k8sClient.Get(testCtx, types.NamespacedName{Namespace: "default", Name: "test-nm"}, nm)).ToNot(HaveOccurred())
nm.Spec.AdditionalRequestors = append(nm.Spec.AdditionalRequestors, "another-requestor.nvidia.com")
Expect(k8sClient.Update(testCtx, nm)).ToNot(HaveOccurred())

By("After deleting wait for completion pod, NodeMaintenance is Draining")
// NOTE(adrianc): for pods we must provide DeleteOptions as below else apiserver will not delete pod object
var grace int64
@@ -244,8 +249,21 @@ var _ = Describe("NodeMaintenance Controller", func() {
maintenancev1.ConditionReasonWaitForPodCompletion, maintenancev1.ConditionReasonDraining,
maintenancev1.ConditionReasonReady))

By("Should Uncordon node after NodeMaintenance is deleted")
By("Object is not removed when deleted due to having AdditionalRequestors")
Expect(k8sClient.Delete(testCtx, nm)).ToNot(HaveOccurred())
Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)).
Within(time.Second).WithPolling(100 * time.Millisecond).
Should(Succeed())

By("Node remains cordoned")
Expect(k8sClient.Get(testCtx, client.ObjectKeyFromObject(node), node)).ToNot(HaveOccurred())
Expect(node.Spec.Unschedulable).To(BeTrue())

By("Remove AdditionalRequestros")
nm.Spec.AdditionalRequestors = nil
Expect(k8sClient.Update(testCtx, nm)).ToNot(HaveOccurred())

By("Should Uncordon node after NodeMaintenance is deleted")
Eventually(func() bool {
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -255,6 +273,11 @@ var _ = Describe("NodeMaintenance Controller", func() {
Expect(k8sClient.Get(testCtx, client.ObjectKeyFromObject(node), node)).ToNot(HaveOccurred())
return node.Spec.Unschedulable
}).WithTimeout(10 * time.Second).WithPolling(1 * time.Second).Should(BeFalse())

By("Node maintenance no longer exists")
err := k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)
Expect(err).To(HaveOccurred())
Expect(k8serrors.IsNotFound(err)).To(BeTrue())
})
})
