Skip to content

Commit

Permalink
Convey failed maintenance by requestor
Browse files Browse the repository at this point in the history
This commit adds support for requestors to convey failed
node maintenance.

- Add RequestorFailed condition type, to be set by requestors
  with reason MaintenanceFailed if maintenance operation failed

- Add new condition reason to Ready condition : MaintenanceFailed.
  this reason is set by controller if Ready condition is with Ready
  reason and RequestorFailed condition is preset with value True.

Failed condition type conveys failure in performing node
maintenance by the requestor.

Signed-off-by: adrianc <[email protected]>
  • Loading branch information
adrianchiris committed Aug 27, 2024
1 parent e6859bc commit 5318d07
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 22 deletions.
19 changes: 16 additions & 3 deletions api/v1alpha1/nodemaintenance_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,17 @@ import (
)

const (
// ConditionTypeReady is the Ready Condition.Type for NodeMaintenance
// ConditionTypeReady is the Ready Condition type for NodeMaintenance.
// It is used to convey node readiness for maintenance operation by the requestor.
ConditionTypeReady string = "Ready"
// ConditionTypeRequestorFailed is the RequestorFailed Condition type for NodeMaintenance.
// It is used to convey failure during node maintenance operation by the requestor.
ConditionTypeRequestorFailed string = "RequestorFailed"
)

const (
// Condition Reasons for ConditionTypeReady condition type.

// ConditionReasonUninitialized means that Condition is unset for NodeMaintenance
ConditionReasonUninitialized string = ""
// ConditionReasonPending means that NodeMaintenance is in Pending state
Expand All @@ -40,8 +46,14 @@ const (
ConditionReasonDraining string = "Draining"
// ConditionReasonReady means that NodeMaintenance is in Ready state
ConditionReasonReady string = "Ready"
// ConditionReasonAborted means that NodeMaintenance is in Aborted state
ConditionReasonAborted string = "Aborted"
// ConditionReasonRequestorFailed means that NodeMaintenance operation by the requestor has failed
// garbage collection will not occur if this reason is set.
ConditionReasonRequestorFailed string = ConditionTypeRequestorFailed

// Condition Reasons for ConditionTypeRequestorFailed condition type.

// ConditionReasonFailedMaintenance means that maintenance operation failed in a non-recoverable way
ConditionReasonFailedMaintenance string = "FailedMaintenance"
)

const (
Expand Down Expand Up @@ -186,6 +198,7 @@ type DrainStatus struct {
// +kubebuilder:printcolumn:name="Requestor",type="string",JSONPath=`.spec.requestorID`
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=`.status.conditions[?(@.type=='Ready')].status`
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=`.status.conditions[?(@.type=='Ready')].reason`
// +kubebuilder:printcolumn:name="Failed",type="string",JSONPath=`.status.conditions[?(@.type=='Failed')].reason`

// NodeMaintenance is the Schema for the nodemaintenances API
type NodeMaintenance struct {
Expand Down
3 changes: 3 additions & 0 deletions config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ spec:
- jsonPath: .status.conditions[?(@.type=='Ready')].reason
name: Phase
type: string
- jsonPath: .status.conditions[?(@.type=='Failed')].reason
name: Failed
type: string
name: v1alpha1
schema:
openAPIV3Schema:
Expand Down
2 changes: 2 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ rules:
- events
verbs:
- create
- patch
- update
- apiGroups:
- ""
resources:
Expand Down
74 changes: 55 additions & 19 deletions internal/controller/nodemaintenance_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,18 @@
package controller

import (
"cmp"
"context"
"errors"
"fmt"
"reflect"
"slices"
"time"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -73,7 +76,7 @@ type NodeMaintenanceReconciler struct {
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances/finalizers,verbs=update
//+kubebuilder:rbac:groups="",resources=events,verbs=create
//+kubebuilder:rbac:groups="",resources=events,verbs=create;update;patch
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;update;patch
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;watch;list;update;patch;delete
//+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;get;list;update;patch;delete
Expand Down Expand Up @@ -153,8 +156,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
if err != nil {
reqLog.Error(err, "failed to handle drain state for NodeMaintenance object")
}
case maintenancev1.ConditionReasonReady:
res, err = r.handleReadyState(ctx, reqLog, nm, node)
case maintenancev1.ConditionReasonReady, maintenancev1.ConditionReasonRequestorFailed:
res, err = r.handleTerminalState(ctx, reqLog, nm, node)
if err != nil {
reqLog.Error(err, "failed to handle Ready state for NodeMaintenance object")
}
Expand Down Expand Up @@ -461,8 +464,8 @@ func (r *NodeMaintenanceReconciler) updateDrainStatus(ctx context.Context, nm *m
return nil
}

func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) {
reqLog.Info("Handle Ready NodeMaintenance")
func (r *NodeMaintenanceReconciler) handleTerminalState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) {
reqLog.Info("Handle Ready/RequestorFailed NodeMaintenance")
var err error
var res ctrl.Result

Expand Down Expand Up @@ -494,12 +497,38 @@ func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog
if !metav1.HasAnnotation(nm.ObjectMeta, ReadyTimeAnnotation) ||
nm.Annotations[ReadyTimeAnnotation] == "" {
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339))
err := r.Update(ctx, nm)
err = r.Update(ctx, nm)
if err != nil {
return res, err
}
}

// check RequestorFailed condition
currentReason := k8sutils.GetReadyConditionReason(nm)
conditionChanged := false
cond := meta.FindStatusCondition(nm.Status.Conditions, maintenancev1.ConditionTypeRequestorFailed)
if cond != nil && cond.Status == metav1.ConditionTrue {
// set Ready condition to RequestorFailed reason if not already set
if currentReason == maintenancev1.ConditionReasonReady {
err = k8sutils.SetReadyConditionReason(ctx, r.Client, nm, maintenancev1.ConditionReasonRequestorFailed)
conditionChanged = true
}
} else {
// set Ready condition to Ready reason if not already set
if currentReason == maintenancev1.ConditionReasonRequestorFailed {
err = k8sutils.SetReadyConditionReason(ctx, r.Client, nm, maintenancev1.ConditionReasonReady)
conditionChanged = true
}
}
if err != nil {
return res, err
}

if conditionChanged {
r.EventRecorder.Event(
nm, corev1.EventTypeNormal, maintenancev1.ConditionChangedEventType, k8sutils.GetReadyConditionReason(nm))
}

return res, nil
}

Expand All @@ -508,32 +537,32 @@ func (r *NodeMaintenanceReconciler) SetupWithManager(mgr ctrl.Manager, log logr.
r.EventRecorder = mgr.GetEventRecorderFor("nodemaintenancereconciler")

return ctrl.NewControllerManagedBy(mgr).
For(&maintenancev1.NodeMaintenance{}, builder.WithPredicates(NewReadyConditionChangedPredicate(log))).
For(&maintenancev1.NodeMaintenance{}, builder.WithPredicates(NewConditionChangedPredicate(log))).
Complete(r)
}

// NewReadyConditionChangedPredicate creates a new ReadyConditionChangedPredicate
func NewReadyConditionChangedPredicate(log logr.Logger) ReadyConditionChangedPredicate {
return ReadyConditionChangedPredicate{
// NewConditionChangedPredicate creates a new ConditionChangedPredicate
func NewConditionChangedPredicate(log logr.Logger) ConditionChangedPredicate {
return ConditionChangedPredicate{
Funcs: predicate.Funcs{},
log: log,
}
}

// ReadyConditionChangedPredicate will trigger enqueue of Event for reconcile in the following cases:
// 1. A change in NodeMaintenance Ready Condition
// ConditionChangedPredicate will trigger enqueue of Event for reconcile in the following cases:
// 1. A change in NodeMaintenance Conditions
// 2. Update to the object occurred and deletion timestamp is set
// 3. NodeMaintenance created
// 4. NodeMaintenance deleted
// 5. generic event received
type ReadyConditionChangedPredicate struct {
type ConditionChangedPredicate struct {
predicate.Funcs

log logr.Logger
}

// Update implements Predicate.
func (p ReadyConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.Object]) bool {
func (p ConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.Object]) bool {
if e.ObjectOld == nil {
p.log.Error(nil, "old object is nil in update event, ignoring event.")
return false
Expand All @@ -555,15 +584,22 @@ func (p ReadyConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.O
return false
}

oldRCR := k8sutils.GetReadyConditionReason(oldO)
newRCR := k8sutils.GetReadyConditionReason(newO)
cmpByType := func(a, b metav1.Condition) int {
return cmp.Compare(a.Type, b.Type)
}

// sort old and new obj.Status.Conditions so they can be compared using DeepEqual
slices.SortFunc(oldO.Status.Conditions, cmpByType)
slices.SortFunc(newO.Status.Conditions, cmpByType)

process := oldRCR != newRCR || !newO.GetDeletionTimestamp().IsZero()
condChanged := !reflect.DeepEqual(oldO.Status.Conditions, newO.Status.Conditions)
deleting := !newO.GetDeletionTimestamp().IsZero()
process := condChanged || deleting

p.log.V(operatorlog.DebugLevel).Info("Update event for NodeMaintenance",
"name", newO.Name, "namespace", newO.Namespace,
"condition-changed", oldRCR != newRCR, "old", oldRCR, "new", newRCR,
"deleting", !newO.GetDeletionTimestamp().IsZero(), "process", process)
"condition-changed", condChanged,
"deleting", deleting, "process", process)

return process
}

0 comments on commit 5318d07

Please sign in to comment.