Skip to content

Commit

Permalink
Merge pull request #18 from adrianchiris/add-garbage-collector
Browse files Browse the repository at this point in the history
Add NodeMaintenance garbage collector
  • Loading branch information
ykulazhenkov authored Aug 27, 2024
2 parents f9fb4e5 + 3c91b75 commit e6859bc
Show file tree
Hide file tree
Showing 7 changed files with 375 additions and 82 deletions.
18 changes: 12 additions & 6 deletions cmd/maintenance-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,9 @@ func main() {
ctx := ctrl.SetupSignalHandler()
mgrClient := mgr.GetClient()

nmrOptions := controller.NewNodeMaintenanceReconcilerOptions()
if err = (&controller.NodeMaintenanceReconciler{
Client: mgrClient,
Scheme: mgr.GetScheme(),
Options: nmrOptions,
CordonHandler: cordon.NewCordonHandler(mgrClient, k8sInterface),
WaitPodCompletionHandler: podcompletion.NewPodCompletionHandler(mgrClient),
DrainManager: drain.NewManager(ctrl.Log.WithName("DrainManager"), ctx, k8sInterface),
Expand All @@ -162,6 +160,14 @@ func main() {
os.Exit(1)
}

gcOptions := controller.NewGarbageCollectorOptions()
gcLog := ctrl.Log.WithName("NodeMaintenanceGarbageCollector")
if err = controller.NewNodeMaintenanceGarbageCollector(
mgrClient, gcOptions, gcLog).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NodeMaintenanceGarbageCollector")
os.Exit(1)
}

nmsrOptions := controller.NewNodeMaintenanceSchedulerReconcilerOptions()
nmsrLog := ctrl.Log.WithName("NodeMaintenanceScheduler")
if err = (&controller.NodeMaintenanceSchedulerReconciler{
Expand All @@ -176,10 +182,10 @@ func main() {
}

if err = (&controller.MaintenanceOperatorConfigReconciler{
Client: mgrClient,
Scheme: mgr.GetScheme(),
NodeMaintenanceReconcierOptions: nmrOptions,
SchedulerReconcierOptions: nmsrOptions,
Client: mgrClient,
Scheme: mgr.GetScheme(),
GarbageCollectorOptions: gcOptions,
SchedulerReconcierOptions: nmsrOptions,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "MaintenanceOperatorConfig")
os.Exit(1)
Expand Down
169 changes: 169 additions & 0 deletions internal/controller/garbage_collector_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*
Copyright 2024, NVIDIA CORPORATION & AFFILIATES
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
"context"
"sync"
"time"

"github.com/go-logr/logr"
"github.com/pkg/errors"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

maintenancev1 "github.com/Mellanox/maintenance-operator/api/v1alpha1"
"github.com/Mellanox/maintenance-operator/internal/log"
)

var (
defaultMaxNodeMaintenanceTime = 1600 * time.Second
defaultGarbageCollectionReconcileTime = 5 * time.Minute
garbageCollectionReconcileTime = defaultGarbageCollectionReconcileTime
)

// GarbageCollectIgnoreAnnotation garbage collector will skip NodeMaintenance with this annotation.
const GarbageCollectIgnoreAnnotation = "maintenance.nvidia.com/garbage-collector.ignore"

// NewGarbageCollectorOptions creates new *GarbageCollectorOptions
func NewGarbageCollectorOptions() *GarbageCollectorOptions {
return &GarbageCollectorOptions{
pendingMaxNodeMaintenanceTime: defaultMaxNodeMaintenanceTime,
maxNodeMaintenanceTime: defaultMaxNodeMaintenanceTime,
}
}

// GarbageCollectorOptions are options for GarbageCollector where values
// are stored by external entity and read by GarbageCollector.
type GarbageCollectorOptions struct {
sync.Mutex

pendingMaxNodeMaintenanceTime time.Duration
maxNodeMaintenanceTime time.Duration
}

// Store maxNodeMaintenanceTime
func (gco *GarbageCollectorOptions) Store(maxNodeMaintenanceTime time.Duration) {
gco.Lock()
defer gco.Unlock()

gco.pendingMaxNodeMaintenanceTime = maxNodeMaintenanceTime
}

// Load loads the last Stored options
func (gco *GarbageCollectorOptions) Load() {
gco.Lock()
defer gco.Unlock()

gco.maxNodeMaintenanceTime = gco.pendingMaxNodeMaintenanceTime
}

// MaxNodeMaintenanceTime returns the last loaded MaxUnavailable option
func (gco *GarbageCollectorOptions) MaxNodeMaintenanceTime() time.Duration {
return gco.maxNodeMaintenanceTime
}

// NewNodeMaintenanceGarbageCollector creates a new NodeMaintenanceGarbageCollector
func NewNodeMaintenanceGarbageCollector(kClient client.Client, options *GarbageCollectorOptions, log logr.Logger) *NodeMaintenanceGarbageCollector {
return &NodeMaintenanceGarbageCollector{
Client: kClient,
options: options,
log: log,
}
}

// NodeMaintenanceGarbageCollector performs garbage collection for NodeMaintennace
type NodeMaintenanceGarbageCollector struct {
client.Client

options *GarbageCollectorOptions
log logr.Logger
}

// SetupWithManager sets up NodeMaintenanceGarbageCollector with controller manager
func (r *NodeMaintenanceGarbageCollector) SetupWithManager(mgr ctrl.Manager) error {
return mgr.Add(r)
}

// Reconcile collects garabage once
func (r *NodeMaintenanceGarbageCollector) Reconcile(ctx context.Context) error {
r.log.Info("periodic reconcile start")
r.options.Load()
r.log.V(log.DebugLevel).Info("loaded options", "maxNodeMaintenanceTime", r.options.MaxNodeMaintenanceTime())

mnl := &maintenancev1.NodeMaintenanceList{}
err := r.List(ctx, mnl)
if err != nil {
return errors.Wrap(err, "failed to list NodeMaintenance")
}

timeNow := time.Now()
for _, nm := range mnl.Items {
// skip NodeMaintenance with
nmLog := r.log.WithValues("namespace", nm.Namespace, "name", nm.Name)

if nm.Annotations[GarbageCollectIgnoreAnnotation] == "true" {
nmLog.Info("skipping NodeMaintenance due to ignore annotation")
continue
}

if nm.Annotations[ReadyTimeAnnotation] != "" {
readyTime, err := time.Parse(time.RFC3339, nm.Annotations[ReadyTimeAnnotation])
if err != nil {
nmLog.Error(err, "failed to parse ready-time annotation for NodeMaintenenace")
continue
}
if timeNow.After(readyTime.Add(r.options.MaxNodeMaintenanceTime())) {
nmLog.Info("NodeMaintenance is due for garbage collection")
if nm.GetDeletionTimestamp().IsZero() {
nmLog.Info("deleting NodeMaintenance")
if err = r.Delete(ctx, &nm); err != nil {
nmLog.Error(err, "failed to delete NodeMaintenance")
}
} else {
r.log.V(log.DebugLevel).Info("deletion timestamp already set for NodeMaintenance")
}
}
}
}

r.log.Info("periodic reconcile end")
return nil
}

// Start NodeMaintenanceGarbageCollector
func (r *NodeMaintenanceGarbageCollector) Start(ctx context.Context) error {
r.log.Info("NodeMaintenanceGarbageCollector Start")

t := time.NewTicker(garbageCollectionReconcileTime)
defer t.Stop()

OUTER:
for {
select {
case <-ctx.Done():
break OUTER
case <-t.C:
err := r.Reconcile(ctx)
if err != nil {
r.log.Error(err, "failed to run reconcile")
}
}
}

return nil
}
180 changes: 180 additions & 0 deletions internal/controller/garbage_collector_controller_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
/*
Copyright 2024, NVIDIA CORPORATION & AFFILIATES
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
"context"
"sync"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

maintenancev1 "github.com/Mellanox/maintenance-operator/api/v1alpha1"
"github.com/Mellanox/maintenance-operator/internal/testutils"
)

var _ = Describe("NodeMaintenance Controller", func() {
Context("Envtests", func() {
var nmObjectsToCleanup []*maintenancev1.NodeMaintenance
var reconciler *NodeMaintenanceGarbageCollector
var options *GarbageCollectorOptions
// test context, TODO(adrianc): use ginkgo spec context
var testCtx context.Context

BeforeEach(func() {
testCtx = context.Background()
garbageCollectionReconcileTime = 100 * time.Millisecond
DeferCleanup(func() {
garbageCollectionReconcileTime = defaultGarbageCollectionReconcileTime
})

// create controller manager
By("create controller manager")
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
Scheme: k8sClient.Scheme(),
Metrics: metricsserver.Options{BindAddress: "0"},
})
Expect(err).ToNot(HaveOccurred())

// create reconciler
By("create NodeMaintenanceGarbageCollector")
options = NewGarbageCollectorOptions()
options.Store(1 * time.Second)
reconciler = NewNodeMaintenanceGarbageCollector(
k8sClient, options, ctrllog.Log.WithName("NodeMaintenanceGarbageCollector"))

// setup reconciler with manager
By("setup NodeMaintenanceGarbageCollector with controller manager")
Expect(reconciler.SetupWithManager(mgr)).
ToNot(HaveOccurred())

// start manager
testMgrCtx, cancel := context.WithCancel(testCtx)
By("start manager")
wg := sync.WaitGroup{}
wg.Add(1)
go func() {
defer wg.Done()
defer GinkgoRecover()
By("Start controller manager")
err := mgr.Start(testMgrCtx)
Expect(err).ToNot(HaveOccurred())
}()

DeferCleanup(func() {
By("Shut down controller manager")
cancel()
wg.Wait()
})
})

AfterEach(func() {
By("Cleanup NodeMaintenance resources")
for _, nm := range nmObjectsToCleanup {
err := k8sClient.Delete(testCtx, nm)
if err != nil && k8serrors.IsNotFound(err) {
err = nil
}
Expect(err).ToNot(HaveOccurred())
}
By("Wait for NodeMaintenance resources to be deleted")
for _, nm := range nmObjectsToCleanup {
Eventually(func() bool {
err := k8sClient.Get(testCtx, types.NamespacedName{Namespace: nm.Namespace, Name: nm.Name}, nm)
if err != nil && k8serrors.IsNotFound(err) {
return true
}
return false

}).WithTimeout(10 * time.Second).WithPolling(1 * time.Second).Should(BeTrue())
}
nmObjectsToCleanup = make([]*maintenancev1.NodeMaintenance, 0)
})

It("Should Delete NodeMaintenance with ready time annotation", func() {
nm := testutils.GetTestNodeMaintenance("test-nm", "test-node-0", "some-operator.nvidia.com", "")
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339))
Expect(k8sClient.Create(testCtx, nm)).ToNot(HaveOccurred())
nmObjectsToCleanup = append(nmObjectsToCleanup, nm)

By("Consistently NodeMaintenance exists")
Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)).
Within(500 * time.Millisecond).
WithPolling(100 * time.Millisecond).
Should(Succeed())

By("Eventually NodeMaintenance is deleted")
Eventually(func() bool {
err := k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)
if err != nil && k8serrors.IsNotFound(err) {
return true
}
return false
}).
WithTimeout(1 * time.Second).
Should(BeTrue())
})

It("should not delete NodeMaintenance with ready time annotation if ignore garbage collection annotation set", func() {
nm := testutils.GetTestNodeMaintenance("test-nm", "test-node-0", "some-operator.nvidia.com", "")
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339))
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, GarbageCollectIgnoreAnnotation, "true")
Expect(k8sClient.Create(testCtx, nm)).ToNot(HaveOccurred())
nmObjectsToCleanup = append(nmObjectsToCleanup, nm)

By("Consistently NodeMaintenance exists")
Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)).
Within(2 * time.Second).
WithPolling(500 * time.Millisecond).
Should(Succeed())
})

It("should not delete NodeMaintenance without ready time annotation", func() {
nm := testutils.GetTestNodeMaintenance("test-nm", "test-node-0", "some-operator.nvidia.com", "")
Expect(k8sClient.Create(testCtx, nm)).ToNot(HaveOccurred())
nmObjectsToCleanup = append(nmObjectsToCleanup, nm)

By("Consistently NodeMaintenance exists")
Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)).
Within(2 * time.Second).
WithPolling(500 * time.Millisecond).
Should(Succeed())
})
})

Context("UnitTests", func() {
Context("GarbageCollectorOptions", func() {
It("Works", func() {
options := NewGarbageCollectorOptions()
Expect(options.MaxNodeMaintenanceTime()).To(Equal(defaultMaxNodeMaintenanceTime))
newTime := 300 * time.Second
options.Store(newTime)
Expect(options.MaxNodeMaintenanceTime()).To(Equal(defaultMaxNodeMaintenanceTime))
options.Load()
Expect(options.MaxNodeMaintenanceTime()).To(Equal(newTime))
})
})
})
})
Loading

0 comments on commit e6859bc

Please sign in to comment.