-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #18 from adrianchiris/add-garbage-collector
Add NodeMaintenance garbage collector
- Loading branch information
Showing
7 changed files
with
375 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
/* | ||
Copyright 2024, NVIDIA CORPORATION & AFFILIATES | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package controller | ||
|
||
import ( | ||
"context" | ||
"sync" | ||
"time" | ||
|
||
"github.com/go-logr/logr" | ||
"github.com/pkg/errors" | ||
ctrl "sigs.k8s.io/controller-runtime" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
|
||
maintenancev1 "github.com/Mellanox/maintenance-operator/api/v1alpha1" | ||
"github.com/Mellanox/maintenance-operator/internal/log" | ||
) | ||
|
||
var ( | ||
defaultMaxNodeMaintenanceTime = 1600 * time.Second | ||
defaultGarbageCollectionReconcileTime = 5 * time.Minute | ||
garbageCollectionReconcileTime = defaultGarbageCollectionReconcileTime | ||
) | ||
|
||
// GarbageCollectIgnoreAnnotation garbage collector will skip NodeMaintenance with this annotation. | ||
const GarbageCollectIgnoreAnnotation = "maintenance.nvidia.com/garbage-collector.ignore" | ||
|
||
// NewGarbageCollectorOptions creates new *GarbageCollectorOptions | ||
func NewGarbageCollectorOptions() *GarbageCollectorOptions { | ||
return &GarbageCollectorOptions{ | ||
pendingMaxNodeMaintenanceTime: defaultMaxNodeMaintenanceTime, | ||
maxNodeMaintenanceTime: defaultMaxNodeMaintenanceTime, | ||
} | ||
} | ||
|
||
// GarbageCollectorOptions are options for GarbageCollector where values | ||
// are stored by external entity and read by GarbageCollector. | ||
type GarbageCollectorOptions struct { | ||
sync.Mutex | ||
|
||
pendingMaxNodeMaintenanceTime time.Duration | ||
maxNodeMaintenanceTime time.Duration | ||
} | ||
|
||
// Store maxNodeMaintenanceTime | ||
func (gco *GarbageCollectorOptions) Store(maxNodeMaintenanceTime time.Duration) { | ||
gco.Lock() | ||
defer gco.Unlock() | ||
|
||
gco.pendingMaxNodeMaintenanceTime = maxNodeMaintenanceTime | ||
} | ||
|
||
// Load loads the last Stored options | ||
func (gco *GarbageCollectorOptions) Load() { | ||
gco.Lock() | ||
defer gco.Unlock() | ||
|
||
gco.maxNodeMaintenanceTime = gco.pendingMaxNodeMaintenanceTime | ||
} | ||
|
||
// MaxNodeMaintenanceTime returns the last loaded MaxUnavailable option | ||
func (gco *GarbageCollectorOptions) MaxNodeMaintenanceTime() time.Duration { | ||
return gco.maxNodeMaintenanceTime | ||
} | ||
|
||
// NewNodeMaintenanceGarbageCollector creates a new NodeMaintenanceGarbageCollector | ||
func NewNodeMaintenanceGarbageCollector(kClient client.Client, options *GarbageCollectorOptions, log logr.Logger) *NodeMaintenanceGarbageCollector { | ||
return &NodeMaintenanceGarbageCollector{ | ||
Client: kClient, | ||
options: options, | ||
log: log, | ||
} | ||
} | ||
|
||
// NodeMaintenanceGarbageCollector performs garbage collection for NodeMaintennace | ||
type NodeMaintenanceGarbageCollector struct { | ||
client.Client | ||
|
||
options *GarbageCollectorOptions | ||
log logr.Logger | ||
} | ||
|
||
// SetupWithManager sets up NodeMaintenanceGarbageCollector with controller manager | ||
func (r *NodeMaintenanceGarbageCollector) SetupWithManager(mgr ctrl.Manager) error { | ||
return mgr.Add(r) | ||
} | ||
|
||
// Reconcile collects garabage once | ||
func (r *NodeMaintenanceGarbageCollector) Reconcile(ctx context.Context) error { | ||
r.log.Info("periodic reconcile start") | ||
r.options.Load() | ||
r.log.V(log.DebugLevel).Info("loaded options", "maxNodeMaintenanceTime", r.options.MaxNodeMaintenanceTime()) | ||
|
||
mnl := &maintenancev1.NodeMaintenanceList{} | ||
err := r.List(ctx, mnl) | ||
if err != nil { | ||
return errors.Wrap(err, "failed to list NodeMaintenance") | ||
} | ||
|
||
timeNow := time.Now() | ||
for _, nm := range mnl.Items { | ||
// skip NodeMaintenance with | ||
nmLog := r.log.WithValues("namespace", nm.Namespace, "name", nm.Name) | ||
|
||
if nm.Annotations[GarbageCollectIgnoreAnnotation] == "true" { | ||
nmLog.Info("skipping NodeMaintenance due to ignore annotation") | ||
continue | ||
} | ||
|
||
if nm.Annotations[ReadyTimeAnnotation] != "" { | ||
readyTime, err := time.Parse(time.RFC3339, nm.Annotations[ReadyTimeAnnotation]) | ||
if err != nil { | ||
nmLog.Error(err, "failed to parse ready-time annotation for NodeMaintenenace") | ||
continue | ||
} | ||
if timeNow.After(readyTime.Add(r.options.MaxNodeMaintenanceTime())) { | ||
nmLog.Info("NodeMaintenance is due for garbage collection") | ||
if nm.GetDeletionTimestamp().IsZero() { | ||
nmLog.Info("deleting NodeMaintenance") | ||
if err = r.Delete(ctx, &nm); err != nil { | ||
nmLog.Error(err, "failed to delete NodeMaintenance") | ||
} | ||
} else { | ||
r.log.V(log.DebugLevel).Info("deletion timestamp already set for NodeMaintenance") | ||
} | ||
} | ||
} | ||
} | ||
|
||
r.log.Info("periodic reconcile end") | ||
return nil | ||
} | ||
|
||
// Start NodeMaintenanceGarbageCollector | ||
func (r *NodeMaintenanceGarbageCollector) Start(ctx context.Context) error { | ||
r.log.Info("NodeMaintenanceGarbageCollector Start") | ||
|
||
t := time.NewTicker(garbageCollectionReconcileTime) | ||
defer t.Stop() | ||
|
||
OUTER: | ||
for { | ||
select { | ||
case <-ctx.Done(): | ||
break OUTER | ||
case <-t.C: | ||
err := r.Reconcile(ctx) | ||
if err != nil { | ||
r.log.Error(err, "failed to run reconcile") | ||
} | ||
} | ||
} | ||
|
||
return nil | ||
} |
180 changes: 180 additions & 0 deletions
180
internal/controller/garbage_collector_controller_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
/* | ||
Copyright 2024, NVIDIA CORPORATION & AFFILIATES | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package controller | ||
|
||
import ( | ||
"context" | ||
"sync" | ||
"time" | ||
|
||
. "github.com/onsi/ginkgo/v2" | ||
. "github.com/onsi/gomega" | ||
k8serrors "k8s.io/apimachinery/pkg/api/errors" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/types" | ||
ctrl "sigs.k8s.io/controller-runtime" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
ctrllog "sigs.k8s.io/controller-runtime/pkg/log" | ||
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" | ||
|
||
maintenancev1 "github.com/Mellanox/maintenance-operator/api/v1alpha1" | ||
"github.com/Mellanox/maintenance-operator/internal/testutils" | ||
) | ||
|
||
var _ = Describe("NodeMaintenance Controller", func() { | ||
Context("Envtests", func() { | ||
var nmObjectsToCleanup []*maintenancev1.NodeMaintenance | ||
var reconciler *NodeMaintenanceGarbageCollector | ||
var options *GarbageCollectorOptions | ||
// test context, TODO(adrianc): use ginkgo spec context | ||
var testCtx context.Context | ||
|
||
BeforeEach(func() { | ||
testCtx = context.Background() | ||
garbageCollectionReconcileTime = 100 * time.Millisecond | ||
DeferCleanup(func() { | ||
garbageCollectionReconcileTime = defaultGarbageCollectionReconcileTime | ||
}) | ||
|
||
// create controller manager | ||
By("create controller manager") | ||
mgr, err := ctrl.NewManager(cfg, ctrl.Options{ | ||
Scheme: k8sClient.Scheme(), | ||
Metrics: metricsserver.Options{BindAddress: "0"}, | ||
}) | ||
Expect(err).ToNot(HaveOccurred()) | ||
|
||
// create reconciler | ||
By("create NodeMaintenanceGarbageCollector") | ||
options = NewGarbageCollectorOptions() | ||
options.Store(1 * time.Second) | ||
reconciler = NewNodeMaintenanceGarbageCollector( | ||
k8sClient, options, ctrllog.Log.WithName("NodeMaintenanceGarbageCollector")) | ||
|
||
// setup reconciler with manager | ||
By("setup NodeMaintenanceGarbageCollector with controller manager") | ||
Expect(reconciler.SetupWithManager(mgr)). | ||
ToNot(HaveOccurred()) | ||
|
||
// start manager | ||
testMgrCtx, cancel := context.WithCancel(testCtx) | ||
By("start manager") | ||
wg := sync.WaitGroup{} | ||
wg.Add(1) | ||
go func() { | ||
defer wg.Done() | ||
defer GinkgoRecover() | ||
By("Start controller manager") | ||
err := mgr.Start(testMgrCtx) | ||
Expect(err).ToNot(HaveOccurred()) | ||
}() | ||
|
||
DeferCleanup(func() { | ||
By("Shut down controller manager") | ||
cancel() | ||
wg.Wait() | ||
}) | ||
}) | ||
|
||
AfterEach(func() { | ||
By("Cleanup NodeMaintenance resources") | ||
for _, nm := range nmObjectsToCleanup { | ||
err := k8sClient.Delete(testCtx, nm) | ||
if err != nil && k8serrors.IsNotFound(err) { | ||
err = nil | ||
} | ||
Expect(err).ToNot(HaveOccurred()) | ||
} | ||
By("Wait for NodeMaintenance resources to be deleted") | ||
for _, nm := range nmObjectsToCleanup { | ||
Eventually(func() bool { | ||
err := k8sClient.Get(testCtx, types.NamespacedName{Namespace: nm.Namespace, Name: nm.Name}, nm) | ||
if err != nil && k8serrors.IsNotFound(err) { | ||
return true | ||
} | ||
return false | ||
|
||
}).WithTimeout(10 * time.Second).WithPolling(1 * time.Second).Should(BeTrue()) | ||
} | ||
nmObjectsToCleanup = make([]*maintenancev1.NodeMaintenance, 0) | ||
}) | ||
|
||
It("Should Delete NodeMaintenance with ready time annotation", func() { | ||
nm := testutils.GetTestNodeMaintenance("test-nm", "test-node-0", "some-operator.nvidia.com", "") | ||
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339)) | ||
Expect(k8sClient.Create(testCtx, nm)).ToNot(HaveOccurred()) | ||
nmObjectsToCleanup = append(nmObjectsToCleanup, nm) | ||
|
||
By("Consistently NodeMaintenance exists") | ||
Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)). | ||
Within(500 * time.Millisecond). | ||
WithPolling(100 * time.Millisecond). | ||
Should(Succeed()) | ||
|
||
By("Eventually NodeMaintenance is deleted") | ||
Eventually(func() bool { | ||
err := k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm) | ||
if err != nil && k8serrors.IsNotFound(err) { | ||
return true | ||
} | ||
return false | ||
}). | ||
WithTimeout(1 * time.Second). | ||
Should(BeTrue()) | ||
}) | ||
|
||
It("should not delete NodeMaintenance with ready time annotation if ignore garbage collection annotation set", func() { | ||
nm := testutils.GetTestNodeMaintenance("test-nm", "test-node-0", "some-operator.nvidia.com", "") | ||
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339)) | ||
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, GarbageCollectIgnoreAnnotation, "true") | ||
Expect(k8sClient.Create(testCtx, nm)).ToNot(HaveOccurred()) | ||
nmObjectsToCleanup = append(nmObjectsToCleanup, nm) | ||
|
||
By("Consistently NodeMaintenance exists") | ||
Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)). | ||
Within(2 * time.Second). | ||
WithPolling(500 * time.Millisecond). | ||
Should(Succeed()) | ||
}) | ||
|
||
It("should not delete NodeMaintenance without ready time annotation", func() { | ||
nm := testutils.GetTestNodeMaintenance("test-nm", "test-node-0", "some-operator.nvidia.com", "") | ||
Expect(k8sClient.Create(testCtx, nm)).ToNot(HaveOccurred()) | ||
nmObjectsToCleanup = append(nmObjectsToCleanup, nm) | ||
|
||
By("Consistently NodeMaintenance exists") | ||
Consistently(k8sClient.Get(testCtx, client.ObjectKeyFromObject(nm), nm)). | ||
Within(2 * time.Second). | ||
WithPolling(500 * time.Millisecond). | ||
Should(Succeed()) | ||
}) | ||
}) | ||
|
||
Context("UnitTests", func() { | ||
Context("GarbageCollectorOptions", func() { | ||
It("Works", func() { | ||
options := NewGarbageCollectorOptions() | ||
Expect(options.MaxNodeMaintenanceTime()).To(Equal(defaultMaxNodeMaintenanceTime)) | ||
newTime := 300 * time.Second | ||
options.Store(newTime) | ||
Expect(options.MaxNodeMaintenanceTime()).To(Equal(defaultMaxNodeMaintenanceTime)) | ||
options.Load() | ||
Expect(options.MaxNodeMaintenanceTime()).To(Equal(newTime)) | ||
}) | ||
}) | ||
}) | ||
}) |
Oops, something went wrong.