Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC: block operator upgrade when detecting outdated vms #3171

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions controllers/hyperconverged/hyperconverged_controller.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
package hyperconverged

import (
"bufio"
"cmp"
"context"
"crypto/tls"
"encoding/json"
"fmt"
"net"
"net/http"
"os"
"reflect"
"regexp"
"slices"
"sync"
"time"

"github.com/blang/semver/v4"
jsonpatch "github.com/evanphx/json-patch/v5"
Expand Down Expand Up @@ -101,6 +107,8 @@ var JSONPatchAnnotationNames = []string{
common.JSONPatchSSPAnnotationName,
}

// rhel8Regex matches any text that contains the substring "rhel8".
// checkPodMetrics applies it to each line of a pod's metrics response to spot
// outdated machine types. Because the pattern is wrapped in ".*" on both
// sides, FindString on a newline-free line returns the whole line, not just
// the "rhel8" token.
var rhel8Regex = regexp.MustCompile(`.*rhel8.*`)

// RegisterReconciler creates a new HyperConverged Reconciler and registers it into manager.
func RegisterReconciler(mgr manager.Manager, ci hcoutil.ClusterInfo, upgradeableCond hcoutil.Condition) error {
return add(mgr, newReconciler(mgr, ci, upgradeableCond), ci)
Expand Down Expand Up @@ -336,6 +344,11 @@ func (r *ReconcileHyperConverged) Reconcile(ctx context.Context, request reconci
return result, err
}

err = r.evaluateUpgradeEligibility(hcoRequest)
if err != nil {
hcoRequest.Logger.Error(err, "Failed to evaluate upgrade eligibility", "err", err)
return reconcile.Result{}, err
}
if err = r.setOperatorUpgradeableStatus(hcoRequest); err != nil {
return reconcile.Result{}, err
}
Expand Down Expand Up @@ -1330,6 +1343,101 @@ func (r *ReconcileHyperConverged) deleteObj(req *common.HcoRequest, obj client.O
return removed, nil
}

func (r *ReconcileHyperConverged) evaluateUpgradeEligibility(req *common.HcoRequest) error {
podList := &corev1.PodList{}
listOpts := []client.ListOption{
client.InNamespace(req.Namespace),
client.MatchingLabels{"kubevirt.io": "virt-controller"},
}

if err := r.client.List(req.Ctx, podList, listOpts...); err != nil {
req.Logger.Info("Failed to list virt-controller pods", "namespace", req.Namespace, "error", err)
return fmt.Errorf("failed to list virt-controller pods: %w", err)
}

if len(podList.Items) == 0 {
req.Logger.Info("No virt-controller pods found", "namespace", req.Namespace)
return nil
}
Comment on lines +1347 to +1361
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: does each pod collect info for only some of the VMs? In other words: can we query the service instead of each pod individually?

Copy link
Contributor Author

@dasionov dasionov Nov 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't have a service for virt-controller as far as I know, only for virt-api.
But I assume I can query just one of the pods, since they should all have the same metrics data.


httpClient := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
},
}

var wg sync.WaitGroup
var mu sync.Mutex
errorOccurred := false

for _, pod := range podList.Items {
if pod.Status.PodIP == "" {
continue
}

wg.Add(1)
go func(p corev1.Pod) {
defer wg.Done()
if err := r.checkPodMetrics(req, httpClient, p); err != nil {
req.Logger.Info("Error processing pod metrics", "pod", p.Name, "error", err)

mu.Lock()
errorOccurred = true
mu.Unlock()
}
}(pod)
}

wg.Wait()

if errorOccurred {
return fmt.Errorf("one or more errors occurred while checking pod metrics")
}

return nil
}

// podMetricsUpgradeableMu serializes the writes to HcoRequest.Upgradeable made
// by checkPodMetrics. evaluateUpgradeEligibility runs checkPodMetrics in one
// goroutine per pod against the same request, so an unguarded write would be a
// data race.
var podMetricsUpgradeableMu sync.Mutex

// checkPodMetrics scrapes the /metrics endpoint of a single pod over HTTPS and
// sets req.Upgradeable to false as soon as a line of the response matches
// rhel8Regex.
//
// The HTTP request is bound to a 3-second timeout derived from req.Ctx and is
// sent with the supplied httpClient to port 8443 of pod.Status.PodIP.
//
// It returns an error when the request cannot be built or sent, when the
// endpoint answers with a status other than 200, or when reading the body
// fails. Finding a match is not an error: the match is logged and nil is
// returned. Errors are only returned, not logged here; the caller logs them
// together with the pod name.
func (r *ReconcileHyperConverged) checkPodMetrics(req *common.HcoRequest, httpClient *http.Client, pod corev1.Pod) error {
	const (
		metricsPort    = "8443"
		requestTimeout = 3 * time.Second
		// maxLineBytes raises bufio.Scanner's default 64 KiB token limit so a
		// single long metrics line does not abort the scan with ErrTooLong.
		maxLineBytes = 1 << 20
	)

	ctx, cancel := context.WithTimeout(req.Ctx, requestTimeout)
	defer cancel()

	// JoinHostPort brackets IPv6 addresses; a plain "%s:%d" would produce an
	// invalid URL host for them.
	metricsURL := fmt.Sprintf("https://%s/metrics", net.JoinHostPort(pod.Status.PodIP, metricsPort))
	reqWithCtx, err := http.NewRequestWithContext(ctx, http.MethodGet, metricsURL, nil)
	if err != nil {
		return fmt.Errorf("failed to create HTTP request: %w", err)
	}

	resp, err := httpClient.Do(reqWithCtx)
	if err != nil {
		return fmt.Errorf("failed to query metrics: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("metrics endpoint returned status %d", resp.StatusCode)
	}

	scanner := bufio.NewScanner(resp.Body)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)
	for scanner.Scan() {
		line := scanner.Text()
		if !rhel8Regex.MatchString(line) {
			continue
		}

		req.Logger.Info("Detected outdated machine type in metrics", "pod", pod.Name, "matched", rhel8Regex.FindString(line))

		podMetricsUpgradeableMu.Lock()
		req.Upgradeable = false
		podMetricsUpgradeableMu.Unlock()

		// One match is enough to block the upgrade; skip the rest of the body.
		return nil
	}

	if err := scanner.Err(); err != nil {
		return fmt.Errorf("failed to scan metrics response: %w", err)
	}

	return nil
}

func removeOldQuickStartGuides(req *common.HcoRequest, cl client.Client, requiredQSList []string) {
existingQSList := &consolev1.ConsoleQuickStartList{}
req.Logger.Info("reading quickstart guides")
Expand Down
Loading