From 9ccb13ee33cc336d3dc3409c7ac673cf09aa3598 Mon Sep 17 00:00:00 2001
From: avlitman <alitman@redhat.com>
Date: Mon, 23 Dec 2024 18:38:48 +0200
Subject: [PATCH] Fix hco system health metric values

kubevirt_hco_system_health_status values changed to value+1,
this pr remove unknown from the options and change back the values to be
again 0 for health, 1 for warning and 2 for error.

also tests and related recording rules and alerts updated accordingly.

Signed-off-by: avlitman <alitman@redhat.com>
---
 .../hyperconverged_controller.go              | 43 ++++++++-----------
 .../hyperconverged_controller_test.go         | 10 ++---
 hack/prom-rule-ci/prom-rules-tests.yaml       | 27 ++++++------
 pkg/monitoring/metrics/operator_metrics.go    | 32 +++++---------
 .../metrics/operator_metrics_test.go          | 16 +++----
 pkg/monitoring/rules/alerts/health_alerts.go  |  4 +-
 6 files changed, 54 insertions(+), 78 deletions(-)

diff --git a/controllers/hyperconverged/hyperconverged_controller.go b/controllers/hyperconverged/hyperconverged_controller.go
index cf25852f8..913ab6126 100644
--- a/controllers/hyperconverged/hyperconverged_controller.go
+++ b/controllers/hyperconverged/hyperconverged_controller.go
@@ -1011,6 +1011,8 @@ func (r *ReconcileHyperConverged) updateConditions(req *common.HcoRequest) {
 		req.Instance.Status.SystemHealthStatus = systemHealthStatus
 		req.StatusDirty = true
 	}
+
+	metrics.SetHCOMetricSystemHealthStatus(getNumericalHealthStatus(systemHealthStatus))
 }
 
 func (r *ReconcileHyperConverged) setLabels(req *common.HcoRequest) {
@@ -1066,42 +1068,23 @@ func (r *ReconcileHyperConverged) detectTaintedConfiguration(req *common.HcoRequ
 }
 
 func (r *ReconcileHyperConverged) getSystemHealthStatus(conditions common.HcoConditions) string {
-	if isError, reason := isSystemHealthStatusError(conditions); isError {
-		metrics.SetHCOSystemError(reason)
+	if isSystemHealthStatusError(conditions) {
 		return systemHealthStatusError
 	}
 
-	if isWarning, reason := isSystemHealthStatusWarning(conditions); isWarning {
-		metrics.SetHCOSystemWarning(reason)
+	if isSystemHealthStatusWarning(conditions) {
 		return systemHealthStatusWarning
 	}
 
-	metrics.SetHCOSystemHealthy()
 	return systemHealthStatusHealthy
 }
 
-func isSystemHealthStatusError(conditions common.HcoConditions) (bool, string) {
-	if cond, found := conditions.GetCondition(hcov1beta1.ConditionDegraded); found && cond.Status == metav1.ConditionTrue {
-		return true, cond.Reason
-	}
-
-	if cond, found := conditions.GetCondition(hcov1beta1.ConditionAvailable); found && cond.Status != metav1.ConditionTrue {
-		return true, cond.Reason
-	}
-
-	return false, ""
+func isSystemHealthStatusError(conditions common.HcoConditions) bool {
+	return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionAvailable) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionDegraded)
 }
 
-func isSystemHealthStatusWarning(conditions common.HcoConditions) (bool, string) {
-	if cond, found := conditions.GetCondition(hcov1beta1.ConditionProgressing); found && cond.Status == metav1.ConditionTrue {
-		return true, cond.Reason
-	}
-
-	if cond, found := conditions.GetCondition(hcov1beta1.ConditionReconcileComplete); found && cond.Status != metav1.ConditionTrue {
-		return true, cond.Reason
-	}
-
-	return false, ""
+func isSystemHealthStatusWarning(conditions common.HcoConditions) bool {
+	return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionReconcileComplete) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionProgressing)
 }
 
 func getNumOfChangesJSONPatch(jsonPatch string) int {
@@ -1112,6 +1095,16 @@ func getNumOfChangesJSONPatch(jsonPatch string) int {
 	return len(patches)
 }
 
+func getNumericalHealthStatus(status string) float64 {
+	healthStatusCodes := map[string]float64{
+		systemHealthStatusHealthy: metrics.SystemHealthStatusHealthy,
+		systemHealthStatusWarning: metrics.SystemHealthStatusWarning,
+		systemHealthStatusError:   metrics.SystemHealthStatusError,
+	}
+
+	return healthStatusCodes[status]
+}
+
 func (r *ReconcileHyperConverged) firstLoopInitialization(request *common.HcoRequest) {
 	// Initialize operand handler.
 	r.operandHandler.FirstUseInitiation(r.scheme, hcoutil.GetClusterInfo(), request.Instance)
diff --git a/controllers/hyperconverged/hyperconverged_controller_test.go b/controllers/hyperconverged/hyperconverged_controller_test.go
index 0911388bb..0e98341e3 100644
--- a/controllers/hyperconverged/hyperconverged_controller_test.go
+++ b/controllers/hyperconverged/hyperconverged_controller_test.go
@@ -192,7 +192,7 @@ var _ = Describe("HyperconvergedController", func() {
 					Message: "Initializing HyperConverged cluster",
 				})))
 
-				verifySystemHealthStatusError(foundResource, reconcileInit)
+				verifySystemHealthStatusError(foundResource)
 
 				expectedFeatureGates := []string{
 					"CPUManager",
@@ -307,7 +307,7 @@ var _ = Describe("HyperconvergedController", func() {
 					Message: reconcileCompletedMessage,
 				})))
 
-				verifySystemHealthStatusError(foundResource, "SSPConditions")
+				verifySystemHealthStatusError(foundResource)
 
 				Expect(foundResource.Status.RelatedObjects).To(HaveLen(21))
 				expectedRef := corev1.ObjectReference{
@@ -3794,15 +3794,15 @@ func verifyHyperConvergedCRExistsMetricFalse() {
 func verifySystemHealthStatusHealthy(hco *hcov1beta1.HyperConverged) {
 	ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusHealthy))
 
-	systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus("healthy")
+	systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus()
 	ExpectWithOffset(1, err).ToNot(HaveOccurred())
 	ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusHealthy))
 }
 
-func verifySystemHealthStatusError(hco *hcov1beta1.HyperConverged, reason string) {
+func verifySystemHealthStatusError(hco *hcov1beta1.HyperConverged) {
 	ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusError))
 
-	systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus(reason)
+	systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus()
 	ExpectWithOffset(1, err).ToNot(HaveOccurred())
 	ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusError))
 }
diff --git a/hack/prom-rule-ci/prom-rules-tests.yaml b/hack/prom-rule-ci/prom-rules-tests.yaml
index cd69db952..cb4422788 100755
--- a/hack/prom-rule-ci/prom-rules-tests.yaml
+++ b/hack/prom-rule-ci/prom-rules-tests.yaml
@@ -610,7 +610,7 @@ tests:
   input_series:
   - series: 'kubevirt_hco_system_health_status'
     # time:  0     1     2     3     4     5     6     7     8     9     10     11
-    values: "1     1     1     1     2     2     2     2     3     3      3      3"
+    values: "0     0     0     0     1     1     1     1     2     2      2      2"
   - series: 'ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="warning"}'
     # time:  0     1     2     3     4     5     6     7     8     9     10     11
     values: "1     1   stale stale   1     1   stale stale   1     1    stale  stale"
@@ -774,25 +774,26 @@ tests:
 # Test OperatorConditionsUnhealthy
 - interval: 1m
   input_series:
-    - series: 'kubevirt_hco_system_health_status{reason="healthy"}'
-    - values: "stale 1 stale stale stale"
+    - series: 'kubevirt_hco_system_health_status'
+    - values: "stale stale 2 stale"
 
-    - series: 'kubevirt_hco_system_health_status{reason="SOME_ERROR"}'
-      values: "stale stale stale 3 stale"
-
-    - series: 'kubevirt_hco_system_health_status{reason="SOME_WARNING"}'
-      values: "stale stale stale stale stale 2 stale"
+    - series: 'kubevirt_hco_system_health_status'
+      values: "stale stale 2 stale 1 stale"
 
   alert_rule_test:
     - eval_time: 1m
       alertname: OperatorConditionsUnhealthy
       exp_alerts: [ ]
 
-    - eval_time: 3m
+    - eval_time: 1m
+      alertname: OperatorConditionsUnhealthy
+      exp_alerts: [ ]
+
+    - eval_time: 2m
       alertname: OperatorConditionsUnhealthy
       exp_alerts:
         - exp_annotations:
-            description: "HCO and its secondary resources are in a critical state due to SOME_ERROR."
+            description: "HCO and its secondary resources are in a critical state due to system error."
             summary: "HCO and its secondary resources are in a critical state."
             runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy"
           exp_labels:
@@ -800,13 +801,12 @@ tests:
             operator_health_impact: "critical"
             kubernetes_operator_part_of: "kubevirt"
             kubernetes_operator_component: "hyperconverged-cluster-operator"
-            reason: "SOME_ERROR"
 
-    - eval_time: 5m
+    - eval_time: 4m
       alertname: OperatorConditionsUnhealthy
       exp_alerts:
         - exp_annotations:
-            description: "HCO and its secondary resources are in a warning state due to SOME_WARNING."
+            description: "HCO and its secondary resources are in a warning state due to system warning."
             summary: "HCO and its secondary resources are in a warning state."
             runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy"
           exp_labels:
@@ -814,4 +814,3 @@ tests:
             operator_health_impact: "warning"
             kubernetes_operator_part_of: "kubevirt"
             kubernetes_operator_component: "hyperconverged-cluster-operator"
-            reason: "SOME_WARNING"
diff --git a/pkg/monitoring/metrics/operator_metrics.go b/pkg/monitoring/metrics/operator_metrics.go
index 2190659ef..64aea2bd4 100644
--- a/pkg/monitoring/metrics/operator_metrics.go
+++ b/pkg/monitoring/metrics/operator_metrics.go
@@ -16,8 +16,7 @@ const (
 )
 
 const (
-	SystemHealthStatusUnknown float64 = iota
-	SystemHealthStatusHealthy
+	SystemHealthStatusHealthy float64 = iota
 	SystemHealthStatusWarning
 	SystemHealthStatusError
 )
@@ -53,12 +52,11 @@ var (
 		},
 	)
 
-	systemHealthStatus = operatormetrics.NewGaugeVec(
+	systemHealthStatus = operatormetrics.NewGauge(
 		operatormetrics.MetricOpts{
 			Name: "kubevirt_hco_system_health_status",
 			Help: "Indicates whether the system health status is healthy (0), warning (1), or error (2), by aggregating the conditions of HCO and its secondary resources",
 		},
-		[]string{"reason"},
 	)
 )
 
@@ -119,30 +117,20 @@ func IsHCOMetricHyperConvergedExists() (bool, error) {
 	return value == hyperConvergedExists, nil
 }
 
-func SetHCOSystemHealthy() {
-	systemHealthStatus.Reset()
-	systemHealthStatus.WithLabelValues("healthy").Set(SystemHealthStatusHealthy)
-}
-
-func SetHCOSystemWarning(reason string) {
-	systemHealthStatus.Reset()
-	systemHealthStatus.WithLabelValues(reason).Set(SystemHealthStatusWarning)
-}
-
-func SetHCOSystemError(reason string) {
-	systemHealthStatus.Reset()
-	systemHealthStatus.WithLabelValues(reason).Set(SystemHealthStatusError)
+func SetHCOMetricSystemHealthStatus(status float64) {
+	systemHealthStatus.Set(status)
 }
 
 // GetHCOMetricSystemHealthStatus returns current value of gauge. If error is not nil then value is undefined
-func GetHCOMetricSystemHealthStatus(reason string) (float64, error) {
+func GetHCOMetricSystemHealthStatus() (float64, error) {
 	dto := &ioprometheusclient.Metric{}
-	err := systemHealthStatus.WithLabelValues(reason).Write(dto)
+	err := systemHealthStatus.Write(dto)
+	value := dto.Gauge.GetValue()
+
 	if err != nil {
-		return SystemHealthStatusUnknown, err
+		return 0, err
 	}
-
-	return dto.Gauge.GetValue(), nil
+	return value, nil
 }
 
 func getLabelsForObj(kind string, name string) string {
diff --git a/pkg/monitoring/metrics/operator_metrics_test.go b/pkg/monitoring/metrics/operator_metrics_test.go
index 3b3df11fd..26d758165 100644
--- a/pkg/monitoring/metrics/operator_metrics_test.go
+++ b/pkg/monitoring/metrics/operator_metrics_test.go
@@ -9,20 +9,16 @@ import (
 
 var _ = Describe("Operator Metrics", func() {
 	Context("kubevirt_hco_system_health_status", func() {
-		It("should set system error reason correctly", func() {
-			metrics.SetHCOSystemError("Reason1")
-			v, err := metrics.GetHCOMetricSystemHealthStatus("Reason1")
+		It("should set the correct system health status", func() {
+			metrics.SetHCOMetricSystemHealthStatus(2)
+			v, err := metrics.GetHCOMetricSystemHealthStatus()
 			Expect(err).ToNot(HaveOccurred())
 			Expect(v).To(Equal(metrics.SystemHealthStatusError))
 
-			metrics.SetHCOSystemError("Reason2")
-			v, err = metrics.GetHCOMetricSystemHealthStatus("Reason2")
+			metrics.SetHCOMetricSystemHealthStatus(1)
+			v, err = metrics.GetHCOMetricSystemHealthStatus()
 			Expect(err).ToNot(HaveOccurred())
-			Expect(v).To(Equal(metrics.SystemHealthStatusError))
-
-			v, err = metrics.GetHCOMetricSystemHealthStatus("Reason1")
-			Expect(err).ToNot(HaveOccurred())
-			Expect(v).To(Equal(metrics.SystemHealthStatusUnknown))
+			Expect(v).To(Equal(metrics.SystemHealthStatusWarning))
 		})
 	})
 })
diff --git a/pkg/monitoring/rules/alerts/health_alerts.go b/pkg/monitoring/rules/alerts/health_alerts.go
index 8ab62dfd6..518e9fe1d 100644
--- a/pkg/monitoring/rules/alerts/health_alerts.go
+++ b/pkg/monitoring/rules/alerts/health_alerts.go
@@ -15,7 +15,7 @@ func healthAlerts() []promv1.Rule {
 			Alert: "OperatorConditionsUnhealthy",
 			Expr:  intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusError)),
 			Annotations: map[string]string{
-				"description": "HCO and its secondary resources are in a critical state due to {{ $labels.reason }}.",
+				"description": "HCO and its secondary resources are in a critical state due to system error.",
 				"summary":     "HCO and its secondary resources are in a critical state.",
 			},
 			Labels: map[string]string{
@@ -27,7 +27,7 @@ func healthAlerts() []promv1.Rule {
 			Alert: "OperatorConditionsUnhealthy",
 			Expr:  intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusWarning)),
 			Annotations: map[string]string{
-				"description": "HCO and its secondary resources are in a warning state due to {{ $labels.reason }}.",
+				"description": "HCO and its secondary resources are in a warning state due to system warning.",
 				"summary":     "HCO and its secondary resources are in a warning state.",
 			},
 			Labels: map[string]string{