diff --git a/controllers/argocd_metrics_controller.go b/controllers/argocd_metrics_controller.go index 2bb4bc21e..3e93ffb84 100644 --- a/controllers/argocd_metrics_controller.go +++ b/controllers/argocd_metrics_controller.go @@ -582,13 +582,6 @@ func newServiceMonitor(namespace, name, matchLabel string) *monitoringv1.Service } func newPrometheusRule(namespace string) *monitoringv1.PrometheusRule { - // The namespace used in the alert rule is not the namespace of the - // running application, it is the namespace that the corresponding - // ArgoCD application metadata was created in. This is needed to - // scope this alert rule to only fire for applications managed - // by the ArgoCD instance installed in this namespace. - expr := fmt.Sprintf("argocd_app_info{namespace=\"%s\",sync_status=\"OutOfSync\"} > 0", namespace) - objectMeta := metav1.ObjectMeta{ Name: alertRuleName, Namespace: namespace, @@ -602,17 +595,88 @@ func newPrometheusRule(namespace string) *monitoringv1.PrometheusRule { Alert: "ArgoCDSyncAlert", Annotations: map[string]string{ "summary": "Argo CD application is out of sync", - "description": "Argo CD application {{ $labels.name }} is out of sync. Check ArgoCDSyncAlert status, this alert is designed to notify that an application managed by Argo CD is out of sync.", + "description": "Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is out of sync. Check ArgoCDSyncAlert status, this alert is designed to notify that an application managed by Argo CD is out of sync.", + }, + Expr: intstr.IntOrString{ + Type: intstr.String, + // The namespace used in the alert rule is not the namespace of the + // running application, it is the namespace that the corresponding + // ArgoCD application metadata was created in. This is needed to + // scope this alert rule to only fire for applications managed + // by the ArgoCD instance installed in this namespace. 
+ StrVal: fmt.Sprintf("argocd_app_info{namespace=\"%s\",sync_status=\"OutOfSync\"} > 0", namespace), + }, + For: "5m", + Labels: map[string]string{ + "severity": "warning", + }, + }, + { + Alert: "ArgoCDUnknownSyncAlert", + Annotations: map[string]string{ + "summary": "Argo CD application sync state is unknown", + "description": "Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is in an unknown sync state. Check ArgoCDUnknownSyncAlert status, this often occurs when the Application is misconfigured.", }, Expr: intstr.IntOrString{ Type: intstr.String, - StrVal: expr, + StrVal: fmt.Sprintf("argocd_app_info{namespace=\"%s\",sync_status=\"Unknown\"} > 0", namespace), + }, + For: "5m", + Labels: map[string]string{ + "severity": "critical", + }, + }, + { + Alert: "ArgoCDHealthAlert", + Annotations: map[string]string{ + "summary": "Argo CD application is not healthy", + "description": "Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is not healthy. Check ArgoCDHealthAlert status, this alert is designed to notify that an application managed by Argo CD is not in a healthy, suspended, progressing or degraded state.", + }, + Expr: intstr.IntOrString{ + Type: intstr.String, + // General warning of not healthy, this ignores the status of Healthy and + // Suspended which are expected statuses. Degraded and Progressing are + // handled by other rules below + StrVal: fmt.Sprintf("argocd_app_info{namespace=\"%s\", health_status!~\"Healthy|Suspended|Progressing|Degraded\"} > 0", namespace), }, For: "5m", Labels: map[string]string{ "severity": "warning", }, }, + { + Alert: "ArgoCDDegradedAlert", + Annotations: map[string]string{ + "summary": "Argo CD application is degraded", + "description": "Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is degraded. 
Check ArgoCDDegradedAlert status, this alert is designed to notify that an application managed by Argo CD is degraded.", + }, + Expr: intstr.IntOrString{ + Type: intstr.String, + // Specific warning of degraded state + StrVal: fmt.Sprintf("argocd_app_info{namespace=\"%s\", health_status=\"Degraded\"} > 0", namespace), + }, + For: "5m", + Labels: map[string]string{ + "severity": "critical", + }, + }, + { + Alert: "ArgoCDProgressingAlert", + Annotations: map[string]string{ + "summary": "Argo CD application has been progressing for more than 10 minutes", + "description": "Argo CD application {{ $labels.namespace }}/{{ $labels.name }} has been progressing for more than 10 minutes. Check ArgoCDProgressingAlert status, this alert is designed to notify when an application is taking a long time to exit the Progressing state.", + }, + Expr: intstr.IntOrString{ + Type: intstr.String, + // This rule is used to notify when an application is stuck in the progressing + // state for more than 10m. + StrVal: fmt.Sprintf("argocd_app_info{namespace=\"%s\", health_status=\"Progressing\"} > 0", namespace), + }, + For: "10m", + Labels: map[string]string{ + "severity": "warning", + }, + }, }, }, }, diff --git a/controllers/argocd_metrics_controller_test.go b/controllers/argocd_metrics_controller_test.go index 54f8f6088..a511d81ed 100644 --- a/controllers/argocd_metrics_controller_test.go +++ b/controllers/argocd_metrics_controller_test.go @@ -313,6 +313,38 @@ func TestReconciler_add_prometheus_rule(t *testing.T) { namespace: "namespace-two", }, } + testMonitoringRules := []struct { + name string + duration string + expr string + }{ + { + name: "ArgoCDSyncAlert", + duration: "5m", + expr: "argocd_app_info{namespace=\"%s\",sync_status=\"OutOfSync\"} > 0", + }, + { + name: "ArgoCDUnknownSyncAlert", + duration: "5m", + expr: "argocd_app_info{namespace=\"%s\",sync_status=\"Unknown\"} > 0", + }, + { + name: "ArgoCDHealthAlert", + duration: "5m", + expr: "argocd_app_info{namespace=\"%s\", 
health_status!~\"Healthy|Suspended|Progressing|Degraded\"} > 0", + }, + { + name: "ArgoCDDegradedAlert", + duration: "5m", + expr: "argocd_app_info{namespace=\"%s\", health_status=\"Degraded\"} > 0", + }, + { + name: "ArgoCDProgressingAlert", + duration: "10m", + expr: "argocd_app_info{namespace=\"%s\", health_status=\"Progressing\"} > 0", + }, + } + flagPtr := false for _, tc := range testCases { r := newMetricsReconciler(t, tc.namespace, tc.instanceName, &flagPtr) @@ -327,13 +359,15 @@ func TestReconciler_add_prometheus_rule(t *testing.T) { assert.Equal(t, rule.OwnerReferences[0].Kind, argocdKind) assert.Equal(t, rule.OwnerReferences[0].Name, tc.instanceName) - assert.Equal(t, rule.Spec.Groups[0].Rules[0].Alert, "ArgoCDSyncAlert") - assert.Assert(t, rule.Spec.Groups[0].Rules[0].Annotations["summary"] != "") - assert.Assert(t, rule.Spec.Groups[0].Rules[0].Annotations["description"] != "") - assert.Assert(t, rule.Spec.Groups[0].Rules[0].Labels["severity"] != "") - assert.Equal(t, rule.Spec.Groups[0].Rules[0].For, "5m") - expr := fmt.Sprintf("argocd_app_info{namespace=\"%s\",sync_status=\"OutOfSync\"} > 0", tc.namespace) - assert.Equal(t, rule.Spec.Groups[0].Rules[0].Expr.StrVal, expr) + for index, testMonitoringRule := range testMonitoringRules { + assert.Equal(t, rule.Spec.Groups[0].Rules[index].Alert, testMonitoringRule.name) + assert.Assert(t, rule.Spec.Groups[0].Rules[index].Annotations["summary"] != "") + assert.Assert(t, rule.Spec.Groups[0].Rules[index].Annotations["description"] != "") + assert.Assert(t, rule.Spec.Groups[0].Rules[index].Labels["severity"] != "") + assert.Equal(t, rule.Spec.Groups[0].Rules[index].For, testMonitoringRule.duration) + expr := fmt.Sprintf(testMonitoringRule.expr, tc.namespace) + assert.Equal(t, rule.Spec.Groups[0].Rules[index].Expr.StrVal, expr) + } } } diff --git a/test/openshift/e2e/ignore-tests/sequential/1-005_validate_metrics/01-assert.yaml b/test/openshift/e2e/ignore-tests/sequential/1-005_validate_metrics/01-assert.yaml 
index 1b0dc4c31..d03ae9e06 100644 --- a/test/openshift/e2e/ignore-tests/sequential/1-005_validate_metrics/01-assert.yaml +++ b/test/openshift/e2e/ignore-tests/sequential/1-005_validate_metrics/01-assert.yaml @@ -47,7 +47,35 @@ spec: - alert: ArgoCDSyncAlert annotations: summary: Argo CD application is out of sync - description: Argo CD application {{ $labels.name }} is out of sync. Check ArgoCDSyncAlert status, this alert is designed to notify that an application managed by Argo CD is out of sync. + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is out of sync. Check ArgoCDSyncAlert status, this alert is designed to notify that an application managed by Argo CD is out of sync. expr: argocd_app_info{namespace="openshift-gitops",sync_status="OutOfSync"} > 0 labels: severity: warning + - alert: ArgoCDUnknownSyncAlert + annotations: + summary: Argo CD application sync state is unknown + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is in an unknown sync state. Check ArgoCDUnknownSyncAlert status, this often occurs when the Application is misconfigured. + expr: argocd_app_info{namespace="openshift-gitops",sync_status="Unknown"} > 0 + labels: + severity: critical + - alert: ArgoCDHealthAlert + annotations: + summary: Argo CD application is not healthy + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is not healthy. Check ArgoCDHealthAlert status, this alert is designed to notify that an application managed by Argo CD is not in a healthy, suspended, progressing or degraded state. + expr: argocd_app_info{namespace="openshift-gitops", health_status!~"Healthy|Suspended|Progressing|Degraded"} > 0 + labels: + severity: warning + - alert: ArgoCDDegradedAlert + annotations: + summary: Argo CD application is degraded + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is degraded. 
Check ArgoCDDegradedAlert status, this alert is designed to notify that an application managed by Argo CD is degraded. + expr: argocd_app_info{namespace="openshift-gitops", health_status="Degraded"} > 0 + labels: + severity: critical + - alert: ArgoCDProgressingAlert + annotations: + summary: Argo CD application has been progressing for more than 10 minutes + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} has been progressing for more than 10 minutes. Check ArgoCDProgressingAlert status, this alert is designed to notify when an application is taking a long time to exit the Progressing state. + expr: argocd_app_info{namespace="openshift-gitops", health_status="Progressing"} > 0 + labels: + severity: warning diff --git a/test/openshift/e2e/ignore-tests/sequential/1-041_validate_argocd_sync_alert/01-assert.yaml b/test/openshift/e2e/ignore-tests/sequential/1-041_validate_argocd_sync_alert/01-assert.yaml index 2b89033e1..65486ac60 100644 --- a/test/openshift/e2e/ignore-tests/sequential/1-041_validate_argocd_sync_alert/01-assert.yaml +++ b/test/openshift/e2e/ignore-tests/sequential/1-041_validate_argocd_sync_alert/01-assert.yaml @@ -18,8 +18,40 @@ spec: - alert: ArgoCDSyncAlert annotations: summary: Argo CD application is out of sync - description: Argo CD application {{ $labels.name }} is out of sync. Check ArgoCDSyncAlert status, this alert is designed to notify that an application managed by Argo CD is out of sync. + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is out of sync. Check ArgoCDSyncAlert status, this alert is designed to notify that an application managed by Argo CD is out of sync. expr: argocd_app_info{namespace="openshift-gitops",sync_status="OutOfSync"} > 0 for: 5m labels: severity: warning + - alert: ArgoCDUnknownSyncAlert + annotations: + summary: Argo CD application sync state is unknown + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is in an unknown sync state. 
Check ArgoCDUnknownSyncAlert status, this often occurs when the Application is misconfigured. + expr: argocd_app_info{namespace="openshift-gitops",sync_status="Unknown"} > 0 + for: 5m + labels: + severity: critical + - alert: ArgoCDHealthAlert + annotations: + summary: Argo CD application is not healthy + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is not healthy. Check ArgoCDHealthAlert status, this alert is designed to notify that an application managed by Argo CD is not in a healthy, suspended, progressing or degraded state. + expr: argocd_app_info{namespace="openshift-gitops", health_status!~"Healthy|Suspended|Progressing|Degraded"} > 0 + for: 5m + labels: + severity: warning + - alert: ArgoCDDegradedAlert + annotations: + summary: Argo CD application is degraded + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is degraded. Check ArgoCDDegradedAlert status, this alert is designed to notify that an application managed by Argo CD is degraded. + expr: argocd_app_info{namespace="openshift-gitops", health_status="Degraded"} > 0 + for: 5m + labels: + severity: critical + - alert: ArgoCDProgressingAlert + annotations: + summary: Argo CD application has been progressing for more than 10 minutes + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} has been progressing for more than 10 minutes. Check ArgoCDProgressingAlert status, this alert is designed to notify when an application is taking a long time to exit the Progressing state. 
+ expr: argocd_app_info{namespace="openshift-gitops", health_status="Progressing"} > 0 + for: 10m + labels: + severity: warning diff --git a/test/openshift/e2e/sequential/1-041_validate_argocd_sync_alert/01-assert.yaml b/test/openshift/e2e/sequential/1-041_validate_argocd_sync_alert/01-assert.yaml index 0257afc12..65486ac60 100644 --- a/test/openshift/e2e/sequential/1-041_validate_argocd_sync_alert/01-assert.yaml +++ b/test/openshift/e2e/sequential/1-041_validate_argocd_sync_alert/01-assert.yaml @@ -18,8 +18,40 @@ spec: - alert: ArgoCDSyncAlert annotations: summary: Argo CD application is out of sync - description: Argo CD application {{ $labels.name }} is out of sync. Check ArgoCDSyncAlert status, this alert is designed to notify that an application managed by Argo CD is out of sync. + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is out of sync. Check ArgoCDSyncAlert status, this alert is designed to notify that an application managed by Argo CD is out of sync. expr: argocd_app_info{namespace="openshift-gitops",sync_status="OutOfSync"} > 0 for: 5m labels: - severity: warning \ No newline at end of file + severity: warning + - alert: ArgoCDUnknownSyncAlert + annotations: + summary: Argo CD application sync state is unknown + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is in an unknown sync state. Check ArgoCDUnknownSyncAlert status, this often occurs when the Application is misconfigured. + expr: argocd_app_info{namespace="openshift-gitops",sync_status="Unknown"} > 0 + for: 5m + labels: + severity: critical + - alert: ArgoCDHealthAlert + annotations: + summary: Argo CD application is not healthy + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is not healthy. Check ArgoCDHealthAlert status, this alert is designed to notify that an application managed by Argo CD is not in a healthy, suspended, progressing or degraded state. 
+ expr: argocd_app_info{namespace="openshift-gitops", health_status!~"Healthy|Suspended|Progressing|Degraded"} > 0 + for: 5m + labels: + severity: warning + - alert: ArgoCDDegradedAlert + annotations: + summary: Argo CD application is degraded + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} is degraded. Check ArgoCDDegradedAlert status, this alert is designed to notify that an application managed by Argo CD is degraded. + expr: argocd_app_info{namespace="openshift-gitops", health_status="Degraded"} > 0 + for: 5m + labels: + severity: critical + - alert: ArgoCDProgressingAlert + annotations: + summary: Argo CD application has been progressing for more than 10 minutes + description: Argo CD application {{ $labels.namespace }}/{{ $labels.name }} has been progressing for more than 10 minutes. Check ArgoCDProgressingAlert status, this alert is designed to notify when an application is taking a long time to exit the Progressing state. + expr: argocd_app_info{namespace="openshift-gitops", health_status="Progressing"} > 0 + for: 10m + labels: + severity: warning