From 875c0d8b349010ab709c454c99d464edff5c925b Mon Sep 17 00:00:00 2001 From: David Hurta Date: Tue, 28 May 2024 18:43:00 +0200 Subject: [PATCH] pkg/cvo/status: Filter out immediate UpdateEffectNone errors nested in MultipleErrors in Failing condition Various errors get propagated to users, such as the summarized task graph error. For example, in the form of the message in the Failing condition. However, update errors set with the update effect of UpdateEffectNone can confuse users, as these primarily informing messages get displayed together with valid update errors that heavily impact the update. This can result in a message such as: { "lastTransitionTime": "2023-06-20T13:40:12Z", "message": "Multiple errors are preventing progress:\n* Cluster operator authentication is updating versions\n* Could not update customresourcedefinition \"alertingrules.monitoring.openshift.io\" (512 of 993): the object is invalid, possibly due to local cluster configuration", "reason": "MultipleErrors", "status": "True", "type": "Failing" } The Failing condition is not true because of the UpdateEffectNone error ("Cluster operator authentication is updating versions"), but its message still gets displayed. This commit makes sure that update errors that do not heavily affect the update will be removed from the MultipleErrors error in the Failing condition message to an extent. The filtered out errors from the message will still be displayed in the logs and in other places, such as the ReconciliationIssues condition. The original code handles correctly situations where the status failure is an UpdateEffectNone error. The new changes leave such errors be. In case the MultipleErrors error contains only UpdateEffectNone errors, the error is unchanged to keep the original logic unchanged and keep the commit simple. The goal of this commit is to remove unimportant messages from MultipleErrors errors that contain valid messages in the Failing condition. The current code contains an override to set the Failing condition when history is empty or the CVO is reconciling. This commit will keep this logic functional. This means the filtering is only applied when history is not empty and the CVO is not reconciling the payload. --- pkg/cvo/status.go | 92 ++++++++++++++++---- pkg/cvo/status_test.go | 192 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+), 17 deletions(-) diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 3eb0ee478..9736dfd24 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -293,8 +293,37 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status }) } - progressReason, progressMessage, skipFailure := convertErrorToProgressing(cvStatus.History, now.Time, status) + failure := status.Failure + failingReason, failingMessage := getReasonMessageFromError(failure) + var skipFailure bool + var progressReason, progressMessage string + if !status.Reconciling && len(cvStatus.History) != 0 { + progressReason, progressMessage, skipFailure = convertErrorToProgressing(now.Time, failure) + failure = filterErrorForFailingCondition(failure, payload.UpdateEffectNone) + filteredFailingReason, filteredFailingMessage := getReasonMessageFromError(failure) + if failingReason != filteredFailingReason { + klog.Infof("Filtered failure reason changed from '%s' to '%s'", failingReason, filteredFailingReason) + } + if failingMessage != filteredFailingMessage { + klog.Infof("Filtered failure message changed from '%s' to '%s'", failingMessage, filteredFailingMessage) + } + failingReason, failingMessage = filteredFailingReason, filteredFailingMessage + } + + // set the failing condition + failingCondition := configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionFalse, + LastTransitionTime: now, + } + if failure != nil && !skipFailure { + failingCondition.Status = configv1.ConditionTrue + failingCondition.Reason = failingReason + failingCondition.Message = failingMessage + } + resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition) + // update progressing if err := status.Failure; err != nil && !skipFailure { var reason string msg := progressMessage @@ -307,16 +336,6 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status msg = "an error occurred" } - // set the failing condition - resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, configv1.ClusterOperatorStatusCondition{ - Type: ClusterStatusFailing, - Status: configv1.ConditionTrue, - Reason: reason, - Message: err.Error(), - LastTransitionTime: now, - }) - - // update progressing if status.Reconciling { resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, configv1.ClusterOperatorStatusCondition{ Type: configv1.OperatorProgressing, @@ -336,9 +355,6 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status } } else { - // clear the failure condition - resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, configv1.ClusterOperatorStatusCondition{Type: ClusterStatusFailing, Status: configv1.ConditionFalse, LastTransitionTime: now}) - // update progressing if status.Reconciling { message := fmt.Sprintf("Cluster version is %s", version) @@ -413,6 +429,48 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status } } +// getReasonMessageFromError returns the reason and the message from an error. +// If the reason or message is not available, an empty string is returned. +func getReasonMessageFromError(err error) (reason, message string) { + if uErr, ok := err.(*payload.UpdateError); ok { + reason = uErr.Reason + } + if err != nil { + message = err.Error() + } + return reason, message +} + +// filterErrorForFailingCondition filters out update errors based on the given +// updateEffect from a MultipleError error. If the err has the reason +// MultipleErrors, its immediate nested errors are filtered out and the error +// is recreated. If all nested errors are filtered out, nil is returned. +func filterErrorForFailingCondition(err error, updateEffect payload.UpdateEffectType) error { + if err == nil { + return nil + } + if uErr, ok := err.(*payload.UpdateError); ok && uErr.Reason == "MultipleErrors" { + if nested, ok := uErr.Nested.(interface{ Errors() []error }); ok { + filtered := nested.Errors() + filtered = filterOutUpdateErrors(filtered, updateEffect) + return newMultipleError(filtered) + } + } + return err +} + +// filterOutUpdateErrors filters out update errors of the given effect. +func filterOutUpdateErrors(errs []error, updateEffect payload.UpdateEffectType) []error { + filtered := make([]error, 0, len(errs)) + for _, err := range errs { + if uErr, ok := err.(*payload.UpdateError); ok && uErr.UpdateEffect == updateEffect { + continue + } + filtered = append(filtered, err) + } + return filtered +} + func setImplicitlyEnabledCapabilitiesCondition(cvStatus *configv1.ClusterVersionStatus, implicitlyEnabled []configv1.ClusterVersionCapability, now metav1.Time) { @@ -479,11 +537,11 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus, // how an update error is interpreted. An error may simply need to be reported but does not indicate the update is // failing. An error may indicate the update is failing or that if the error continues for a defined interval the // update is failing. -func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, status *SyncWorkerStatus) (reason string, message string, ok bool) { - if len(history) == 0 || status.Failure == nil || status.Reconciling { +func convertErrorToProgressing(now time.Time, statusFailure error) (reason string, message string, ok bool) { + if statusFailure == nil { return "", "", false } - uErr, ok := status.Failure.(*payload.UpdateError) + uErr, ok := statusFailure.(*payload.UpdateError) if !ok { return "", "", false } diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index d92566a80..1403c71f9 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -497,6 +497,13 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t Reason: "MultipleErrors", Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator B is updating versions", }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorNotAvailable", + Message: "Cluster operator A is not available", + }, }, { name: "MultipleErrors of UpdateEffectNone and UpdateEffectNone", @@ -527,6 +534,11 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t Reason: "MultipleErrors", Message: "Multiple errors are preventing progress:\n* Cluster operator A is updating versions\n* Cluster operator B is getting conscious", }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionFalse, + }, }, { name: "MultipleErrors of UpdateEffectFail, UpdateEffectFailAfterInterval, and UpdateEffectNone", @@ -562,6 +574,13 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t Reason: "MultipleErrors", Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator B is updating versions\n* Cluster operator C is degraded", }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "MultipleErrors", + Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator C is degraded", + }, }, } for _, tc := range tests { @@ -601,3 +620,176 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t }) } } + +func Test_filterOutUpdateErrors(t *testing.T) { + type args struct { + errs []error + updateEffectType payload.UpdateEffectType + } + tests := []struct { + name string + args args + want []error + }{ + { + name: "empty errors", + args: args{ + errs: []error{}, + updateEffectType: payload.UpdateEffectNone, + }, + want: []error{}, + }, + { + name: "single update error of the specified value", + args: args{ + errs: []error{ + &payload.UpdateError{ + Name: "None", + UpdateEffect: payload.UpdateEffectNone, + }, + }, + updateEffectType: payload.UpdateEffectNone, + }, + want: []error{}, + }, + { + name: "errors do not contain update errors of the specified value", + args: args{ + errs: []error{ + &payload.UpdateError{ + Name: "Fail", + UpdateEffect: payload.UpdateEffectFail, + }, + &payload.UpdateError{ + Name: "Report", + UpdateEffect: payload.UpdateEffectReport, + }, + &payload.UpdateError{ + Name: "Fail After Interval", + UpdateEffect: payload.UpdateEffectFailAfterInterval, + }, + }, + updateEffectType: payload.UpdateEffectNone, + }, + want: []error{ + &payload.UpdateError{ + Name: "Fail", + UpdateEffect: payload.UpdateEffectFail, + }, + &payload.UpdateError{ + Name: "Report", + UpdateEffect: payload.UpdateEffectReport, + }, + &payload.UpdateError{ + Name: "Fail After Interval", + UpdateEffect: payload.UpdateEffectFailAfterInterval, + }, + }, + }, + { + name: "errors contain update errors of the specified value UpdateEffectNone", + args: args{ + errs: []error{ + &payload.UpdateError{ + Name: "Fail After Interval", + UpdateEffect: payload.UpdateEffectFailAfterInterval, + }, + &payload.UpdateError{ + Name: "None #1", + UpdateEffect: payload.UpdateEffectNone, + }, + &payload.UpdateError{ + Name: "Report", + UpdateEffect: payload.UpdateEffectReport, + }, + &payload.UpdateError{ + Name: "None #2", + UpdateEffect: payload.UpdateEffectNone, + }, + }, + updateEffectType: payload.UpdateEffectNone, + }, + want: []error{ + &payload.UpdateError{ + Name: "Fail After Interval", + UpdateEffect: payload.UpdateEffectFailAfterInterval, + }, + &payload.UpdateError{ + Name: "Report", + UpdateEffect: payload.UpdateEffectReport, + }, + }, + }, + { + name: "errors contain update errors of the specified value UpdateEffectReport", + args: args{ + errs: []error{ + &payload.UpdateError{ + Name: "Fail After Interval", + UpdateEffect: payload.UpdateEffectFailAfterInterval, + }, + &payload.UpdateError{ + Name: "None #1", + UpdateEffect: payload.UpdateEffectNone, + }, + &payload.UpdateError{ + Name: "Report", + UpdateEffect: payload.UpdateEffectReport, + }, + &payload.UpdateError{ + Name: "None #2", + UpdateEffect: payload.UpdateEffectNone, + }, + }, + updateEffectType: payload.UpdateEffectReport, + }, + want: []error{ + &payload.UpdateError{ + Name: "Fail After Interval", + UpdateEffect: payload.UpdateEffectFailAfterInterval, + }, + &payload.UpdateError{ + Name: "None #1", + UpdateEffect: payload.UpdateEffectNone, + }, + &payload.UpdateError{ + Name: "None #2", + UpdateEffect: payload.UpdateEffectNone, + }, + }, + }, + { + name: "errors contain only update errors of the specified value UpdateEffectNone", + args: args{ + errs: []error{ + &payload.UpdateError{ + Name: "None #1", + UpdateEffect: payload.UpdateEffectNone, + }, + &payload.UpdateError{ + Name: "None #2", + UpdateEffect: payload.UpdateEffectNone, + }, + &payload.UpdateError{ + Name: "None #3", + UpdateEffect: payload.UpdateEffectNone, + }, + &payload.UpdateError{ + Name: "None #4", + UpdateEffect: payload.UpdateEffectNone, + }, + }, + updateEffectType: payload.UpdateEffectNone, + }, + want: []error{}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + filtered := filterOutUpdateErrors(tt.args.errs, tt.args.updateEffectType) + if difference := cmp.Diff(filtered, tt.want); difference != "" { + t.Errorf("got errors differ from expected:\n%s", difference) + } + }) + } +}