From e3dcd78091b113da6c42a57053cde7af4760d822 Mon Sep 17 00:00:00 2001
From: Ryan Zhang
Date: Tue, 22 Oct 2024 22:02:06 -0700
Subject: [PATCH] temp

---
 apis/placement/v1alpha1/common.go             |  19 +
 apis/placement/v1alpha1/stagedupdate_types.go | 137 ++--
 .../v1alpha1/zz_generated.deepcopy.go         | 357 +++++----
 apis/placement/v1beta1/commons.go             |  53 +-
 ...tes-fleet.io_clusterapprovalrequests.yaml} |  22 +-
 ...tes-fleet.io_clusterstagedupdateruns.yaml} |  67 +-
 ...eet.io_clusterstagedupdatestrategies.yaml} |  20 +-
 .../resource_selector.go                      |   2 +-
 pkg/controllers/rollout/controller.go         |   6 +-
 pkg/controllers/rollout/controller_test.go    |  12 +
 pkg/controllers/updaterun/controller.go       | 197 +++++
 pkg/controllers/updaterun/executing.go        | 479 ++++++++++++
 pkg/controllers/updaterun/initialization.go   | 373 +++++++++
 .../initialization_integration_test.go        | 733 ++++++++++++++++++
 pkg/controllers/updaterun/suite_test.go       | 188 +++++
 pkg/controllers/updaterun/validating.go       | 246 ++++++
 pkg/resourcewatcher/change_dector.go          |   2 +-
 pkg/utils/common.go                           |  59 +-
 pkg/utils/condition/condition.go              |  48 ++
 pkg/utils/controller/controller.go            |  54 +-
 pkg/utils/controller/controller_test.go       |   9 -
 .../controller}/override_test.go              |  35 +-
 .../controller/overrider.go}                  |  61 +-
 23 files changed, 2717 insertions(+), 462 deletions(-)
 rename config/crd/bases/{placement.kubernetes-fleet.io_approvalrequests.yaml => placement.kubernetes-fleet.io_clusterapprovalrequests.yaml} (91%)
 rename config/crd/bases/{placement.kubernetes-fleet.io_stagedupdateruns.yaml => placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml} (97%)
 rename config/crd/bases/{placement.kubernetes-fleet.io_stagedupdatestrategies.yaml => placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml} (92%)
 create mode 100644 pkg/controllers/updaterun/controller.go
 create mode 100644 pkg/controllers/updaterun/executing.go
 create mode 100644 pkg/controllers/updaterun/initialization.go
 create mode 100644 pkg/controllers/updaterun/initialization_integration_test.go
 create mode 100644 pkg/controllers/updaterun/suite_test.go
 create mode 100644 pkg/controllers/updaterun/validating.go
 rename pkg/{controllers/rollout => utils/controller}/override_test.go (97%)
 rename pkg/{controllers/rollout/override.go => utils/controller/overrider.go} (69%)

diff --git a/apis/placement/v1alpha1/common.go b/apis/placement/v1alpha1/common.go
index 2431e0a8d..c2c84bc30 100644
--- a/apis/placement/v1alpha1/common.go
+++ b/apis/placement/v1alpha1/common.go
@@ -14,4 +14,23 @@ const (
 
 	// ResourceOverrideSnapshotKind is the kind of the ResourceOverrideSnapshotKind.
 	ResourceOverrideSnapshotKind = "ResourceOverrideSnapshot"
+
+	// StagedUpdateRunFinalizer is used by the staged update run controller to make sure that the stagedUpdateRun
+	// object is not deleted until all its dependent resources are deleted.
+	StagedUpdateRunFinalizer = fleetPrefix + "stagedupdaterun-finalizer"
+
+	// TargetUpdateRunLabel is the label that indicates the target update run on a staged run related object.
+	TargetUpdateRunLabel = fleetPrefix + "targetupdaterun"
+
+	// UpdateRunDeleteStageName is the name of the delete stage in the staged update run.
+	UpdateRunDeleteStageName = fleetPrefix + "deleteStage"
+
+	// IsLatestUpdateRunApprovalLabel is the label that indicates if the approval is the latest approval on a staged run.
+	IsLatestUpdateRunApprovalLabel = fleetPrefix + "isLatestUpdateRunApproval"
+
+	// TargetUpdatingStageNameLabel is the label that indicates the updating stage name on a staged run related object.
+	TargetUpdatingStageNameLabel = fleetPrefix + "targetUpdatingStage"
+
+	// ApprovalTaskNameFmt is the format of the approval task name.
+	ApprovalTaskNameFmt = "%s-%s"
 )
diff --git a/apis/placement/v1alpha1/stagedupdate_types.go b/apis/placement/v1alpha1/stagedupdate_types.go
index 6c0b572e0..867ba3840 100644
--- a/apis/placement/v1alpha1/stagedupdate_types.go
+++ b/apis/placement/v1alpha1/stagedupdate_types.go
@@ -12,27 +12,28 @@ import (
 )
 
 // +genclient
-// +genclient:namespaced
+// +genclient:Cluster
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
-// +kubebuilder:resource:scope="Namespaced",categories={fleet,fleet-placement}
+// +kubebuilder:resource:scope=Cluster,categories={fleet,fleet-placement},shortName=crsur
 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
 
-// StagedUpdateRun represents a stage by stage update process that applies selected resources to specified clusters.
+// ClusterStagedUpdateRun represents a stage by stage update process that applies ClusterResourcePlacement
+// selected resources to specified clusters.
 // Resources from unselected clusters are removed after all stages in the update strategy are completed.
-// Each StagedUpdateRun object corresponds to a single release of a specific resource version.
-// The release is abandoned if the StagedUpdateRun object is deleted or the scheduling decision changes.
-// The name of the StagedUpdateRun must conform to RFC 1123.
-type StagedUpdateRun struct {
+// Each ClusterStagedUpdateRun object corresponds to a single release of a specific resource version.
+// The release is abandoned if the ClusterStagedUpdateRun object is deleted or the scheduling decision changes.
+// The name of the ClusterStagedUpdateRun must conform to RFC 1123.
+type ClusterStagedUpdateRun struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
 
-	// The desired state of StagedUpdateRun. The spec is immutable.
+	// The desired state of ClusterStagedUpdateRun. The spec is immutable.
 	// +kubebuilder:validation:Required
 	// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="The spec field is immutable"
 	Spec StagedUpdateRunSpec `json:"spec"`
 
-	// The observed status of StagedUpdateRun.
+	// The observed status of ClusterStagedUpdateRun.
 	// +kubebuilder:validation:Optional
 	Status StagedUpdateRunStatus `json:"status,omitempty"`
 }
 
@@ -40,46 +41,40 @@ type StagedUpdateRun struct {
 // StagedUpdateRunSpec defines the desired rollout strategy and the snapshot indices of the resources to be updated.
 // It specifies a stage-by-stage update process across selected clusters for the given ResourcePlacement object.
 type StagedUpdateRunSpec struct {
-	// A reference to the placement that this update run is applied to.
+	// PlacementName is the name of placement that this update run is applied to.
 	// There can be multiple active update runs for each placement, but
 	// it's up to the DevOps team to ensure they don't conflict with each other.
 	// +kubebuilder:validation:Required
-	PlacementRef PlacementReference `json:"placementRef"`
+	// +kubebuilder:validation:MaxLength=255
+	PlacementName string `json:"placementName"`
 
 	// The resource snapshot index of the selected resources to be updated across clusters.
 	// The index represents a group of resource snapshots that includes all the resources a ResourcePlacement selected.
// +kubebuilder:validation:Required ResourceSnapshotIndex string `json:"resourceSnapshotIndex"` - // The reference to the update strategy that specifies the stages and the sequence + // The name of the update strategy that specifies the stages and the sequence // in which the selected resources will be updated on the member clusters. The stages // are computed according to the referenced strategy when the update run starts // and recorded in the status field. // +kubebuilder:validation:Required - StagedUpdateStrategyRef v1beta1.NamespacedName `json:"stagedRolloutStrategyRef"` -} - -// PlacementReference is a reference to a placement object. -type PlacementReference struct { - // Name is the name of the referenced placement. - // +kubebuilder:validation:Required - Name string `json:"name"` + StagedUpdateStrategyName string `json:"stagedRolloutStrategyName"` } // +genclient -// +genclient:namespaced +// +genclient:cluster // +kubebuilder:object:root=true // +kubebuilder:subresource:status -// +kubebuilder:resource:scope="Namespaced",categories={fleet,fleet-placement} +// +kubebuilder:resource:scope=Cluster,categories={fleet,fleet-placement},shortName=sus // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -// StagedUpdateStrategy defines a reusable strategy that specifies the stages and the sequence -// in which the selected resources will be updated on the member clusters. -type StagedUpdateStrategy struct { +// ClusterStagedUpdateStrategy defines a reusable strategy that specifies the stages and the sequence +// in which the selected cluster resources will be updated on the member clusters. +type ClusterStagedUpdateStrategy struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - // The desired state of StagedUpdateStrategy. + // The desired state of ClusterStagedUpdateStrategy. // +kubebuilder:validation:Required Spec StagedUpdateStrategySpec `json:"spec"` } @@ -92,13 +87,13 @@ type StagedUpdateStrategySpec struct { Stages []StageConfig `json:"stages"` } -// StagedUpdateStrategyList contains a list of StagedUpdateStrategy. -// +kubebuilder:resource:scope="Namespaced" +// ClusterStagedUpdateStrategyList contains a list of StagedUpdateStrategy. +// +kubebuilder:resource:scope=Cluster // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -type StagedUpdateStrategyList struct { +type ClusterStagedUpdateStrategyList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata,omitempty"` - Items []StagedUpdateStrategy `json:"items"` + Items []ClusterStagedUpdateStrategy `json:"items"` } // StageConfig describes a single update stage. @@ -146,7 +141,7 @@ type AfterStageTask struct { WaitTime metav1.Duration `json:"waitTime,omitempty"` } -// StagedUpdateRunStatus defines the observed state of the StagedUpdateRun. +// StagedUpdateRunStatus defines the observed state of the ClusterStagedUpdateRun. type StagedUpdateRunStatus struct { // PolicySnapShotIndexUsed records the policy snapshot index of the ClusterResourcePlacement (CRP) that // the update run is based on. The index represents the latest policy snapshot at the start of the update run. @@ -157,11 +152,17 @@ type StagedUpdateRunStatus struct { // +kubebuilder:validation:Optional PolicySnapshotIndexUsed string `json:"policySnapshotIndexUsed,omitempty"` + // PolicyObservedClusterCount records the number of observed clusters in the policy snapshot. + // It is recorded at the beginning of the update run from the policy snapshot object. 
+ // If the `ObservedClusterCount` value is updated during the update run, the update run is abandoned. + // +kubebuilder:validation:Optional + PolicyObservedClusterCount int `json:"policyObservedClusterCount,omitempty"` + // ApplyStrategy is the apply strategy that the stagedUpdateRun is using. // It is the same as the apply strategy in the CRP when the staged update run starts. // The apply strategy is not updated during the update run even if it changes in the CRP. // +kubebuilder:validation:Optional - ApplyStrategy v1beta1.ApplyStrategy `json:"appliedStrategy,omitempty"` + ApplyStrategy *v1beta1.ApplyStrategy `json:"appliedStrategy,omitempty"` // StagedUpdateStrategySnapshot is the snapshot of the StagedUpdateStrategy used for the update run. // The snapshot is immutable during the update run. @@ -169,7 +170,7 @@ type StagedUpdateRunStatus struct { // The update run fails to initialize if the strategy fails to produce a valid list of stages where each selected // cluster is included in exactly one stage. // +kubebuilder:validation:Optional - StagedUpdateStrategySnapshot StagedUpdateStrategySpec `json:"stagedUpdateStrategySnapshot,omitempty"` + StagedUpdateStrategySnapshot *StagedUpdateStrategySpec `json:"stagedUpdateStrategySnapshot,omitempty"` // StagesStatus lists the current updating status of each stage. // The list is empty if the update run is not started or failed to initialize. @@ -180,7 +181,7 @@ type StagedUpdateRunStatus struct { // removes all the resources from the clusters that are not selected by the // current policy after all the update stages are completed. // +kubebuilder:validation:Optional - DeletionStageStatus StageUpdatingStatus `json:"deletionStageStatus,omitempty"` + DeletionStageStatus *StageUpdatingStatus `json:"deletionStageStatus,omitempty"` // +patchMergeKey=type // +patchStrategy=merge @@ -201,8 +202,9 @@ const ( // StagedUpdateRunConditionInitialized indicates whether the staged update run is initialized, meaning it // has computed all the stages according to the referenced strategy and is ready to start the update. // Its condition status can be one of the following: - // - "True": The staged update run is initialized. - // - "False": The staged update run encountered an error during initialization. + // - "True": The staged update run is initialized successfully. + // - "False": The staged update run encountered an error during initialization and aborted. + // - "Unknown": The staged update run initialization has started. StagedUpdateRunConditionInitialized StagedUpdateRunConditionType = "Initialized" // StagedUpdateRunConditionProgressing indicates whether the staged update run is making progress. @@ -269,11 +271,11 @@ const ( // - "False": The stage updating is waiting/pausing. StageUpdatingConditionProgressing StageUpdatingConditionType = "Progressing" - // ClusterUpdatingStatusConditionSucceeded indicates whether the stage updating is completed successfully. + // StageUpdatingConditionSucceeded indicates whether the stage updating is completed successfully. // Its condition status can be one of the following: // - "True": The stage updating is completed successfully. // - "False": The stage updating encountered an error and stopped. - ClusterUpdatingStatusConditionSucceeded StageUpdatingConditionType = "Succeeded" + StageUpdatingConditionSucceeded StageUpdatingConditionType = "Succeeded" ) // ClusterUpdatingStatus defines the status of the update run on a cluster. 
@@ -311,17 +313,16 @@ type ClusterUpdatingStatus struct { type ClusterUpdatingStatusConditionType string const ( - // UpdatingStatusConditionTypeStarted indicates whether the cluster updating has started. + // ClusterUpdatingConditionStarted indicates whether the cluster updating has started. // Its condition status can be one of the following: // - "True": The cluster updating has started. - // - "False": The stage updating has not started. - UpdatingStatusConditionTypeStarted ClusterUpdatingStatusConditionType = "Started" + ClusterUpdatingConditionStarted ClusterUpdatingStatusConditionType = "Started" - // UpdatingStatusConditionTypeSucceeded indicates whether the cluster updating is completed successfully. + // ClusterUpdatingConditionSucceeded indicates whether the cluster updating is completed successfully. // Its condition status can be one of the following: // - "True": The cluster updating is completed successfully. // - "False": The cluster updating encountered an error and stopped. - UpdatingStatusConditionTypeSucceeded ClusterUpdatingStatusConditionType = "Succeeded" + ClusterUpdatingConditionSucceeded ClusterUpdatingStatusConditionType = "Succeeded" ) type AfterStageTaskStatus struct { @@ -366,54 +367,52 @@ const ( // AfterStageTaskConditionApprovalRequestCreated indicates if the approval request has been created. // Its condition status can be: // - "True": The approval request has been created. - // - "False": The approval request has not been created. AfterStageTaskConditionApprovalRequestCreated AfterStageTaskConditionType = "ApprovalRequestCreated" // AfterStageTaskConditionApprovalRequestApproved indicates if the approval request has been approved. // Its condition status can be: // - "True": The approval request has been approved. - // - "False": The approval request has not been approved. AfterStageTaskConditionApprovalRequestApproved AfterStageTaskConditionType = "ApprovalRequestApproved" - // AfterStageTaskConditionApprovalWaitTimeElapsed indicates if the wait time after each stage has elapsed. + // AfterStageTaskConditionWaitTimeElapsed indicates if the wait time after each stage has elapsed. // If the status is "False", the condition message will include the remaining wait time. // Its condition status can be: // - "True": The wait time has elapsed. // - "False": The wait time has not elapsed. - AfterStageTaskConditionApprovalWaitTimeElapsed AfterStageTaskConditionType = "WaitTimeElapsed" + AfterStageTaskConditionWaitTimeElapsed AfterStageTaskConditionType = "WaitTimeElapsed" ) -// StagedUpdateRunList contains a list of StagedUpdateRun. -// +kubebuilder:resource:scope="Namespaced" +// ClusterStagedUpdateRunList contains a list of ClusterStagedUpdateRun. +// +kubebuilder:resource:scope=Cluster // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -type StagedUpdateRunList struct { +type ClusterStagedUpdateRunList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata,omitempty"` - Items []StagedUpdateRun `json:"items"` + Items []ClusterStagedUpdateRun `json:"items"` } // +genclient -// +genclient:namespaced +// +genclient:Cluster // +kubebuilder:object:root=true // +kubebuilder:subresource:status -// +kubebuilder:resource:scope="Namespaced",categories={fleet,fleet-placement} +// +kubebuilder:resource:scope=Cluster,categories={fleet,fleet-placement},shortName=careq // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -// ApprovalRequest defines a request for user approval. 
+// ClusterApprovalRequest defines a request for user approval for cluster staged update run. // The request object MUST have the following labels: -// - `TargetUpdateRun`: Points to the update run that this approval request is for. +// - `TargetUpdateRun`: Points to the cluster staged update run that this approval request is for. // - `TargetStage`: The name of the stage that this approval request is for. // - `IsLatestUpdateRunApproval`: Indicates whether this approval request is the latest one related to this update run. -type ApprovalRequest struct { +type ClusterApprovalRequest struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - // The desired state of ApprovalRequest. + // The desired state of ClusterApprovalRequest. // +kubebuilder:validation:XValidation:rule="self == oldSelf",message="The spec field is immutable" // +kubebuilder:validation:Required Spec ApprovalRequestSpec `json:"spec"` - // The observed state of ApprovalRequest. + // The observed state of ClusterApprovalRequest. // +kubebuilder:validation:Optional Status ApprovalRequestStatus `json:"status,omitempty"` } @@ -430,7 +429,7 @@ type ApprovalRequestSpec struct { TargetStage string `json:"targetStage"` } -// ApprovalRequestStatus defines the observed state of the ApprovalRequest. +// ApprovalRequestStatus defines the observed state of the ClusterApprovalRequest. type ApprovalRequestStatus struct { // +patchMergeKey=type // +patchStrategy=merge @@ -443,35 +442,27 @@ type ApprovalRequestStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty"` } -// ApprovalRequestConditionType identifies a specific condition of the ApprovalRequest. +// ApprovalRequestConditionType identifies a specific condition of the ClusterApprovalRequest. type ApprovalRequestConditionType string const ( // ApprovalRequestConditionApproved indicates if the approval request was approved. // Its condition status can be: // - "True": The request is approved. - // - "False": The request is not approved. ApprovalRequestConditionApproved ApprovalRequestConditionType = "Approved" - - // ApprovalRequestConditionApprovalAccepted indicates whether the approval request is accepted by the update process. - // Its condition status can be: - // - "True": The approval request is accepted. - // - "False": The approval request is not accepted. - // - "Unknown": The approval request is not yet approved. - ApprovalRequestConditionApprovalAccepted ApprovalRequestConditionType = "ApprovalAccepted" ) -// ApprovalRequestList contains a list of ApprovalRequest. -// +kubebuilder:resource:scope="Namespaced" +// ClusterApprovalRequestList contains a list of ClusterApprovalRequest. 
+// +kubebuilder:resource:scope=Cluster // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -type ApprovalRequestList struct { +type ClusterApprovalRequestList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata,omitempty"` - Items []ApprovalRequest `json:"items"` + Items []ClusterApprovalRequest `json:"items"` } func init() { SchemeBuilder.Register( - &StagedUpdateRun{}, &StagedUpdateRunList{}, &StagedUpdateStrategy{}, &StagedUpdateStrategyList{}, &ApprovalRequest{}, &ApprovalRequestList{}, + &ClusterStagedUpdateRun{}, &ClusterStagedUpdateRunList{}, &ClusterStagedUpdateStrategy{}, &ClusterStagedUpdateStrategyList{}, &ClusterApprovalRequest{}, &ClusterApprovalRequestList{}, ) } diff --git a/apis/placement/v1alpha1/zz_generated.deepcopy.go b/apis/placement/v1alpha1/zz_generated.deepcopy.go index d9caa6b5c..d26fe49a5 100644 --- a/apis/placement/v1alpha1/zz_generated.deepcopy.go +++ b/apis/placement/v1alpha1/zz_generated.deepcopy.go @@ -55,101 +55,101 @@ func (in *AfterStageTaskStatus) DeepCopy() *AfterStageTaskStatus { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ApprovalRequest) DeepCopyInto(out *ApprovalRequest) { +func (in *ApprovalRequestSpec) DeepCopyInto(out *ApprovalRequestSpec) { *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec - in.Status.DeepCopyInto(&out.Status) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApprovalRequest. -func (in *ApprovalRequest) DeepCopy() *ApprovalRequest { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApprovalRequestSpec. +func (in *ApprovalRequestSpec) DeepCopy() *ApprovalRequestSpec { if in == nil { return nil } - out := new(ApprovalRequest) + out := new(ApprovalRequestSpec) in.DeepCopyInto(out) return out } -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *ApprovalRequest) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ApprovalRequestList) DeepCopyInto(out *ApprovalRequestList) { +func (in *ApprovalRequestStatus) DeepCopyInto(out *ApprovalRequestStatus) { *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]ApprovalRequest, len(*in)) + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApprovalRequestList. -func (in *ApprovalRequestList) DeepCopy() *ApprovalRequestList { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApprovalRequestStatus. +func (in *ApprovalRequestStatus) DeepCopy() *ApprovalRequestStatus { if in == nil { return nil } - out := new(ApprovalRequestList) + out := new(ApprovalRequestStatus) in.DeepCopyInto(out) return out } -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
-func (in *ApprovalRequestList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ApprovalRequestSpec) DeepCopyInto(out *ApprovalRequestSpec) { +func (in *ClusterApprovalRequest) DeepCopyInto(out *ClusterApprovalRequest) { *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApprovalRequestSpec. -func (in *ApprovalRequestSpec) DeepCopy() *ApprovalRequestSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterApprovalRequest. +func (in *ClusterApprovalRequest) DeepCopy() *ClusterApprovalRequest { if in == nil { return nil } - out := new(ApprovalRequestSpec) + out := new(ClusterApprovalRequest) in.DeepCopyInto(out) return out } +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterApprovalRequest) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ApprovalRequestStatus) DeepCopyInto(out *ApprovalRequestStatus) { +func (in *ClusterApprovalRequestList) DeepCopyInto(out *ClusterApprovalRequestList) { *out = *in - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]v1.Condition, len(*in)) + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ClusterApprovalRequest, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApprovalRequestStatus. -func (in *ApprovalRequestStatus) DeepCopy() *ApprovalRequestStatus { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterApprovalRequestList. +func (in *ClusterApprovalRequestList) DeepCopy() *ClusterApprovalRequestList { if in == nil { return nil } - out := new(ApprovalRequestStatus) + out := new(ClusterApprovalRequestList) in.DeepCopyInto(out) return out } +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterApprovalRequestList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ClusterResourceOverride) DeepCopyInto(out *ClusterResourceOverride) { *out = *in @@ -431,6 +431,123 @@ func (in *ClusterResourcePlacementEvictionList) DeepCopyObject() runtime.Object return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterStagedUpdateRun) DeepCopyInto(out *ClusterStagedUpdateRun) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStagedUpdateRun. 
+func (in *ClusterStagedUpdateRun) DeepCopy() *ClusterStagedUpdateRun { + if in == nil { + return nil + } + out := new(ClusterStagedUpdateRun) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterStagedUpdateRun) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterStagedUpdateRunList) DeepCopyInto(out *ClusterStagedUpdateRunList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ClusterStagedUpdateRun, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStagedUpdateRunList. +func (in *ClusterStagedUpdateRunList) DeepCopy() *ClusterStagedUpdateRunList { + if in == nil { + return nil + } + out := new(ClusterStagedUpdateRunList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterStagedUpdateRunList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterStagedUpdateStrategy) DeepCopyInto(out *ClusterStagedUpdateStrategy) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStagedUpdateStrategy. +func (in *ClusterStagedUpdateStrategy) DeepCopy() *ClusterStagedUpdateStrategy { + if in == nil { + return nil + } + out := new(ClusterStagedUpdateStrategy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterStagedUpdateStrategy) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterStagedUpdateStrategyList) DeepCopyInto(out *ClusterStagedUpdateStrategyList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ClusterStagedUpdateStrategy, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStagedUpdateStrategyList. +func (in *ClusterStagedUpdateStrategyList) DeepCopy() *ClusterStagedUpdateStrategyList { + if in == nil { + return nil + } + out := new(ClusterStagedUpdateStrategyList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterStagedUpdateStrategyList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. func (in *ClusterUpdatingStatus) DeepCopyInto(out *ClusterUpdatingStatus) { *out = *in @@ -590,21 +707,6 @@ func (in *PlacementEvictionStatus) DeepCopy() *PlacementEvictionStatus { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PlacementReference) DeepCopyInto(out *PlacementReference) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlacementReference. -func (in *PlacementReference) DeepCopy() *PlacementReference { - if in == nil { - return nil - } - out := new(PlacementReference) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ResourceOverride) DeepCopyInto(out *ResourceOverride) { *out = *in @@ -856,70 +958,9 @@ func (in *StageUpdatingStatus) DeepCopy() *StageUpdatingStatus { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *StagedUpdateRun) DeepCopyInto(out *StagedUpdateRun) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec - in.Status.DeepCopyInto(&out.Status) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedUpdateRun. -func (in *StagedUpdateRun) DeepCopy() *StagedUpdateRun { - if in == nil { - return nil - } - out := new(StagedUpdateRun) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *StagedUpdateRun) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *StagedUpdateRunList) DeepCopyInto(out *StagedUpdateRunList) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]StagedUpdateRun, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedUpdateRunList. -func (in *StagedUpdateRunList) DeepCopy() *StagedUpdateRunList { - if in == nil { - return nil - } - out := new(StagedUpdateRunList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *StagedUpdateRunList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *StagedUpdateRunSpec) DeepCopyInto(out *StagedUpdateRunSpec) { *out = *in - out.PlacementRef = in.PlacementRef - out.StagedUpdateStrategyRef = in.StagedUpdateStrategyRef } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedUpdateRunSpec. @@ -935,8 +976,16 @@ func (in *StagedUpdateRunSpec) DeepCopy() *StagedUpdateRunSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *StagedUpdateRunStatus) DeepCopyInto(out *StagedUpdateRunStatus) { *out = *in - in.ApplyStrategy.DeepCopyInto(&out.ApplyStrategy) - in.StagedUpdateStrategySnapshot.DeepCopyInto(&out.StagedUpdateStrategySnapshot) + if in.ApplyStrategy != nil { + in, out := &in.ApplyStrategy, &out.ApplyStrategy + *out = new(v1beta1.ApplyStrategy) + (*in).DeepCopyInto(*out) + } + if in.StagedUpdateStrategySnapshot != nil { + in, out := &in.StagedUpdateStrategySnapshot, &out.StagedUpdateStrategySnapshot + *out = new(StagedUpdateStrategySpec) + (*in).DeepCopyInto(*out) + } if in.StagesStatus != nil { in, out := &in.StagesStatus, &out.StagesStatus *out = make([]StageUpdatingStatus, len(*in)) @@ -944,7 +993,11 @@ func (in *StagedUpdateRunStatus) DeepCopyInto(out *StagedUpdateRunStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - in.DeletionStageStatus.DeepCopyInto(&out.DeletionStageStatus) + if in.DeletionStageStatus != nil { + in, out := &in.DeletionStageStatus, &out.DeletionStageStatus + *out = new(StageUpdatingStatus) + (*in).DeepCopyInto(*out) + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]v1.Condition, len(*in)) @@ -964,64 +1017,6 @@ func (in *StagedUpdateRunStatus) DeepCopy() *StagedUpdateRunStatus { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *StagedUpdateStrategy) DeepCopyInto(out *StagedUpdateStrategy) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedUpdateStrategy. -func (in *StagedUpdateStrategy) DeepCopy() *StagedUpdateStrategy { - if in == nil { - return nil - } - out := new(StagedUpdateStrategy) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *StagedUpdateStrategy) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *StagedUpdateStrategyList) DeepCopyInto(out *StagedUpdateStrategyList) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]StagedUpdateStrategy, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedUpdateStrategyList. -func (in *StagedUpdateStrategyList) DeepCopy() *StagedUpdateStrategyList { - if in == nil { - return nil - } - out := new(StagedUpdateStrategyList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *StagedUpdateStrategyList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *StagedUpdateStrategySpec) DeepCopyInto(out *StagedUpdateStrategySpec) { *out = *in diff --git a/apis/placement/v1beta1/commons.go b/apis/placement/v1beta1/commons.go index dce5bb8e8..656a5110d 100644 --- a/apis/placement/v1beta1/commons.go +++ b/apis/placement/v1beta1/commons.go @@ -6,18 +6,25 @@ Licensed under the MIT license. package v1beta1 const ( - ClusterResourcePlacementKind = "ClusterResourcePlacement" - ClusterResourcePlacementResource = "clusterresourceplacements" - ClusterResourceBindingKind = "ClusterResourceBinding" - ClusterResourceSnapshotKind = "ClusterResourceSnapshot" + // ClusterResourcePlacementKind represents the kind of ClusterResourcePlacement. + ClusterResourcePlacementKind = "ClusterResourcePlacement" + // ClusterResourcePlacementResource represents the resource name for ClusterResourcePlacement. + ClusterResourcePlacementResource = "clusterresourceplacements" + // ClusterResourceBindingKind represents the kind of ClusterResourceBinding. + ClusterResourceBindingKind = "ClusterResourceBinding" + // ClusterResourceSnapshotKind represents the kind of ClusterResourceSnapshot. + ClusterResourceSnapshotKind = "ClusterResourceSnapshot" + // ClusterSchedulingPolicySnapshotKind represents the kind of ClusterSchedulingPolicySnapshot. ClusterSchedulingPolicySnapshotKind = "ClusterSchedulingPolicySnapshot" - WorkKind = "Work" - AppliedWorkKind = "AppliedWork" + // WorkKind represents the kind of Work. + WorkKind = "Work" + // AppliedWorkKind represents the kind of AppliedWork. + AppliedWorkKind = "AppliedWork" ) const ( - // Unprefixed labels/annotations are reserved for end-users - // we will add a kubernetes-fleet.io to designate these labels/annotations as official fleet labels/annotations. + // fleetPrefix is the prefix used for official fleet labels/annotations. + // Unprefixed labels/annotations are reserved for end-users. // See https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#label-selector-and-annotation-conventions fleetPrefix = "kubernetes-fleet.io/" @@ -29,25 +36,25 @@ const ( // cluster. WorkFinalizer = fleetPrefix + "work-cleanup" - // CRPTrackingLabel is the label that points to the cluster resource policy that creates a resource binding. + // CRPTrackingLabel points to the cluster resource policy that creates this resource binding. CRPTrackingLabel = fleetPrefix + "parent-CRP" - // IsLatestSnapshotLabel tells if the snapshot is the latest one. + // IsLatestSnapshotLabel indicates if the snapshot is the latest one. IsLatestSnapshotLabel = fleetPrefix + "is-latest-snapshot" - // FleetResourceLabelKey is that label that indicates the resource is a fleet resource. + // FleetResourceLabelKey indicates that the resource is a fleet resource. FleetResourceLabelKey = fleetPrefix + "is-fleet-resource" - // FirstWorkNameFmt is the format of the name of the work generated with first resource snapshot . + // FirstWorkNameFmt is the format of the name of the work generated with the first resource snapshot. // The name of the first work is {crpName}-work. FirstWorkNameFmt = "%s-work" - // WorkNameWithSubindexFmt is the format of the name of a work generated with resource snapshot with subindex. - // The name of the first work is {crpName}-{subindex}. + // WorkNameWithSubindexFmt is the format of the name of a work generated with a resource snapshot with a subindex. + // The name of the work is {crpName}-{subindex}. 
 	WorkNameWithSubindexFmt = "%s-%d"
 
-	// WorkNameWithConfigEnvelopeFmt is the format of the name of a work generated with config envelop.
-	// The format is {workPrefix}-configMap-uuid
+	// WorkNameWithConfigEnvelopeFmt is the format of the name of a work generated with a config envelope.
+	// The format is {workPrefix}-configMap-uuid.
 	WorkNameWithConfigEnvelopeFmt = "%s-configmap-%s"
 
 	// ParentResourceSnapshotIndexLabel is the label applied to work that contains the index of the resource snapshot that generates the work.
@@ -56,25 +63,23 @@ const (
 	// ParentBindingLabel is the label applied to work that contains the name of the binding that generates the work.
 	ParentBindingLabel = fleetPrefix + "parent-resource-binding"
 
-	// CRPGenerationAnnotation is the annotation that indicates the generation of the CRP from
-	// which an object is derived or last updated.
+	// CRPGenerationAnnotation indicates the generation of the CRP from which an object is derived or last updated.
 	CRPGenerationAnnotation = fleetPrefix + "CRP-generation"
 
-	// EnvelopeConfigMapAnnotation is the annotation that indicates the configmap is an envelope configmap that contains resources
-	// we need to apply to the member cluster instead of the configMap itself.
+	// EnvelopeConfigMapAnnotation indicates that the configmap is an envelope configmap containing resources to be applied to the member cluster instead of the configMap itself.
 	EnvelopeConfigMapAnnotation = fleetPrefix + "envelope-configmap"
 
-	// EnvelopeTypeLabel is the label that marks the work object as generated from an envelope object.
+	// EnvelopeTypeLabel marks the work object as generated from an envelope object.
 	// The value of the annotation is the type of the envelope object.
 	EnvelopeTypeLabel = fleetPrefix + "envelope-work"
 
-	// EnvelopeNamespaceLabel is the label that contains the namespace of the envelope object that the work is generated from.
+	// EnvelopeNamespaceLabel contains the namespace of the envelope object that the work is generated from.
 	EnvelopeNamespaceLabel = fleetPrefix + "envelope-namespace"
 
-	// EnvelopeNameLabel is the label that contains the name of the envelope object that the work is generated from.
+	// EnvelopeNameLabel contains the name of the envelope object that the work is generated from.
 	EnvelopeNameLabel = fleetPrefix + "envelope-name"
 
-	// PreviousBindingStateAnnotation is the annotation that records the previous state of a binding.
+	// PreviousBindingStateAnnotation records the previous state of a binding.
 	// This is used to remember if an "unscheduled" binding was moved from a "bound" state or a "scheduled" state.
PreviousBindingStateAnnotation = fleetPrefix + "previous-binding-state" ) diff --git a/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml similarity index 91% rename from config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml rename to config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml index 1c199f1b8..03d208d0a 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml @@ -4,26 +4,28 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.15.0 - name: approvalrequests.placement.kubernetes-fleet.io + name: clusterapprovalrequests.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io names: categories: - fleet - fleet-placement - kind: ApprovalRequest - listKind: ApprovalRequestList - plural: approvalrequests - singular: approvalrequest - scope: Namespaced + kind: ClusterApprovalRequest + listKind: ClusterApprovalRequestList + plural: clusterapprovalrequests + shortNames: + - careq + singular: clusterapprovalrequest + scope: Cluster versions: - name: v1alpha1 schema: openAPIV3Schema: description: |- - ApprovalRequest defines a request for user approval. + ClusterApprovalRequest defines a request for user approval for cluster staged update run. The request object MUST have the following labels: - - `TargetUpdateRun`: Points to the update run that this approval request is for. + - `TargetUpdateRun`: Points to the cluster staged update run that this approval request is for. - `TargetStage`: The name of the stage that this approval request is for. - `IsLatestUpdateRunApproval`: Indicates whether this approval request is the latest one related to this update run. properties: @@ -45,7 +47,7 @@ spec: metadata: type: object spec: - description: The desired state of ApprovalRequest. + description: The desired state of ClusterApprovalRequest. properties: parentStageRollout: description: The name of the staged update run that this approval @@ -63,7 +65,7 @@ spec: - message: The spec field is immutable rule: self == oldSelf status: - description: The observed state of ApprovalRequest. + description: The observed state of ClusterApprovalRequest. 
properties: conditions: description: |- diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml similarity index 97% rename from config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml rename to config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml index 69cdd278f..f149b3e98 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml @@ -4,28 +4,31 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.15.0 - name: stagedupdateruns.placement.kubernetes-fleet.io + name: clusterstagedupdateruns.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io names: categories: - fleet - fleet-placement - kind: StagedUpdateRun - listKind: StagedUpdateRunList - plural: stagedupdateruns - singular: stagedupdaterun - scope: Namespaced + kind: ClusterStagedUpdateRun + listKind: ClusterStagedUpdateRunList + plural: clusterstagedupdateruns + shortNames: + - crsur + singular: clusterstagedupdaterun + scope: Cluster versions: - name: v1alpha1 schema: openAPIV3Schema: description: |- - StagedUpdateRun represents a stage by stage update process that applies selected resources to specified clusters. + ClusterStagedUpdateRun represents a stage by stage update process that applies ClusterResourcePlacement + selected resources to specified clusters. Resources from unselected clusters are removed after all stages in the update strategy are completed. - Each StagedUpdateRun object corresponds to a single release of a specific resource version. - The release is abandoned if the StagedUpdateRun object is deleted or the scheduling decision changes. - The name of the StagedUpdateRun must conform to RFC 1123. + Each ClusterStagedUpdateRun object corresponds to a single release of a specific resource version. + The release is abandoned if the ClusterStagedUpdateRun object is deleted or the scheduling decision changes. + The name of the ClusterStagedUpdateRun must conform to RFC 1123. properties: apiVersion: description: |- @@ -45,52 +48,38 @@ spec: metadata: type: object spec: - description: The desired state of StagedUpdateRun. The spec is immutable. + description: The desired state of ClusterStagedUpdateRun. The spec is + immutable. properties: - placementRef: + placementName: description: |- - A reference to the placement that this update run is applied to. + PlacementName is the name of placement that this update run is applied to. There can be multiple active update runs for each placement, but it's up to the DevOps team to ensure they don't conflict with each other. - properties: - name: - description: Name is the name of the referenced placement. - type: string - required: - - name - type: object + maxLength: 255 + type: string resourceSnapshotIndex: description: |- The resource snapshot index of the selected resources to be updated across clusters. The index represents a group of resource snapshots that includes all the resources a ResourcePlacement selected. type: string - stagedRolloutStrategyRef: + stagedRolloutStrategyName: description: |- - The reference to the update strategy that specifies the stages and the sequence + The name of the update strategy that specifies the stages and the sequence in which the selected resources will be updated on the member clusters. 
The stages are computed according to the referenced strategy when the update run starts and recorded in the status field. - properties: - name: - description: Name is the name of the namespaced scope resource. - type: string - namespace: - description: Namespace is namespace of the namespaced scope resource. - type: string - required: - - name - - namespace - type: object + type: string required: - - placementRef + - placementName - resourceSnapshotIndex - - stagedRolloutStrategyRef + - stagedRolloutStrategyName type: object x-kubernetes-validations: - message: The spec field is immutable rule: self == oldSelf status: - description: The observed status of StagedUpdateRun. + description: The observed status of ClusterStagedUpdateRun. properties: appliedStrategy: description: |- @@ -720,6 +709,12 @@ spec: - clusters - stageName type: object + policyObservedClusterCount: + description: |- + PolicyObservedClusterCount records the number of observed clusters in the policy snapshot. + It is recorded at the beginning of the update run from the policy snapshot object. + If the `ObservedClusterCount` value is updated during the update run, the update run is abandoned. + type: integer policySnapshotIndexUsed: description: |- PolicySnapShotIndexUsed records the policy snapshot index of the ClusterResourcePlacement (CRP) that diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml similarity index 92% rename from config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml rename to config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml index 63c97c2b3..673ce5ce5 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml @@ -4,25 +4,27 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.15.0 - name: stagedupdatestrategies.placement.kubernetes-fleet.io + name: clusterstagedupdatestrategies.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io names: categories: - fleet - fleet-placement - kind: StagedUpdateStrategy - listKind: StagedUpdateStrategyList - plural: stagedupdatestrategies - singular: stagedupdatestrategy - scope: Namespaced + kind: ClusterStagedUpdateStrategy + listKind: ClusterStagedUpdateStrategyList + plural: clusterstagedupdatestrategies + shortNames: + - sus + singular: clusterstagedupdatestrategy + scope: Cluster versions: - name: v1alpha1 schema: openAPIV3Schema: description: |- - StagedUpdateStrategy defines a reusable strategy that specifies the stages and the sequence - in which the selected resources will be updated on the member clusters. + ClusterStagedUpdateStrategy defines a reusable strategy that specifies the stages and the sequence + in which the selected cluster resources will be updated on the member clusters. properties: apiVersion: description: |- @@ -42,7 +44,7 @@ spec: metadata: type: object spec: - description: The desired state of StagedUpdateStrategy. + description: The desired state of ClusterStagedUpdateStrategy. properties: stages: description: Stage specifies the configuration for each update stage. 
diff --git a/pkg/controllers/clusterresourceplacement/resource_selector.go b/pkg/controllers/clusterresourceplacement/resource_selector.go index 94225683a..8cdf2a3fa 100644 --- a/pkg/controllers/clusterresourceplacement/resource_selector.go +++ b/pkg/controllers/clusterresourceplacement/resource_selector.go @@ -286,7 +286,7 @@ func (r *Reconciler) fetchAllResourcesInOneNamespace(namespaceName string, place } for _, obj := range objs { uObj := obj.DeepCopyObject().(*unstructured.Unstructured) - shouldInclude, err := utils.ShouldPropagateObj(r.InformerManager, uObj) + shouldInclude, err := controller.ShouldPropagateObj(r.InformerManager, uObj) if err != nil { klog.ErrorS(err, "cannot determine if we should propagate an object", "object", klog.KObj(uObj)) return nil, err diff --git a/pkg/controllers/rollout/controller.go b/pkg/controllers/rollout/controller.go index e94e67189..17c56111b 100644 --- a/pkg/controllers/rollout/controller.go +++ b/pkg/controllers/rollout/controller.go @@ -125,7 +125,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim // fill out all the default values for CRP just in case the mutation webhook is not enabled. defaulter.SetDefaultsClusterResourcePlacement(&crp) - matchedCRO, matchedRO, err := r.fetchAllMatchingOverridesForResourceSnapshot(ctx, crp.Name, latestResourceSnapshot) + matchedCRO, matchedRO, err := controller.FetchAllMatchOverridesForResourceSnapshot(ctx, r.Client, r.InformerManager, crp.Name, latestResourceSnapshot) if err != nil { klog.ErrorS(err, "Failed to find all matching overrides for the clusterResourcePlacement", "clusterResourcePlacement", crpName) return runtime.Result{}, err @@ -370,7 +370,7 @@ func (r *Reconciler) pickBindingsToRoll(ctx context.Context, allBindings []*flee schedulerTargetedBinds = append(schedulerTargetedBinds, binding) // this binding has not been bound yet, so it is an update candidate // pickFromResourceMatchedOverridesForTargetCluster always returns the ordered list of the overrides. - cro, ro, err := r.pickFromResourceMatchedOverridesForTargetCluster(ctx, binding, matchedCROs, matchedROs) + cro, ro, err := controller.PickFromResourceMatchedOverridesForTargetCluster(ctx, r.Client, binding.Spec.TargetCluster, matchedCROs, matchedROs) if err != nil { return nil, nil, false, minWaitTime, err } @@ -395,7 +395,7 @@ func (r *Reconciler) pickBindingsToRoll(ctx context.Context, allBindings []*flee canBeReadyBindings = append(canBeReadyBindings, binding) } // pickFromResourceMatchedOverridesForTargetCluster always returns the ordered list of the overrides. 
- cro, ro, err := r.pickFromResourceMatchedOverridesForTargetCluster(ctx, binding, matchedCROs, matchedROs) + cro, ro, err := controller.PickFromResourceMatchedOverridesForTargetCluster(ctx, r.Client, binding.Spec.TargetCluster, matchedCROs, matchedROs) if err != nil { return nil, nil, false, 0, err } diff --git a/pkg/controllers/rollout/controller_test.go b/pkg/controllers/rollout/controller_test.go index 8d5f92e58..49b7732be 100644 --- a/pkg/controllers/rollout/controller_test.go +++ b/pkg/controllers/rollout/controller_test.go @@ -15,6 +15,7 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/util/workqueue" "k8s.io/utils/ptr" @@ -64,6 +65,17 @@ var ( } ) +func serviceScheme(t *testing.T) *runtime.Scheme { + scheme := runtime.NewScheme() + if err := fleetv1beta1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add placement v1beta1 scheme: %v", err) + } + if err := clusterv1beta1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add cluster v1beta1 scheme: %v", err) + } + return scheme +} + func TestReconcilerHandleResourceSnapshot(t *testing.T) { tests := map[string]struct { snapshot client.Object diff --git a/pkg/controllers/updaterun/controller.go b/pkg/controllers/updaterun/controller.go new file mode 100644 index 000000000..1642b02d6 --- /dev/null +++ b/pkg/controllers/updaterun/controller.go @@ -0,0 +1,197 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. +*/ + +// Package updaterun features controllers to reconcile the stagedUpdateRun objects. +package updaterun + +import ( + "context" + "errors" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + runtime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1" + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/utils" + "go.goms.io/fleet/pkg/utils/condition" + "go.goms.io/fleet/pkg/utils/controller" + "go.goms.io/fleet/pkg/utils/informer" +) + +// Reconciler reconciles a ClusterStagedUpdateRun object +type Reconciler struct { + client.Client + recorder record.EventRecorder + // the informer contains the cache for all the resources we need. 
+ // to check the resource scope + InformerManager informer.Manager +} + +func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtime.Result, error) { + startTime := time.Now() + klog.V(2).InfoS("StagedUpdateRun reconciliation starts", "stagedUpdateRun", req.NamespacedName) + defer func() { + latency := time.Since(startTime).Milliseconds() + klog.V(2).InfoS("StagedUpdateRun reconciliation ends", "stagedUpdateRun", req.NamespacedName, "latency", latency) + }() + + var updateRun placementv1alpha1.ClusterStagedUpdateRun + if err := r.Client.Get(ctx, req.NamespacedName, &updateRun); err != nil { + klog.ErrorS(err, "Failed to get stagedUpdateRun object", "stagedUpdateRun", req.Name) + return runtime.Result{}, client.IgnoreNotFound(err) + } + runObjRef := klog.KObj(&updateRun) + + // Handle the deletion of the stagedUpdateRun + if !updateRun.DeletionTimestamp.IsZero() { + klog.V(2).InfoS("The stagedUpdateRun is being deleted", "stagedUpdateRun", runObjRef) + return runtime.Result{}, r.handleDelete(ctx, updateRun.DeepCopy()) + } + + // Add the finalizer to the stagedUpdateRun + if err := r.ensureFinalizer(ctx, &updateRun); err != nil { + klog.ErrorS(err, "Failed to add the finalizer to the stagedUpdateRun", "stagedUpdateRun", runObjRef) + return runtime.Result{}, err + } + var updatingStageIndex int + var tobeUpdatedBinding, tobeDeletedBinding []*placementv1beta1.ClusterResourceBinding + var err error + initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1alpha1.StagedUpdateRunConditionInitialized)) + if !condition.IsConditionStatusTrue(initCond, updateRun.Generation) { + if condition.IsConditionStatusFalse(initCond, updateRun.Generation) { + klog.InfoS("The stagedUpdateRun has failed to initialize", "errorMsg", initCond.Message, "stagedUpdateRun", runObjRef) + return runtime.Result{}, nil + } + klog.V(2).InfoS("The stagedUpdateRun is not initialized", "stagedUpdateRun", runObjRef) + if tobeUpdatedBinding, tobeDeletedBinding, err = r.initialize(ctx, &updateRun); err != nil { + klog.ErrorS(err, "Failed to initialize the stagedUpdateRun", "stagedUpdateRun", runObjRef) + // errInitializedFailed cannot be retried + if errors.Is(err, errInitializedFailed) { + return runtime.Result{}, r.recordInitializationFailed(ctx, &updateRun, err.Error()) + } + return runtime.Result{}, err + } + updatingStageIndex = 0 //start from the first stage + klog.V(2).InfoS("Initialized the stagedUpdateRun", "stagedUpdateRun", runObjRef) + } else { + klog.V(2).InfoS("The stagedUpdateRun is initialized", "stagedUpdateRun", runObjRef) + // Check if the stagedUpdateRun is finished + finishedCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1alpha1.StagedUpdateRunConditionSucceeded)) + if condition.IsConditionStatusTrue(finishedCond, updateRun.Generation) || condition.IsConditionStatusFalse(finishedCond, updateRun.Generation) { + klog.V(2).InfoS("The stagedUpdateRun is finished", "finishedSuccessfully", finishedCond.Status, "stagedUpdateRun", runObjRef) + return runtime.Result{}, nil + } + // Validate the stagedUpdateRun status to ensure the update can be continued and get the updating stage index and cluster indices + if updatingStageIndex, tobeUpdatedBinding, tobeDeletedBinding, err = r.validateUpdateRunStatus(ctx, &updateRun); err != nil { + // errStagedUpdatedAborted cannot be retried + if errors.Is(err, errStagedUpdatedAborted) { + return runtime.Result{}, r.recordUpdateRunFailed(ctx, &updateRun, err.Error()) + } + return runtime.Result{}, err 
+ } + klog.V(2).InfoS("Validated the stagedUpdateRun", "stagedUpdateRun", runObjRef) + } + // the previous run is completed but the update to the status failed + if updatingStageIndex == -1 { + klog.V(2).InfoS("the stagedUpdateRun is completed", "stagedUpdateRun", runObjRef) + return runtime.Result{}, r.recordUpdateRunSucceeded(ctx, &updateRun) + } + // execute the update run + klog.V(2).InfoS("Continue to execute the stagedUpdateRun", "updatingStageIndex", updatingStageIndex, "stagedUpdateRun", runObjRef) + finished, waitTime, executeErr := r.executeUpdateRun(ctx, &updateRun, updatingStageIndex, tobeUpdatedBinding, tobeDeletedBinding) + if executeErr != nil { + // errStagedUpdatedAborted cannot be retried + if errors.Is(executeErr, errStagedUpdatedAborted) { + return runtime.Result{}, r.recordUpdateRunFailed(ctx, &updateRun, executeErr.Error()) + } + return runtime.Result{}, executeErr + } + if finished { + klog.V(2).InfoS("The stagedUpdateRun is finished", "stagedUpdateRun", runObjRef) + return runtime.Result{}, r.recordUpdateRunSucceeded(ctx, &updateRun) + } + // retry if the update run is not finished + klog.V(2).InfoS("The stagedUpdateRun is not finished yet", "requeueWaitTime", waitTime, "stagedUpdateRun", runObjRef) + return runtime.Result{RequeueAfter: waitTime}, nil +} + +// handleDelete handles the deletion of the stagedUpdateRun object +// We need to wait for the update run to stop before deleting the stagedUpdateRun object +// We will delete all the dependent resources, such as approvalRequest objects, of the stagedUpdateRun object. +func (r *Reconciler) handleDelete(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun) error { + runObjRef := klog.KObj(updateRun) + // delete all the associated approvalRequests + approvalRequest := &placementv1alpha1.ClusterApprovalRequest{} + if err := r.Client.DeleteAllOf(ctx, approvalRequest, client.InNamespace(updateRun.GetNamespace()), client.MatchingLabels{placementv1alpha1.TargetUpdateRunLabel: updateRun.GetName()}); err != nil { + klog.ErrorS(err, "Failed to delete all associated approvalRequests", "stagedUpdateRun", runObjRef) + return controller.NewAPIServerError(false, err) + } + klog.V(2).InfoS("Deleted all approvalRequests associated with the stagedUpdateRun", "stagedUpdateRun", runObjRef) + controllerutil.RemoveFinalizer(updateRun, placementv1alpha1.StagedUpdateRunFinalizer) + if err := r.Client.Update(ctx, updateRun); err != nil { + klog.ErrorS(err, "Failed to remove updateRun finalizer", "stagedUpdateRun", runObjRef) + return controller.NewUpdateIgnoreConflictError(err) + } + return nil +} + +// ensureFinalizer makes sure that the member cluster CR has a finalizer on it +func (r *Reconciler) ensureFinalizer(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun) error { + if controllerutil.ContainsFinalizer(updateRun, placementv1alpha1.StagedUpdateRunFinalizer) { + return nil + } + klog.InfoS("Added the staged update run finalizer", "stagedUpdateRun", klog.KObj(updateRun)) + controllerutil.AddFinalizer(updateRun, placementv1alpha1.StagedUpdateRunFinalizer) + return r.Update(ctx, updateRun, client.FieldOwner(utils.UpdateRunControllerFieldManagerName)) +} + +// SetupWithManager sets up the controller with the Manager. +func (r *Reconciler) SetupWithManager(mgr runtime.Manager) error { + r.recorder = mgr.GetEventRecorderFor("clusterresource-stagedupdaterun-controller") + return runtime.NewControllerManagedBy(mgr). + Named("clusterresource-stagedupdaterun-controller"). 
+ For(&placementv1alpha1.ClusterStagedUpdateRun{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
+ Watches(&placementv1alpha1.ClusterApprovalRequest{}, &handler.Funcs{
+ // We only care about when an approval request is approved.
+ UpdateFunc: func(ctx context.Context, e event.UpdateEvent, q workqueue.RateLimitingInterface) {
+ klog.V(2).InfoS("Handling an approvalRequest update event", "approvalRequest", klog.KObj(e.ObjectNew))
+ handleApprovalRequest(e.ObjectNew, q)
+ },
+ GenericFunc: func(ctx context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) {
+ klog.V(2).InfoS("Handling an approvalRequest generic event", "approvalRequest", klog.KObj(e.Object))
+ handleApprovalRequest(e.Object, q)
+ },
+ }).Complete(r)
+}
+
+// handleApprovalRequest finds the CRP name from the approval request and enqueues the CRP to the updaterun controller queue.
+func handleApprovalRequest(approvalRequest client.Object, q workqueue.RateLimitingInterface) {
+ // get the CRP name from the label
+ crp := approvalRequest.GetLabels()[placementv1beta1.CRPTrackingLabel]
+ if len(crp) == 0 {
+ // should never happen, we might be able to alert on this error
+ klog.ErrorS(controller.NewUnexpectedBehaviorError(fmt.Errorf("cannot find CRPTrackingLabel label value")),
+ "Invalid approvalRequest", "approvalRequest", klog.KObj(approvalRequest))
+ return
+ }
+ // enqueue the CRP to the updaterun controller queue
+ q.Add(reconcile.Request{
+ NamespacedName: types.NamespacedName{Name: crp},
+ })
+}
diff --git a/pkg/controllers/updaterun/executing.go b/pkg/controllers/updaterun/executing.go
new file mode 100644
index 000000000..7f867bce8
--- /dev/null
+++ b/pkg/controllers/updaterun/executing.go
@@ -0,0 +1,479 @@
+package updaterun
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "reflect"
+ "time"
+
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ "k8s.io/apimachinery/pkg/api/meta"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/klog/v2"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+
+ placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1"
+ placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
+ "go.goms.io/fleet/pkg/utils/condition"
+ "go.goms.io/fleet/pkg/utils/controller"
+)
+
+const (
+ // clusterUpdatingWaitTime is the time to wait before rechecking a cluster that is still being updated.
+ clusterUpdatingWaitTime = 15 * time.Second
+ // stageUpdatingWaitTime is the time to wait before rechecking the after stage tasks once all clusters in a stage have been updated.
+ stageUpdatingWaitTime = 60 * time.Second
+)
+
+// executeUpdateRun executes the update run by updating the clusters stage by stage, starting from the stage pointed to by updatingStageIndex.
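+// The flow, as implemented below, is roughly:
+//  1. mark the run as started (Progressing condition);
+//  2. if updatingStageIndex still points at a regular stage, drive that stage via executeUpdatingStage,
+//     persisting any status change and requeueing after the returned wait time;
+//  3. once every regular stage is done, run the delete stage, which removes the bindings of unselected clusters.
+// It returns whether the whole run has finished, the suggested requeue wait time, and any execution error
+// (an error wrapping errStagedUpdatedAborted means the run cannot be resumed).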
+func (r *Reconciler) executeUpdateRun(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun, updatingStageIndex int, + tobeUpdatedBinding, tobeDeletedBinding []*placementv1beta1.ClusterResourceBinding) (bool, time.Duration, error) { + // mark the update run as started regardless if it is already marked + markUpdateRunStarted(updateRun) + if updatingStageIndex < len(updateRun.Status.StagesStatus) { + updatingStage := &updateRun.Status.StagesStatus[updatingStageIndex] + waitTime, executingErr := r.executeUpdatingStage(ctx, updateRun, updatingStageIndex, tobeUpdatedBinding) + if errors.Is(executingErr, errStagedUpdatedAborted) { + markStageUpdatingFailed(updatingStage, updateRun.Generation, executingErr.Error()) + return true, waitTime, executingErr + } + // the stage has not finished yet + if updatingErr := r.recordUpdateRunStatus(ctx, updateRun); updatingErr != nil { + klog.ErrorS(updatingErr, "Failed to update the stagedUpdateRun status", "stagedUpdateRun", klog.KObj(updateRun)) + } + return false, waitTime, executingErr + } + // all the stages have been finished, now start the delete stage + finished, executingErr := r.executeDeleteStage(ctx, tobeDeletedBinding, updateRun) + if !finished { + if updatingErr := r.recordUpdateRunStatus(ctx, updateRun); updatingErr != nil { + klog.ErrorS(updatingErr, "Failed to update the stagedUpdateRun status", "stagedUpdateRun", klog.KObj(updateRun)) + } + } + return finished, clusterUpdatingWaitTime, executingErr +} + +// executeUpdatingStage executes the updating stage by updating the clusters in the updatingStage with the update run. +func (r *Reconciler) executeUpdatingStage(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun, updatingStageIndex int, tobeUpdatedBinding []*placementv1beta1.ClusterResourceBinding) (time.Duration, error) { + updatingStageStatus := &updateRun.Status.StagesStatus[updatingStageIndex] + resourceSnapshotName := updateRun.Spec.ResourceSnapshotIndex + // create the map of the tobeUpdatedBinding + tobeUpdatedBindingMap := make(map[string]*placementv1beta1.ClusterResourceBinding, len(tobeUpdatedBinding)) + for _, binding := range tobeUpdatedBinding { + tobeUpdatedBindingMap[binding.Spec.TargetCluster] = binding + } + finishedClusterCount := 0 + // go through each cluster in the stage and check if it is updated + for i := range updatingStageStatus.Clusters { + clusterStatus := &updatingStageStatus.Clusters[i] + clusterStartedCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1alpha1.ClusterUpdatingConditionStarted)) + clusterUpdateSucceededCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1alpha1.ClusterUpdatingConditionSucceeded)) + if condition.IsConditionStatusFalse(clusterUpdateSucceededCond, updateRun.Generation) { + // the cluster is marked as failed to update + failedErr := fmt.Errorf("the to be updated cluster `%s` in the stage %s has failed", clusterStatus.ClusterName, updatingStageStatus.StageName) + klog.ErrorS(failedErr, "The cluster has failed to be updated", "stagedUpdateRun", klog.KObj(updateRun)) + return 0, fmt.Errorf("%w: %s", errStagedUpdatedAborted, failedErr.Error()) + } + if condition.IsConditionStatusTrue(clusterUpdateSucceededCond, updateRun.Generation) { + // the cluster is marked as finished updating successfully + finishedClusterCount++ + continue + } + // the cluster is either updating or not started updating yet + binding := tobeUpdatedBindingMap[clusterStatus.ClusterName] + availCond := 
binding.GetCondition(string(placementv1beta1.ResourceBindingAvailable)) + if !condition.IsConditionStatusTrue(clusterStartedCond, updateRun.Generation) { + // the cluster has not started updating yet + markClusterUpdatingStarted(clusterStatus, updateRun.Generation) + if finishedClusterCount == 0 { + markStageUpdatingStarted(updatingStageStatus, updateRun.Generation) + } + if !isBindingSyncedWithClusterStatus(updateRun, binding, clusterStatus) { + klog.V(2).InfoS("Find the first cluster that needs to be updated", "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "stagedUpdateRun", klog.KObj(updateRun)) + // The binding is not up to date with the cluster status. + binding.Spec.State = placementv1beta1.BindingStateBound + binding.Spec.ResourceSnapshotName = resourceSnapshotName + binding.Spec.ResourceOverrideSnapshots = clusterStatus.ResourceOverrideSnapshots + binding.Spec.ClusterResourceOverrideSnapshots = clusterStatus.ClusterResourceOverrideSnapshots + binding.Spec.ApplyStrategy = updateRun.Status.ApplyStrategy + if err := r.Client.Update(ctx, binding); err != nil { + klog.ErrorS(err, "Failed to update binding to be bound with matching spec with the update run", "clusterResourceBinding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "stagedUpdateRun", klog.KObj(updateRun)) + return 0, controller.NewUpdateIgnoreConflictError(err) + } + klog.V(2).InfoS("Updated the status of a binding to bound", "clusterResourceBinding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "stagedUpdateRun", klog.KObj(updateRun)) + } else { + klog.V(2).InfoS("Find the first binding that is updating but the cluster status has not been updated", "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "stagedUpdateRun", klog.KObj(updateRun)) + if binding.Spec.State != placementv1beta1.BindingStateBound { + if err := r.Client.Update(ctx, binding); err != nil { + klog.ErrorS(err, "Failed to update binding to be bound", "clusterResourceBinding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "stagedUpdateRun", klog.KObj(updateRun)) + return 0, controller.NewUpdateIgnoreConflictError(err) + } + klog.V(2).InfoS("Updated the status of a binding to bound", "clusterResourceBinding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "stagedUpdateRun", klog.KObj(updateRun)) + } else { + if _, updateErr := checkClusterUpgradeResult(availCond, binding, clusterStatus, updatingStageStatus, updateRun); updateErr != nil { + return clusterUpdatingWaitTime, updateErr + } + } + } + // no need to continue as we only support one cluster updating at a time for now + return clusterUpdatingWaitTime, nil + } + // now the cluster has to be updating, the binding should point to the right resource snapshot and the binding should be bound. 
+ if !isBindingSyncedWithClusterStatus(updateRun, binding, clusterStatus) || binding.Spec.State != placementv1beta1.BindingStateBound {
+ unexpectedErr := fmt.Errorf("the updating cluster `%s` in the stage %s does not match the cluster status: %+v, binding := %+v", clusterStatus.ClusterName, updatingStageStatus.StageName, clusterStatus, binding.Spec)
+ klog.ErrorS(controller.NewUnexpectedBehaviorError(unexpectedErr), "The binding has been changed after the updating, please check if there is a concurrent update run", "stagedUpdateRun", klog.KObj(updateRun))
+ markClusterUpdatingFailed(clusterStatus, updateRun.Generation, unexpectedErr.Error())
+ return 0, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unexpectedErr.Error())
+ }
+ if finishedClusterCount == 0 {
+ markStageUpdatingStarted(updatingStageStatus, updateRun.Generation)
+ }
+ finished, resultErr := checkClusterUpgradeResult(availCond, binding, clusterStatus, updatingStageStatus, updateRun)
+ if finished {
+ finishedClusterCount++
+ continue
+ }
+ // no need to continue as we only support one cluster updating at a time for now
+ return clusterUpdatingWaitTime, resultErr
+ }
+ if finishedClusterCount == len(updatingStageStatus.Clusters) {
+ // all the clusters in the stage have been updated
+ markStageUpdatingWaiting(updatingStageStatus, updateRun.Generation)
+ klog.V(2).InfoS("The stage has finished all cluster updating", "stage", updatingStageStatus.StageName, "stagedUpdateRun", klog.KObj(updateRun))
+ // Check if the after stage tasks are ready.
+ approved, err := r.checkAfterStageTasksStatus(ctx, updatingStageIndex, updateRun)
+ if err != nil {
+ return 0, err
+ }
+ if approved {
+ markStageUpdatingSucceeded(updatingStageStatus, updateRun.Generation)
+ return 0, nil // no need to wait to get to the next stage
+ }
+ return stageUpdatingWaitTime, nil
+ }
+ return clusterUpdatingWaitTime, nil
+}
+
+// checkClusterUpgradeResult checks if the cluster has been updated successfully.
+// It returns whether the cluster has been updated successfully and an error if the cluster upgrade has failed.
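+// The availability signal comes from the cluster's ClusterResourceBinding: when the Available condition
+// turns True for the binding's current generation, the cluster is marked as updated. Otherwise the loop
+// below walks the intermediate binding conditions (from Overridden up to, but not including, Applied, per
+// the ordering defined in the condition package) and surfaces the first one that is False as an error so
+// the caller keeps retrying, since we cannot tell whether the failure is recoverable.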
+func checkClusterUpgradeResult(availCond *metav1.Condition, binding *placementv1beta1.ClusterResourceBinding, clusterStatus *placementv1alpha1.ClusterUpdatingStatus,
+ updatingStage *placementv1alpha1.StageUpdatingStatus, updateRun *placementv1alpha1.ClusterStagedUpdateRun) (bool, error) {
+ if condition.IsConditionStatusTrue(availCond, binding.Generation) {
+ // the resource updated on the cluster is available
+ klog.InfoS("The cluster has been updated", "cluster", clusterStatus.ClusterName, "stage", updatingStage.StageName, "stagedUpdateRun", klog.KObj(updateRun))
+ markClusterUpdatingSucceeded(clusterStatus, updateRun.Generation)
+ return true, nil
+ }
+ for i := condition.OverriddenCondition; i < condition.AppliedCondition; i++ {
+ bindingCond := binding.GetCondition(string(i.ResourceBindingConditionType()))
+ if condition.IsConditionStatusFalse(bindingCond, binding.Generation) {
+ // we have no way to know if the failed condition is recoverable or not, so we just let it run
+ klog.InfoS("The cluster upgrading encountered an error", "failedCondition", bindingCond, "cluster", clusterStatus.ClusterName, "stage", updatingStage.StageName, "stagedUpdateRun", klog.KObj(updateRun))
+ // TODO: identify non-recoverable errors and mark the cluster upgrading as failed
+ return false, fmt.Errorf("the cluster upgrading encountered an error at stage `%s`, err := `%s`", string(i.ResourceBindingConditionType()), bindingCond.Message)
+ }
+ }
+ return false, nil
+}
+
+// isBindingSyncedWithClusterStatus checks if the binding is updated with the cluster status.
+func isBindingSyncedWithClusterStatus(updateRun *placementv1alpha1.ClusterStagedUpdateRun, binding *placementv1beta1.ClusterResourceBinding, cluster *placementv1alpha1.ClusterUpdatingStatus) bool {
+ return binding.Spec.ResourceSnapshotName == updateRun.Spec.ResourceSnapshotIndex &&
+ reflect.DeepEqual(cluster.ResourceOverrideSnapshots, binding.Spec.ResourceOverrideSnapshots) &&
+ reflect.DeepEqual(cluster.ClusterResourceOverrideSnapshots, binding.Spec.ClusterResourceOverrideSnapshots) &&
+ reflect.DeepEqual(binding.Spec.ApplyStrategy, updateRun.Status.ApplyStrategy)
+}
+
+// checkAfterStageTasksStatus checks if the after stage tasks have finished.
+// It returns whether the after stage tasks have finished and an error if any of them has failed.
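+// Two task types are handled: TimedWait, which compares the stage's Progressing transition time plus the
+// configured WaitTime against the current time, and Approval, which creates (or re-reads) a
+// ClusterApprovalRequest named "<updateRun>-<stage>" (see ApprovalTaskNameFmt) and waits for its Approved
+// condition. As an illustration only (the serialized field names and enum values are assumed from the Go
+// types, not taken from this patch), a stage could declare:
+//
+//	afterStageTasks:
+//	- type: TimedWait
+//	  waitTime: 1h
+//	- type: Approval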
+func (r *Reconciler) checkAfterStageTasksStatus(ctx context.Context, updatingStageIndex int, updateRun *placementv1alpha1.ClusterStagedUpdateRun) (bool, error) { + updatingStageStatus := &updateRun.Status.StagesStatus[updatingStageIndex] + updatingStage := &updateRun.Status.StagedUpdateStrategySnapshot.Stages[updatingStageIndex] + if updatingStage.AfterStageTasks == nil { + klog.V(2).InfoS("There is no after stage task for this stage", "stage", updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + return true, nil + } + for i, task := range updatingStage.AfterStageTasks { + switch task.Type { + case placementv1alpha1.AfterStageTaskTypeTimedWait: + waitStartTime := meta.FindStatusCondition(updatingStageStatus.Conditions, string(placementv1alpha1.StageUpdatingConditionProgressing)).LastTransitionTime.Time + // check if the wait time has passed + if waitStartTime.Add(task.WaitTime.Duration).After(time.Now()) { + klog.V(2).InfoS("The after stage task still need to wait", "waitStartTime", waitStartTime, "waitTime", task.WaitTime, "stage", updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + return false, nil + } + markAfterStageWaitTimeElapsed(&updatingStageStatus.AfterStageTaskStatus[i], updateRun.Generation) + klog.V(2).InfoS("The after stage wait task has completed", "stage", updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + + case placementv1alpha1.AfterStageTaskTypeApproval: + // check if the approval request has been created + approvalRequest := placementv1alpha1.ClusterApprovalRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: updatingStageStatus.AfterStageTaskStatus[i].ApprovalRequestName, + Namespace: updateRun.Namespace, + Labels: map[string]string{ + placementv1alpha1.TargetUpdatingStageNameLabel: updatingStage.Name, + placementv1alpha1.TargetUpdateRunLabel: updateRun.Name, + placementv1alpha1.IsLatestUpdateRunApprovalLabel: "true", + }, + }, + Spec: placementv1alpha1.ApprovalRequestSpec{ + TargetUpdateRun: updateRun.Name, + TargetStage: updatingStage.Name, + }, + } + requestRef := klog.KObj(&approvalRequest) + if err := r.Create(ctx, &approvalRequest); err != nil { + if apierrors.IsAlreadyExists(err) { + // the approval task already exists + markAfterStageRequestCreated(&updatingStageStatus.AfterStageTaskStatus[i], updateRun.Generation) + if err = r.Get(ctx, client.ObjectKeyFromObject(&approvalRequest), &approvalRequest); err != nil { + klog.ErrorS(err, "Failed to get the already existing approval request", "approvalRequest", requestRef, "stage", updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + return false, err + } + if approvalRequest.Spec.TargetStage != updatingStage.Name || approvalRequest.Spec.TargetUpdateRun != updateRun.Name { + unexpectedErr := fmt.Errorf("the approval request task `%s` is targeting update run `%s` and stage `%s` ", approvalRequest.Name, approvalRequest.Spec.TargetStage, approvalRequest.Spec.TargetUpdateRun) + klog.ErrorS(controller.NewUnexpectedBehaviorError(unexpectedErr), "Found an approval request targeting wrong stage", "approvalRequestTask", requestRef, "stage", updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + return false, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unexpectedErr.Error()) + } + if !condition.IsConditionStatusTrue(meta.FindStatusCondition(approvalRequest.Status.Conditions, string(placementv1alpha1.ApprovalRequestConditionApproved)), approvalRequest.Generation) { + klog.V(2).InfoS("The approval request has not been approved yet", "approvalRequestTask", requestRef, "stage", 
updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + return false, nil + } + klog.V(2).InfoS("The approval request has been approved", "approvalRequestTask", requestRef, "stage", updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + markAfterStageRequestApproved(&updatingStageStatus.AfterStageTaskStatus[i], updateRun.Generation) + } else { + // retryable error + klog.ErrorS(err, "Failed to create the approval request", "approvalRequest", requestRef, "stage", updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + return false, err + } + } else { + // the approval request has been created for the first time + klog.V(2).InfoS("The approval request has been created", "approvalRequestTask", requestRef, "stage", updatingStage.Name, "stagedUpdateRun", klog.KObj(updateRun)) + markAfterStageRequestCreated(&updatingStageStatus.AfterStageTaskStatus[i], updateRun.Generation) + return false, nil + } + } + } + // all the after stage tasks have been finished or the for loop will return before this line + return true, nil +} + +// executeDeleteStage executes the delete stage by updating the clusters in the deleteStage with the update run. +func (r *Reconciler) executeDeleteStage(ctx context.Context, tobeDeletedBindings []*placementv1beta1.ClusterResourceBinding, updateRun *placementv1alpha1.ClusterStagedUpdateRun) (bool, error) { + existingDeleteStageStatus := updateRun.Status.DeletionStageStatus + existingDeleteStageClustersMap := make(map[string]*placementv1alpha1.ClusterUpdatingStatus, len(existingDeleteStageStatus.Clusters)) + for i, clusterStatus := range existingDeleteStageStatus.Clusters { + existingDeleteStageClustersMap[clusterStatus.ClusterName] = &existingDeleteStageStatus.Clusters[i] + } + deletingBinding := 0 + // check that the clusters in the stage are part of the tobeDeletedBindings + for _, binding := range tobeDeletedBindings { + curCluster, exist := existingDeleteStageClustersMap[binding.Spec.TargetCluster] + if !exist { + missingErr := fmt.Errorf("the to be deleted cluster `%s` is not in the deleting stage", binding.Spec.TargetCluster) + klog.ErrorS(missingErr, "The cluster in the deleting stage does not include all the to be deleted binding", "stagedUpdateRun", klog.KObj(updateRun)) + return false, fmt.Errorf("%w: %s", errStagedUpdatedAborted, missingErr.Error()) + } + delete(existingDeleteStageClustersMap, binding.Spec.TargetCluster) + // make sure the cluster is not marked as deleted as the binding is still there + if condition.IsConditionStatusTrue(meta.FindStatusCondition(curCluster.Conditions, string(placementv1alpha1.ClusterUpdatingConditionSucceeded)), updateRun.Generation) { + unexpectedErr := fmt.Errorf("the deleted cluster `%s` in the deleting stage is not deleted yet", binding.Spec.TargetCluster) + klog.ErrorS(controller.NewUnexpectedBehaviorError(unexpectedErr), "The cluster in the deleting stage is not removed yet but marked as deleted", "cluster", curCluster.ClusterName, "stagedUpdateRun", klog.KObj(updateRun)) + return false, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unexpectedErr.Error()) + } + if condition.IsConditionStatusTrue(meta.FindStatusCondition(curCluster.Conditions, string(placementv1alpha1.ClusterUpdatingConditionStarted)), updateRun.Generation) { + // the cluster status is marked as being deleted + if binding.DeletionTimestamp.IsZero() { + // the cluster is marked as deleting but the binding is not deleting + unexpectedErr := fmt.Errorf("the cluster `%s` in the deleting stage is marked as deleting but its corresponding binding 
is not deleting", curCluster.ClusterName) + klog.ErrorS(controller.NewUnexpectedBehaviorError(unexpectedErr), "The binding should be deleting before we mark a cluster deleting", "clusterStatus", curCluster, "stagedUpdateRun", klog.KObj(updateRun)) + return false, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unexpectedErr.Error()) + } + deletingBinding++ + continue + } + // the cluster status is not deleting yet + if isBindingSyncedWithClusterStatus(updateRun, binding, curCluster) { + // delete the binding that is up to date with the cluster status. + if err := r.Client.Delete(ctx, binding); err != nil { + klog.ErrorS(err, "Failed to delete a binding in the update run", "clusterResourceBinding", klog.KObj(binding), "cluster", curCluster.ClusterName, "stagedUpdateRun", klog.KObj(updateRun)) + return false, controller.NewAPIServerError(false, err) + } + klog.V(2).InfoS("Deleted a binding pointing to a to be deleted cluster", "clusterResourceBinding", klog.KObj(binding), "cluster", curCluster.ClusterName, "stagedUpdateRun", klog.KObj(updateRun)) + markClusterUpdatingStarted(curCluster, updateRun.Generation) + if deletingBinding == 0 { + markStageUpdatingStarted(updateRun.Status.DeletionStageStatus, updateRun.Generation) + } + } else { + // the binding is not up to date with the cluster status, it could happen if there are multiple update stage running concurrently. + delErr := fmt.Errorf("the deleted cluster `%s` in the deleting stage does not match the cluster status: %+v, binding := %+v", curCluster.ClusterName, curCluster, binding.Spec) + klog.ErrorS(delErr, "The binding has been changed after the updating, please check if there is con-current update run", "stagedUpdateRun", klog.KObj(updateRun)) + return false, fmt.Errorf("%w: %s", errStagedUpdatedAborted, delErr.Error()) + } + } + // the rest of the clusters in the stage are not in the tobeDeletedBindings so it should be marked as delete succeeded + for _, clusterStatus := range existingDeleteStageClustersMap { + // make sure the cluster is marked as deleting + if !condition.IsConditionStatusTrue(meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1alpha1.ClusterUpdatingConditionStarted)), updateRun.Generation) { + markClusterUpdatingStarted(clusterStatus, updateRun.Generation) + } + markClusterUpdatingSucceeded(clusterStatus, updateRun.Generation) + } + klog.InfoS("The delete stage is progressing", "numberOfDeletingClusters", len(tobeDeletedBindings), "stagedUpdateRun", klog.KObj(updateRun)) + if len(tobeDeletedBindings) == 0 { + markStageUpdatingSucceeded(updateRun.Status.DeletionStageStatus, updateRun.Generation) + } + return len(tobeDeletedBindings) == 0, nil +} + +// recordUpdateRunSucceeded marks the update run as succeeded in memory. +func markAfterStageRequestCreated(afterStageTaskStatus *placementv1alpha1.AfterStageTaskStatus, generation int64) { + meta.SetStatusCondition(&afterStageTaskStatus.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.AfterStageTaskConditionApprovalRequestCreated), + Status: metav1.ConditionTrue, + ObservedGeneration: generation, + Reason: condition.AfterStageTaskApprovalRequestCreatedReason, + }) +} + +// recordUpdateRunSucceeded marks the update run as succeeded in memory. 
+func markAfterStageRequestApproved(afterStageTaskStatus *placementv1alpha1.AfterStageTaskStatus, generation int64) { + meta.SetStatusCondition(&afterStageTaskStatus.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.AfterStageTaskConditionApprovalRequestApproved), + Status: metav1.ConditionTrue, + ObservedGeneration: generation, + Reason: condition.AfterStageTaskApprovalRequestApprovedReason, + }) +} + +// recordUpdateRunSucceeded marks the update run as succeeded in memory. +func markAfterStageWaitTimeElapsed(afterStageTaskStatus *placementv1alpha1.AfterStageTaskStatus, generation int64) { + meta.SetStatusCondition(&afterStageTaskStatus.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.AfterStageTaskConditionWaitTimeElapsed), + Status: metav1.ConditionTrue, + ObservedGeneration: generation, + Reason: condition.AfterStageTaskWaitTimeElapsedReason, + }) +} + +// recordUpdateRunSucceeded mark the update run as succeeded in memory. +func markClusterUpdatingStarted(clusterUpdatingStatus *placementv1alpha1.ClusterUpdatingStatus, generation int64) { + meta.SetStatusCondition(&clusterUpdatingStatus.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.ClusterUpdatingConditionStarted), + Status: metav1.ConditionTrue, + ObservedGeneration: generation, + Reason: condition.ClusterUpdatingStartedReason, + }) +} + +// markClusterUpdatingFailed mark the cluster updating failed in memory. +func markClusterUpdatingFailed(clusterUpdatingStatus *placementv1alpha1.ClusterUpdatingStatus, generation int64, message string) { + meta.SetStatusCondition(&clusterUpdatingStatus.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.ClusterUpdatingConditionSucceeded), + Status: metav1.ConditionFalse, + ObservedGeneration: generation, + Reason: condition.ClusterUpdatingFailedReason, + Message: message, + }) +} + +// markClusterUpdatingSucceeded mark the cluster updating succeeded in memory. +func markClusterUpdatingSucceeded(clusterUpdatingStatus *placementv1alpha1.ClusterUpdatingStatus, generation int64) { + meta.SetStatusCondition(&clusterUpdatingStatus.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.ClusterUpdatingConditionSucceeded), + Status: metav1.ConditionTrue, + ObservedGeneration: generation, + Reason: condition.ClusterUpdatingSucceededReason, + }) +} + +// markStageUpdatingStarted mark the stage updating started in memory. +func markStageUpdatingStarted(stageUpdatingStatus *placementv1alpha1.StageUpdatingStatus, generation int64) { + if stageUpdatingStatus.StartTime == nil { + stageUpdatingStatus.StartTime = &metav1.Time{Time: time.Now()} + } + meta.SetStatusCondition(&stageUpdatingStatus.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.StageUpdatingConditionProgressing), + Status: metav1.ConditionTrue, + ObservedGeneration: generation, + Reason: condition.StageUpdatingStartedReason, + }) +} + +// markStageUpdatingWaiting mark the stage updating as waiting in memory. +func markStageUpdatingWaiting(stageUpdatingStatus *placementv1alpha1.StageUpdatingStatus, generation int64) { + meta.SetStatusCondition(&stageUpdatingStatus.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.StageUpdatingConditionProgressing), + Status: metav1.ConditionFalse, + ObservedGeneration: generation, + Reason: condition.StageUpdatingWaitingReason, + }) +} + +// markStageUpdatingFailed mark the stage updating failed in memory. 
+func markStageUpdatingFailed(stageUpdatingStatus *placementv1alpha1.StageUpdatingStatus, generation int64, message string) {
+ if stageUpdatingStatus.StartTime == nil {
+ stageUpdatingStatus.StartTime = &metav1.Time{Time: time.Now()}
+ }
+ if stageUpdatingStatus.EndTime == nil {
+ stageUpdatingStatus.EndTime = &metav1.Time{Time: time.Now()}
+ }
+ meta.SetStatusCondition(&stageUpdatingStatus.Conditions, metav1.Condition{
+ Type: string(placementv1alpha1.StageUpdatingConditionSucceeded),
+ Status: metav1.ConditionFalse,
+ ObservedGeneration: generation,
+ Reason: condition.StageUpdatingFailedReason,
+ Message: message,
+ })
+}
+
+// markStageUpdatingSucceeded marks the stage updating as succeeded in memory.
+func markStageUpdatingSucceeded(stageUpdatingStatus *placementv1alpha1.StageUpdatingStatus, generation int64) {
+ if stageUpdatingStatus.StartTime == nil {
+ stageUpdatingStatus.StartTime = &metav1.Time{Time: time.Now()}
+ }
+ if stageUpdatingStatus.EndTime == nil {
+ stageUpdatingStatus.EndTime = &metav1.Time{Time: time.Now()}
+ }
+ meta.SetStatusCondition(&stageUpdatingStatus.Conditions, metav1.Condition{
+ Type: string(placementv1alpha1.StageUpdatingConditionSucceeded),
+ Status: metav1.ConditionTrue,
+ ObservedGeneration: generation,
+ Reason: condition.StageUpdatingSucceededReason,
+ })
+}
+
+// markUpdateRunStarted marks the update run as started (progressing) in memory.
+func markUpdateRunStarted(updateRun *placementv1alpha1.ClusterStagedUpdateRun) {
+ meta.SetStatusCondition(&updateRun.Status.Conditions, metav1.Condition{
+ Type: string(placementv1alpha1.StagedUpdateRunConditionProgressing),
+ Status: metav1.ConditionTrue,
+ ObservedGeneration: updateRun.Generation,
+ Reason: condition.UpdateRunStartedReason,
+ })
+}
+
+// recordUpdateRunStatus updates the ClusterStagedUpdateRun status on the API server.
+func (r *Reconciler) recordUpdateRunStatus(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun) error {
+ if updateErr := r.Client.Status().Update(ctx, updateRun); updateErr != nil {
+ klog.ErrorS(updateErr, "Failed to update the ClusterStagedUpdateRun status", "stagedUpdateRun", klog.KObj(updateRun))
+ return updateErr
+ }
+ return nil
+}
+
+// recordUpdateRunSucceeded records the update run as succeeded.
+func (r *Reconciler) recordUpdateRunSucceeded(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun) error {
+ meta.SetStatusCondition(&updateRun.Status.Conditions, metav1.Condition{
+ Type: string(placementv1alpha1.StagedUpdateRunConditionSucceeded),
+ Status: metav1.ConditionTrue,
+ ObservedGeneration: updateRun.Generation,
+ Reason: condition.UpdateRunSucceededReason,
+ })
+ if updateErr := r.Client.Status().Update(ctx, updateRun); updateErr != nil {
+ klog.ErrorS(updateErr, "Failed to update the ClusterStagedUpdateRun status as completed successfully", "stagedUpdateRun", klog.KObj(updateRun))
+ return updateErr
+ }
+ return nil
+}
diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go
new file mode 100644
index 000000000..c8dda80b2
--- /dev/null
+++ b/pkg/controllers/updaterun/initialization.go
@@ -0,0 +1,373 @@
+/*
+Copyright (c) Microsoft Corporation.
+Licensed under the MIT license.
+*/ + +package updaterun + +import ( + "context" + "fmt" + "sort" + "strconv" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" + + clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" + placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1" + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/utils/annotations" + "go.goms.io/fleet/pkg/utils/condition" + "go.goms.io/fleet/pkg/utils/controller" +) + +var errInitializedFailed = fmt.Errorf("%w: failed to initialize the StagedUpdateRun", errStagedUpdatedAborted) + +// initialize initializes the ClusterStagedUpdateRun object with all the stages computed during the initialization. +// This function is called only once during the initialization of the ClusterStagedUpdateRun. +func (r *Reconciler) initialize(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun) ([]*placementv1beta1.ClusterResourceBinding, []*placementv1beta1.ClusterResourceBinding, error) { + // Validate the ClusterResourcePlacement object referenced by the ClusterStagedUpdateRun + placementName, err := r.validateCRP(ctx, updateRun) + if err != nil { + return nil, nil, err + } + // Record the latest policy snapshot associated with the ClusterResourcePlacement + latestPolicySnapshot, _, err := r.determinePolicySnapshot(ctx, placementName, updateRun) + if err != nil { + return nil, nil, err + } + // Collect the scheduled clusters by the corresponding ClusterResourcePlacement with the latest policy snapshot + scheduledBinding, tobeDeleted, err := r.collectScheduledClusters(ctx, placementName, latestPolicySnapshot, updateRun) + if err != nil { + return nil, nil, err + } + // Compute the stages based on the StagedUpdateStrategy + if err = r.generateStageByStrategy(ctx, scheduledBinding, tobeDeleted, updateRun); err != nil { + return nil, nil, err + } + // Record the override snapshots associated with each cluster + if err = r.recordOverrideSnapshots(ctx, updateRun); err != nil { + return nil, nil, err + } + // Update the ClusterStagedUpdateRun's initialized condition + meta.SetStatusCondition(&updateRun.Status.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.StagedUpdateRunConditionInitialized), + Status: metav1.ConditionTrue, + ObservedGeneration: updateRun.Generation, + Reason: condition.UpdateRunInitializeSucceededReason, + Message: "update run initialized successfully", + }) + return scheduledBinding, tobeDeleted, r.Client.Status().Update(ctx, updateRun) +} + +// validateCRP validates the ClusterResourcePlacement object referenced by the ClusterStagedUpdateRun. 
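+// It also requires the referenced ClusterResourcePlacement to use the external rollout strategy type,
+// i.e. the CRP itself does not roll bindings out; the update run does. In CRP terms this corresponds to a
+// strategy such as the sketch below (the exact serialized value of ExternalRolloutStrategyType is an
+// assumption here, shown for illustration only):
+//
+//	spec:
+//	  strategy:
+//	    type: External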
+func (r *Reconciler) validateCRP(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun) (string, error) { + updateRunRef := klog.KObj(updateRun) + // Fetch the ClusterResourcePlacement object + clusterResourcePlacementName := updateRun.Spec.PlacementName + var clusterResourcePlacement placementv1beta1.ClusterResourcePlacement + if err := r.Get(ctx, client.ObjectKey{Name: clusterResourcePlacementName}, &clusterResourcePlacement); err != nil { + klog.ErrorS(err, "Failed to get ClusterResourcePlacement", "clusterResourcePlacement", clusterResourcePlacementName, "stagedUpdateRun", updateRunRef) + if apierrors.IsNotFound(err) { + // we won't continue the initialization if the ClusterResourcePlacement is not found + return "", fmt.Errorf("%w: %s", errInitializedFailed, "Parent placement not found") + } + return "", err + } + // Check if the ClusterResourcePlacement has an external rollout strategy + if clusterResourcePlacement.Spec.Strategy.Type != placementv1beta1.ExternalRolloutStrategyType { + klog.V(2).InfoS("The ClusterResourcePlacement does not have an external rollout strategy", "clusterResourcePlacement", clusterResourcePlacementName, "stagedUpdateRun", updateRunRef) + return "", fmt.Errorf("%w: %s", errInitializedFailed, "The ClusterResourcePlacement does not have an external rollout strategy") + } + updateRun.Status.ApplyStrategy = clusterResourcePlacement.Spec.Strategy.ApplyStrategy + return clusterResourcePlacement.Name, nil +} + +// determinePolicySnapshot retrieves the latest policy snapshot associated with the ClusterResourcePlacement and validates it and records it in the ClusterStagedUpdateRun status. +func (r *Reconciler) determinePolicySnapshot(ctx context.Context, placementName string, updateRun *placementv1alpha1.ClusterStagedUpdateRun) (*placementv1beta1.ClusterSchedulingPolicySnapshot, int, error) { + updateRunRef := klog.KObj(updateRun) + // Get the latest policy snapshot + var policySnapshotList placementv1beta1.ClusterSchedulingPolicySnapshotList + latestPolicyMatcher := client.MatchingLabels{ + placementv1beta1.CRPTrackingLabel: placementName, + placementv1beta1.IsLatestSnapshotLabel: "true", + } + if err := r.List(ctx, &policySnapshotList, latestPolicyMatcher); err != nil { + klog.ErrorS(err, "Failed to list the latest policy snapshots of a cluster resource placement", "clusterResourcePlacement", placementName, "stagedUpdateRun", updateRunRef) + return nil, -1, err + } + if len(policySnapshotList.Items) != 1 { + if len(policySnapshotList.Items) > 1 { + err := fmt.Errorf("more than one latest policy snapshot associated with cluster resource placement: %s", placementName) + klog.ErrorS(controller.NewUnexpectedBehaviorError(err), "Failed to find the latest policy snapshot", "clusterResourcePlacement", placementName, "numberOfSnapshot", len(policySnapshotList.Items), "stagedUpdateRun", updateRunRef) + return nil, -1, fmt.Errorf("%w: %s", errInitializedFailed, err.Error()) + } + err := fmt.Errorf("no latest policy snapshot associated with cluster resource placement: %s", placementName) + klog.ErrorS(err, "Failed to find the latest policy snapshot", "clusterResourcePlacement", placementName, "numberOfSnapshot", len(policySnapshotList.Items), "stagedUpdateRun", updateRunRef) + return nil, -1, fmt.Errorf("%w: %s", errInitializedFailed, err.Error()) + } + // Get the node count from the latest policy snapshot + latestPolicySnapshot := policySnapshotList.Items[0] + updateRun.Status.PolicySnapshotIndexUsed = latestPolicySnapshot.Name + clusterCount, err := 
annotations.ExtractNumOfClustersFromPolicySnapshot(&latestPolicySnapshot) + if err != nil { + annErr := fmt.Errorf("%w, the policySnapshot `%s` doesn't have cluster count annotation", err, latestPolicySnapshot.Name) + klog.ErrorS(controller.NewUnexpectedBehaviorError(annErr), "Failed to get the cluster count from the latestPolicySnapshot", "clusterResourcePlacement", placementName, "latestPolicySnapshot", latestPolicySnapshot.Name, "stagedUpdateRun", updateRunRef) + return nil, -1, fmt.Errorf("%w: %s", errInitializedFailed, annErr.Error()) + } + updateRun.Status.PolicyObservedClusterCount = clusterCount + klog.V(2).InfoS("Found the corresponding policy snapshot", "policySnapshot", latestPolicySnapshot.Name, "observed CRP generation", updateRun.Status.PolicyObservedClusterCount, "stagedUpdateRun", updateRunRef) + if !condition.IsConditionStatusTrue(latestPolicySnapshot.GetCondition(string(placementv1beta1.PolicySnapshotScheduled)), latestPolicySnapshot.Generation) { + scheduleErr := fmt.Errorf("policy snapshot not fully scheduled yet") + klog.ErrorS(scheduleErr, "The policy snapshot is not scheduled successfully", "clusterResourcePlacement", placementName, "latestPolicySnapshot", latestPolicySnapshot.Name, "stagedUpdateRun", updateRunRef) + return nil, -1, fmt.Errorf("%w: %s", errInitializedFailed, scheduleErr.Error()) + } + return &latestPolicySnapshot, clusterCount, nil +} + +// collectScheduledClusters retrieves the scheduled clusters from the latest policy snapshot and lists all the bindings according to its SchedulePolicyTrackingLabel. +func (r *Reconciler) collectScheduledClusters(ctx context.Context, placementName string, latestPolicySnapshot *placementv1beta1.ClusterSchedulingPolicySnapshot, + updateRun *placementv1alpha1.ClusterStagedUpdateRun) ([]*placementv1beta1.ClusterResourceBinding, []*placementv1beta1.ClusterResourceBinding, error) { + updateRunRef := klog.KObj(updateRun) + // List all the bindings according to the SchedulePolicyTrackingLabel + var bindingsList placementv1beta1.ClusterResourceBindingList + schedulePolicyMatcher := client.MatchingLabels{ + placementv1beta1.CRPTrackingLabel: placementName, + } + if err := r.List(ctx, &bindingsList, schedulePolicyMatcher); err != nil { + klog.ErrorS(err, "Failed to list bindings according to the SchedulePolicyTrackingLabel", "policySnapshot", latestPolicySnapshot.Name, "stagedUpdateRun", updateRunRef) + return nil, nil, err + } + var tobeDeleted, selectedBindings []*placementv1beta1.ClusterResourceBinding + for i, binding := range bindingsList.Items { + if binding.Spec.SchedulingPolicySnapshotName == latestPolicySnapshot.Name { + if binding.Spec.State != placementv1beta1.BindingStateScheduled { + return nil, nil, controller.NewUnexpectedBehaviorError(fmt.Errorf("binding `%s`'s state %s is not scheduled", binding.Name, binding.Spec.State)) + } + klog.V(2).InfoS("Found a scheduled binding", "binding", binding.Name, "policySnapshot", latestPolicySnapshot.Name, "stagedUpdateRun", updateRunRef) + selectedBindings = append(selectedBindings, &bindingsList.Items[i]) + } else { + klog.V(2).InfoS("Found a to be deleted binding", "binding", binding.Name, "policySnapshot", latestPolicySnapshot.Name, "stagedUpdateRun", updateRunRef) + tobeDeleted = append(tobeDeleted, &bindingsList.Items[i]) + } + } + if len(selectedBindings) == 0 { + err := fmt.Errorf("no scheduled bindings found for the policy snapshot: %s", latestPolicySnapshot.Name) + klog.ErrorS(err, "Failed to find the scheduled bindings", "policySnapshot", latestPolicySnapshot.Name, 
"stagedUpdateRun", updateRunRef) + return nil, nil, fmt.Errorf("%w: %s", errInitializedFailed, err.Error()) + } + return selectedBindings, tobeDeleted, nil +} + +// generateStageByStrategy computes the stages based on the StagedUpdateStrategy the ClusterStagedUpdateRun references. +func (r *Reconciler) generateStageByStrategy(ctx context.Context, scheduledBindings, tobeDeletedBindings []*placementv1beta1.ClusterResourceBinding, updateRun *placementv1alpha1.ClusterStagedUpdateRun) error { + // Fetch the StagedUpdateStrategy referenced by StagedUpdateStrategyName + var updateStrategy placementv1alpha1.ClusterStagedUpdateStrategy + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRun.Spec.StagedUpdateStrategyName}, &updateStrategy); err != nil { + klog.ErrorS(err, "Failed to get StagedUpdateStrategy", "stagedUpdateStrategy", updateRun.Spec.StagedUpdateStrategyName) + if apierrors.IsNotFound(err) { + // we won't continue the initialization if the StagedUpdateStrategy is not found + return fmt.Errorf("%w: %s", errInitializedFailed, "referenced update strategy not found") + } + return err + } + // this won't change even if the stagedUpdateStrategy changes or is deleted after the updateRun is initialized + updateRun.Status.StagedUpdateStrategySnapshot = &updateStrategy.Spec + // Record the stages in the ClusterStagedUpdateRun status + err := r.computeRunStageStatus(ctx, scheduledBindings, updateRun) + if err != nil { + return err + } + // Record the clusters to be deleted + tobeDeletedCluster := make([]placementv1alpha1.ClusterUpdatingStatus, len(tobeDeletedBindings)) + for i, binding := range tobeDeletedBindings { + klog.V(2).InfoS("Add a cluster to the delete stage", "cluster", binding.Spec.TargetCluster, "stagedUpdateStrategy", updateRun.Spec.StagedUpdateStrategyName, "stagedUpdateRun", klog.KObj(updateRun)) + tobeDeletedCluster[i].ClusterName = binding.Spec.TargetCluster + } + // Sort the clusters in the stage based on the cluster name + sort.Slice(tobeDeletedCluster, func(i, j int) bool { + return tobeDeletedCluster[i].ClusterName < tobeDeletedCluster[j].ClusterName + }) + updateRun.Status.DeletionStageStatus = &placementv1alpha1.StageUpdatingStatus{ + StageName: placementv1alpha1.UpdateRunDeleteStageName, + Clusters: tobeDeletedCluster, + } + return nil +} + +// computeRunStageStatus computes the stages based on the StagedUpdateStrategy and scheduled run and records them in the ClusterStagedUpdateRun status. 
+func (r *Reconciler) computeRunStageStatus(ctx context.Context, scheduledBindings []*placementv1beta1.ClusterResourceBinding, updateRun *placementv1alpha1.ClusterStagedUpdateRun) error { + updateRunRef := klog.KObj(updateRun) + stagedUpdateStrategyName := updateRun.Spec.StagedUpdateStrategyName + // Map to track clusters and ensure they appear in only one stage + allSelectedClusters := make(map[string]bool, len(scheduledBindings)) + allPlacedClusters := make(map[string]bool) + for _, binding := range scheduledBindings { + allSelectedClusters[binding.Spec.TargetCluster] = true + } + // Apply the label selectors from the StagedUpdateStrategy to filter the clusters + for _, stage := range updateRun.Status.StagedUpdateStrategySnapshot.Stages { + if err := validateAfterStageTask(stage.AfterStageTasks); err != nil { + klog.ErrorS(err, "Failed to validate the after stage tasks", "stagedUpdateStrategy", stagedUpdateStrategyName, "stage", stage.Name, "stagedUpdateRun", updateRunRef) + + return fmt.Errorf("%w: the after stage tasks are invalide, stagedUpdateStrategy is `%s`, stage Name is `%s`, err = %s", errInitializedFailed, stagedUpdateStrategyName, stage.Name, err.Error()) + } + curSageUpdatingStatus := placementv1alpha1.StageUpdatingStatus{ + StageName: stage.Name, + } + var curStageClusters []clusterv1beta1.MemberCluster + labelSelector, err := metav1.LabelSelectorAsSelector(stage.LabelSelector) + if err != nil { + klog.ErrorS(err, "Failed to convert label selector", "stagedUpdateStrategy", stagedUpdateStrategyName, "stage", stage.Name, "labelSelector", stage.LabelSelector, "stagedUpdateRun", updateRunRef) + return fmt.Errorf("%w: the stage label selector is invalide, stagedUpdateStrategy is `%s`, stage Name is `%s`, err = %s", errInitializedFailed, stagedUpdateStrategyName, stage.Name, err.Error()) + } + // List all the clusters that match the label selector + clusterList := &clusterv1beta1.MemberClusterList{} + listOptions := &client.ListOptions{LabelSelector: labelSelector} + if err = r.List(ctx, clusterList, listOptions); err != nil { + klog.ErrorS(err, "Failed to list clusters for the stage", "stagedUpdateStrategy", stagedUpdateStrategyName, "stage", stage.Name, "stagedUpdateRun", updateRunRef) + return err + } + // intersect the selected clusters with the clusters in the stage + for _, cluster := range clusterList.Items { + if allSelectedClusters[cluster.Name] { + if allPlacedClusters[cluster.Name] { + err = fmt.Errorf("cluster `%s` appears in more than one stage", cluster.Name) + klog.ErrorS(err, "Failed to compute the stages", "stagedUpdateStrategy", stagedUpdateStrategyName, "stage", stage.Name, "stagedUpdateRun", updateRunRef) + return fmt.Errorf("%w: %s", errInitializedFailed, err.Error()) + } + if stage.SortingLabelKey != nil { + // interpret the label values as integers + _, err = strconv.Atoi(cluster.Labels[*stage.SortingLabelKey]) + if err != nil { + sortingKeyErr := fmt.Errorf("the sorting label `%s` on cluster `%s` is not valid", *stage.SortingLabelKey, cluster.Name) + klog.ErrorS(sortingKeyErr, "The sorting label is not an integer", "stagedUpdateStrategy", stagedUpdateStrategyName, "stage", stage.Name, "stagedUpdateRun", updateRunRef) + return fmt.Errorf("%w: %s", errInitializedFailed, sortingKeyErr.Error()) + } + } + curStageClusters = append(curStageClusters, cluster) + allPlacedClusters[cluster.Name] = true + } + } + // Check if the stage has any clusters selected + if len(curStageClusters) == 0 { + err = fmt.Errorf("stage '%s' has no clusters selected", stage.Name) + 
klog.Error(err, "No cluster is selected for the stage", "stagedUpdateStrategy", stagedUpdateStrategyName, "stage", stage.Name, "stagedUpdateRun", updateRunRef) + return fmt.Errorf("%w: %s", errInitializedFailed, err.Error()) + } + // Sort the clusters in the stage based on the SortingLabelKey and cluster name + sort.Slice(curStageClusters, func(i, j int) bool { + if stage.SortingLabelKey == nil { + return curStageClusters[i].Name < curStageClusters[j].Name + } + labelI := curStageClusters[i].Labels[*stage.SortingLabelKey] + labelJ := curStageClusters[j].Labels[*stage.SortingLabelKey] + intI, _ := strconv.Atoi(labelI) + intJ, _ := strconv.Atoi(labelJ) + if intI != intJ { + return intI < intJ + } + return curStageClusters[i].Name < curStageClusters[j].Name + }) + // Record the clusters in the stage status + curSageUpdatingStatus.Clusters = make([]placementv1alpha1.ClusterUpdatingStatus, len(curStageClusters)) + for i, cluster := range curStageClusters { + klog.V(2).InfoS("Add a cluster to stage", "cluster", cluster.Name, "stagedUpdateStrategy", stagedUpdateStrategyName, "stage", stage.Name, "stagedUpdateRun", updateRunRef) + curSageUpdatingStatus.Clusters[i].ClusterName = cluster.Name + } + // create the after stage tasks status array + curSageUpdatingStatus.AfterStageTaskStatus = make([]placementv1alpha1.AfterStageTaskStatus, len(stage.AfterStageTasks)) + // Record the after stage tasks in the stage status + for i, afterStageTask := range stage.AfterStageTasks { + curSageUpdatingStatus.AfterStageTaskStatus[i].Type = afterStageTask.Type + if afterStageTask.Type == placementv1alpha1.AfterStageTaskTypeApproval { + curSageUpdatingStatus.AfterStageTaskStatus[i].ApprovalRequestName = fmt.Sprintf(placementv1alpha1.ApprovalTaskNameFmt, updateRun.Name, stage.Name) + } + } + updateRun.Status.StagesStatus = append(updateRun.Status.StagesStatus, curSageUpdatingStatus) + } + // Check if all the selected clusters are placed in a stage + if len(allPlacedClusters) < len(allSelectedClusters) { + err := fmt.Errorf("some clusters are not placed in any stage") + for cluster := range allSelectedClusters { + if !allPlacedClusters[cluster] { + klog.ErrorS(err, "one cluster is not placed in any stage", "selectedCluster", cluster, "stagedUpdateStrategy", stagedUpdateStrategyName, "stagedUpdateRun", updateRunRef) + r.recorder.Event(updateRun, corev1.EventTypeWarning, "MissingCluster", fmt.Sprintf("The cluster `%s` in not selected in any stage", cluster)) + } + } + return fmt.Errorf("%w: %s", errInitializedFailed, err.Error()) + } + return nil +} + +// validateAfterStageTask validates the afterStageTask in the stage defined in updateRunStrategy. +func validateAfterStageTask(afterStageTasks []placementv1alpha1.AfterStageTask) error { + for _, afterStageTask := range afterStageTasks { + if afterStageTask.Type == placementv1alpha1.AfterStageTaskTypeTimedWait { + if afterStageTask.WaitTime.Duration == 0 { + return fmt.Errorf("the wait task duration is 0") + } + if afterStageTask.WaitTime.Duration < 0 { + return fmt.Errorf("the wait task duration is negative") + } + } + } + return nil +} + +// recordOverrideSnapshots finds all the override snapshots that are associated with each cluster and record them in the ClusterStagedUpdateRun status. +// This is done only once during the initialization. 
+func (r *Reconciler) recordOverrideSnapshots(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun) error { + updateRunRef := klog.KObj(updateRun) + var masterResourceSnapshot placementv1beta1.ClusterResourceSnapshot + if err := r.Get(ctx, types.NamespacedName{Name: updateRun.Spec.ResourceSnapshotIndex}, &masterResourceSnapshot); err != nil { + klog.ErrorS(err, "Failed to get the master resource snapshot", "resourceSnapshot", updateRun.Spec.ResourceSnapshotIndex, "stagedUpdateRun", updateRunRef) + return err + } + if len(masterResourceSnapshot.Annotations[placementv1beta1.ResourceGroupHashAnnotation]) == 0 { + err := fmt.Errorf("the resource snapshot is not a master snapshot") + klog.ErrorS(err, "Failed to get the master resource snapshot", "resourceSnapshot", updateRun.Spec.ResourceSnapshotIndex, "stagedUpdateRun", updateRunRef) + return err + } + // Fetch all the matching overrides which selected the resources + matchedCRO, matchedRO, err := controller.FetchAllMatchOverridesForResourceSnapshot(ctx, r.Client, r.InformerManager, updateRun.Spec.PlacementName, &masterResourceSnapshot) + if err != nil { + klog.ErrorS(err, "Failed to find all matching overrides for the update run", "stagedUpdateRun", updateRunRef, "masterResourceSnapshot", klog.KObj(&masterResourceSnapshot)) + return err + } + // Pick the overrides associated with the target cluster + for _, stageStatus := range updateRun.Status.StagesStatus { + for _, clusterStatus := range stageStatus.Clusters { + // Fetch the override snapshots associated with the cluster + clusterStatus.ClusterResourceOverrideSnapshots, clusterStatus.ResourceOverrideSnapshots, err = + controller.PickFromResourceMatchedOverridesForTargetCluster(ctx, r.Client, clusterStatus.ClusterName, matchedCRO, matchedRO) + if err != nil { + klog.ErrorS(err, "Failed to pick the override snapshots for the cluster", "cluster", clusterStatus.ClusterName, "stagedUpdateRun", updateRunRef, "masterResourceSnapshot", klog.KObj(&masterResourceSnapshot)) + return err + } + } + } + return nil +} + +// recordInitializationFailed records the failed initialization in the ClusterStagedUpdateRun status. +func (r *Reconciler) recordInitializationFailed(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun, message string) error { + meta.SetStatusCondition(&updateRun.Status.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.StagedUpdateRunConditionInitialized), + Status: metav1.ConditionFalse, + ObservedGeneration: updateRun.Generation, + Reason: condition.UpdateRunInitializeFailedReason, + Message: message, + }) + if updateErr := r.Client.Status().Update(ctx, updateRun); updateErr != nil { + klog.ErrorS(updateErr, "Failed to update the ClusterStagedUpdateRun status as failed to initialize", "stagedUpdateRun", klog.KObj(updateRun)) + return updateErr + } + return nil +} diff --git a/pkg/controllers/updaterun/initialization_integration_test.go b/pkg/controllers/updaterun/initialization_integration_test.go new file mode 100644 index 000000000..05421bd89 --- /dev/null +++ b/pkg/controllers/updaterun/initialization_integration_test.go @@ -0,0 +1,733 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. +*/ + +package updaterun + +import ( + "fmt" + "strconv" + "strings" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + + clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" + placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1" + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/utils" + "go.goms.io/fleet/pkg/utils/condition" +) + +const ( + timeout = time.Second * 5 + interval = time.Millisecond * 250 + consistentTimeout = time.Second * 20 + consistentInterval = time.Second * 1 +) + +var ( + cmpOptions = []cmp.Option{ + cmpopts.EquateEmpty(), + cmpopts.IgnoreFields(metav1.ObjectMeta{}, "ResourceVersion"), + cmpopts.SortSlices(func(c1, c2 metav1.Condition) bool { + return c1.Type < c2.Type + }), + cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"), + cmpopts.IgnoreFields(metav1.Condition{}, "Message"), + } +) + +var _ = Describe("Test the ClusterStagedUpdateRun Controller", func() { + var testCRPName string + var crp *placementv1beta1.ClusterResourcePlacement + var testUpdateRunName string + var updateRun *placementv1alpha1.ClusterStagedUpdateRun + var policySnapshot *placementv1beta1.ClusterSchedulingPolicySnapshot + var testUpdateStrategyName string + var updateStrategy *placementv1alpha1.ClusterStagedUpdateStrategy + var masterSnapshot *placementv1beta1.ClusterResourceSnapshot + numberOfClustersAnnotation := 2 + numTargetCluster := 10 // make it easier to sort the clusters by its name + numDeletingCluster := 3 + + BeforeEach(func() { + testCRPName = "crp-" + utils.RandStr() + testUpdateRunName = "updaterun-" + utils.RandStr() + testUpdateStrategyName = "updatestrategy-" + utils.RandStr() + crp = &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + }, + Spec: placementv1beta1.ClusterResourcePlacementSpec{ + ResourceSelectors: []placementv1beta1.ClusterResourceSelector{ + { + Group: "", + Version: "v1", + Kind: "Namespace", + Name: "test-namespace", + }, + }, + Strategy: placementv1beta1.RolloutStrategy{ + ApplyStrategy: &placementv1beta1.ApplyStrategy{ + Type: placementv1beta1.ApplyStrategyTypeReportDiff, + WhenToTakeOver: placementv1beta1.WhenToTakeOverTypeIfNoDiff, + }, + }, + }, + } + policySnapshot = &placementv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(placementv1beta1.PolicySnapshotNameFmt, testCRPName, 2), + Labels: map[string]string{ + placementv1beta1.CRPTrackingLabel: testCRPName, + placementv1beta1.IsLatestSnapshotLabel: "true", + }, + }, + + Spec: placementv1beta1.SchedulingPolicySnapshotSpec{ + PolicyHash: []byte("hash"), + }, + } + updateStrategy = &placementv1alpha1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: testUpdateStrategyName, + Namespace: "default", + }, + } + // create the resourcesnapshot + masterSnapshot = generateResourceSnapshot(testCRPName, 1, true) + Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed()) + updateRun = &placementv1alpha1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: testUpdateRunName, + Namespace: "default", + }, + Spec: placementv1alpha1.StagedUpdateRunSpec{ + PlacementName: testCRPName, + ResourceSnapshotIndex: masterSnapshot.Name, + StagedUpdateStrategyName: testUpdateStrategyName, + }, + } + }) + + AfterEach(func() { + By("Deleting ClusterStagedUpdateRun") + Expect(k8sClient.Delete(ctx, updateRun)).Should(SatisfyAny(Succeed(), 
utils.NotFoundMatcher{})) + updateRun = nil + By("Deleting StagedUpdateStrategy") + Expect(k8sClient.Delete(ctx, updateStrategy)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + updateStrategy = nil + By("Deleting ClusterResourcePlacement") + Expect(k8sClient.Delete(ctx, crp)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + crp = nil + By("Deleting ClusterResourceSnapshot") + Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + masterSnapshot = nil + By("Deleting ClusterSchedulingPolicySnapshot") + Expect(k8sClient.Delete(ctx, policySnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + policySnapshot = nil + }) + + Describe("Test the validateCRP function", func() { + It("Should fail validation for non-existent CRP", func() { + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + // we didn't create crp, so it should fail validation + Eventually(func() error { + return verifyFailedInitCondition(updateRun, "Parent placement not found") + }, timeout, interval).Should(Succeed()) + }) + + It("Should fail validation for CRP without external rollout strategy", func() { + // Create a CRP object without an external rollout strategy + crp.Spec.Strategy.Type = placementv1beta1.RollingUpdateRolloutStrategyType + Expect(k8sClient.Create(ctx, crp)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + return verifyFailedInitCondition(updateRun, "The ClusterResourcePlacement does not have an external rollout strategy") + }, timeout, interval).Should(Succeed()) + }) + }) + + Describe("Test the initialize function", func() { + BeforeEach(func() { + crp.Spec.Strategy.Type = placementv1beta1.ExternalRolloutStrategyType + Expect(k8sClient.Create(ctx, crp)).Should(Succeed()) + }) + + It("Should fail validation for non-existent latest policySnapshot", func() { + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + // we didn't create the policy snapshot, so it should fail validation + Eventually(func() error { + return verifyFailedInitCondition(updateRun, "no latest policy snapshot associated with cluster resource placement") + }, timeout, interval).Should(Succeed()) + }) + + It("Should fail validation for latest policySnapshot without node count annotation", func() { + // create the policy snapshot without the number of node annotation + Expect(k8sClient.Create(ctx, policySnapshot)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + return verifyFailedInitCondition(updateRun, "doesn't have node count annotation") + }, timeout, interval).Should(Succeed()) + }) + + It("Should fail validation for not scheduled policySnapshot", func() { + // create the policy snapshot with the number of node annotation + policySnapshot.Annotations = map[string]string{ + placementv1beta1.NumberOfClustersAnnotation: strconv.Itoa(numberOfClustersAnnotation), + } + Expect(k8sClient.Create(ctx, policySnapshot)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + return verifyFailedInitCondition(updateRun, "policy snapshot not fully scheduled yet") + }, timeout, interval).Should(Succeed()) + }) + + It("Should fail validation for no selected binding", func() { + // create the policy snapshot with the number of node annotation and mark it as scheduled but no binding + policySnapshot.Annotations = map[string]string{ + placementv1beta1.NumberOfClustersAnnotation: 
strconv.Itoa(numberOfClustersAnnotation), + } + Expect(k8sClient.Create(ctx, policySnapshot)).Should(Succeed()) + meta.SetStatusCondition(&policySnapshot.Status.Conditions, metav1.Condition{ + Type: string(placementv1beta1.PolicySnapshotScheduled), + Status: metav1.ConditionTrue, + ObservedGeneration: policySnapshot.Generation, + Reason: "scheduled", + }) + Expect(k8sClient.Status().Update(ctx, policySnapshot)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + return verifyFailedInitCondition(updateRun, "no scheduled bindings found for the policy snapshot") + }, timeout, interval).Should(Succeed()) + By("Deleting policySnapshot") + Expect(k8sClient.Delete(ctx, policySnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + }) + + Describe("Test the stage initialization", func() { + var bindings []*placementv1beta1.ClusterResourceBinding + var memberClusters []*clusterv1beta1.MemberCluster + BeforeEach(func() { + // create the policy snapshot with the number of node annotation and mark it as scheduled + policySnapshot.Annotations = map[string]string{ + placementv1beta1.NumberOfClustersAnnotation: strconv.Itoa(numberOfClustersAnnotation), + } + Expect(k8sClient.Create(ctx, policySnapshot)).Should(Succeed()) + meta.SetStatusCondition(&policySnapshot.Status.Conditions, metav1.Condition{ + Type: string(placementv1beta1.PolicySnapshotScheduled), + Status: metav1.ConditionTrue, + ObservedGeneration: policySnapshot.Generation, + Reason: "scheduled", + }) + Expect(k8sClient.Status().Update(ctx, policySnapshot)).Should(Succeed()) + clusters := make([]string, numTargetCluster) + deletingClusters := make([]string, numDeletingCluster) + // create scheduled bindings for master snapshot on target clusters + for i := 0; i < numTargetCluster; i++ { + clusters[i] = "cluster-" + strconv.Itoa(i) + binding := generateClusterResourceBinding(placementv1beta1.BindingStateScheduled, testCRPName, masterSnapshot.Name, policySnapshot.Name, clusters[i]) + Expect(k8sClient.Create(ctx, binding)).Should(Succeed()) + By(fmt.Sprintf("target resource binding %s created", binding.Name)) + bindings = append(bindings, binding) + mc := generateCluster(clusters[i], map[string]string{"group": "prod"}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + } + // create unscheduled bindings for master snapshot on target clusters + for i := 0; i < numDeletingCluster; i++ { + deletingClusters[i] = "deleting-cluster-" + strconv.Itoa(i) + binding := generateDeletingClusterResourceBinding(testCRPName, deletingClusters[i]) + Expect(k8sClient.Create(ctx, binding)).Should(Succeed()) + By(fmt.Sprintf("deleting resource binding %s created", binding.Name)) + bindings = append(bindings, binding) + mc := generateCluster(deletingClusters[i], map[string]string{"group": "staging"}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + } + }) + + AfterEach(func() { + By("Deleting ClusterResourceBindings") + for _, binding := range bindings { + Expect(k8sClient.Delete(ctx, binding)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + } + bindings = nil + By("Deleting MemberClusters") + for _, cluster := range memberClusters { + Expect(k8sClient.Delete(ctx, cluster)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + } + memberClusters = nil + }) + + It("Should fail 
validation for no staged update strategy", func() { + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + return verifyFailedInitCondition(updateRun, "referenced update strategy not found") + }, timeout, interval).Should(Succeed()) + }) + + It("Should fail validation for no cluster selected in stage", func() { + emptyStageName := "emptyStage" + updateStrategy.Spec = placementv1alpha1.StagedUpdateStrategySpec{ + Stages: []placementv1alpha1.StageConfig{ + { + Name: "duplicateStage1", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "prod", + }, + }, + }, + { + Name: emptyStageName, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "non-existent", + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, updateStrategy)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + errMsg := fmt.Sprintf("stage '%s' has no clusters selected", emptyStageName) + return verifyFailedInitCondition(updateRun, errMsg) + }, timeout, interval).Should(Succeed()) + }) + + It("Should fail validation for one cluster selected in two stages", func() { + mc := memberClusters[0] + mc.Labels = map[string]string{"app": "test", "group": "prod"} + Expect(k8sClient.Update(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s updated with more label", mc.Name)) + memberClusters = append(memberClusters, mc) + updateStrategy.Spec = placementv1alpha1.StagedUpdateStrategySpec{ + Stages: []placementv1alpha1.StageConfig{ + { + Name: "stage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "prod", + }, + }, + }, + { + Name: "duplicateStage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "test", + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, updateStrategy)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + errMsg := fmt.Sprintf("cluster `%s` appears in more than one stage", mc.Name) + return verifyFailedInitCondition(updateRun, errMsg) + }, timeout, interval).Should(Succeed()) + }) + + It("Should fail validation for cluster not selected by any stage", func() { + // create an extra binding that is not selected by the stage + binding := generateClusterResourceBinding(placementv1beta1.BindingStateScheduled, testCRPName, masterSnapshot.Name, policySnapshot.Name, "extra-cluster") + Expect(k8sClient.Create(ctx, binding)).Should(Succeed()) + By(fmt.Sprintf("target resource binding %s created", binding.Name)) + bindings = append(bindings, binding) + // create a strategy with a stage that doesn't select the extra cluster + updateStrategy.Spec = placementv1alpha1.StagedUpdateStrategySpec{ + Stages: []placementv1alpha1.StageConfig{ + { + Name: "partialStage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "prod", + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, updateStrategy)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + return verifyFailedInitCondition(updateRun, "some clusters are not placed in any stage") + }, timeout, interval).Should(Succeed()) + }) + + It("Should fail validation for cluster not having the sorting key", func() { + sortingKey := "order" + // create a strategy with a stage that has a sorting key not exist in the cluster + updateStrategy.Spec = placementv1alpha1.StagedUpdateStrategySpec{ + Stages: 
[]placementv1alpha1.StageConfig{ + { + Name: "partialStage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "prod", + }, + }, + SortingLabelKey: &sortingKey, + }, + }, + } + Expect(k8sClient.Create(ctx, updateStrategy)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + Eventually(func() error { + errMsg := fmt.Sprintf("the sorting label `%s` on cluster", sortingKey) + return verifyFailedInitCondition(updateRun, errMsg) + }, timeout, interval).Should(Succeed()) + }) + + It("Should pass validation with correct stage status with not selected clusters", func() { + mc := generateCluster("extra-cluster", map[string]string{"group": "prod"}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + // create a strategy with a stage that doesn't select the extra cluster + updateStrategy.Spec = placementv1alpha1.StagedUpdateStrategySpec{ + Stages: []placementv1alpha1.StageConfig{ + { + Name: "prodSage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "prod", + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, updateStrategy)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + stagesStatus := []placementv1alpha1.StageUpdatingStatus{ + { + StageName: updateStrategy.Spec.Stages[0].Name, + }, + } + deletionStageStatus := &placementv1alpha1.StageUpdatingStatus{ + StageName: placementv1alpha1.UpdateRunDeleteStageName, + } + i := 0 + // add the first numTargetCluster clusters to the first stage + for i = 0; i < numTargetCluster; i++ { + stagesStatus[0].Clusters = append(stagesStatus[0].Clusters, placementv1alpha1.ClusterUpdatingStatus{ + ClusterName: memberClusters[i].Name, + }) + } + // add the rest of the cluster to the deletion stage + for ; i < numTargetCluster+numDeletingCluster; i++ { + deletionStageStatus.Clusters = append(deletionStageStatus.Clusters, placementv1alpha1.ClusterUpdatingStatus{ + ClusterName: memberClusters[i].Name, + }) + } + Eventually(func() error { + return validateSuccessfulInitStatus(updateRun, policySnapshot.Name, numberOfClustersAnnotation, crp.Spec.Strategy.ApplyStrategy, &updateStrategy.Spec, stagesStatus, deletionStageStatus) + }, timeout, interval).Should(Succeed()) + }) + + It("Should pass validation with correct stage status with multiple stages clusters", func() { + // create extra scheduled bindings for master snapshot on target clusters as canary group + numCanaryCluster := 2 + for i := numTargetCluster; i < numTargetCluster+numCanaryCluster; i++ { + clusterName := "cluster-" + strconv.Itoa(i) + binding := generateClusterResourceBinding(placementv1beta1.BindingStateScheduled, testCRPName, masterSnapshot.Name, policySnapshot.Name, clusterName) + Expect(k8sClient.Create(ctx, binding)).Should(Succeed()) + By(fmt.Sprintf("target resource binding %s created", binding.Name)) + bindings = append(bindings, binding) + mc := generateCluster(clusterName, map[string]string{"group": "canary"}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + } + // generate some not selected clusters + mc := generateCluster("extra-prod-cluster", map[string]string{"group": "prod"}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + mc = 
generateCluster("extra-canary-cluster", map[string]string{"group": "canary"}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + // create a strategy with a stage that doesn't select the extra cluster + updateStrategy.Spec = placementv1alpha1.StagedUpdateStrategySpec{ + Stages: []placementv1alpha1.StageConfig{ + { + Name: "canaryStage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "canary", + }, + }, + }, + { + Name: "prodStage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "prod", + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, updateStrategy)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + stagesStatus := []placementv1alpha1.StageUpdatingStatus{ + { + StageName: updateStrategy.Spec.Stages[0].Name, + }, + { + StageName: updateStrategy.Spec.Stages[1].Name, + }, + } + deletionStageStatus := &placementv1alpha1.StageUpdatingStatus{ + StageName: placementv1alpha1.UpdateRunDeleteStageName, + } + i := 0 + // add the first numTargetCluster clusters to the prod stage + for i = 0; i < numTargetCluster; i++ { + stagesStatus[1].Clusters = append(stagesStatus[1].Clusters, placementv1alpha1.ClusterUpdatingStatus{ + ClusterName: memberClusters[i].Name, + }) + } + // add the next batch of the cluster to the deletion stage + for ; i < numTargetCluster+numDeletingCluster; i++ { + deletionStageStatus.Clusters = append(deletionStageStatus.Clusters, placementv1alpha1.ClusterUpdatingStatus{ + ClusterName: memberClusters[i].Name, + }) + } + // add the rest of the cluster to the canary stage + for ; i < numTargetCluster+numDeletingCluster+numCanaryCluster; i++ { + stagesStatus[0].Clusters = append(stagesStatus[0].Clusters, placementv1alpha1.ClusterUpdatingStatus{ + ClusterName: memberClusters[i].Name, + }) + } + Eventually(func() error { + return validateSuccessfulInitStatus(updateRun, policySnapshot.Name, numberOfClustersAnnotation, crp.Spec.Strategy.ApplyStrategy, &updateStrategy.Spec, stagesStatus, deletionStageStatus) + }, timeout, interval).Should(Succeed()) + }) + + It("Should pass validation with correct stage status with sorting key", func() { + // create extra scheduled bindings for master snapshot on target clusters as canary group + numCanaryCluster := 6 + sortingKey := "order" + for i := numTargetCluster; i < numTargetCluster+numCanaryCluster; i++ { + clusterName := "cluster-" + strconv.Itoa(i) + binding := generateClusterResourceBinding(placementv1beta1.BindingStateScheduled, testCRPName, masterSnapshot.Name, policySnapshot.Name, clusterName) + Expect(k8sClient.Create(ctx, binding)).Should(Succeed()) + By(fmt.Sprintf("target resource binding %s created", binding.Name)) + bindings = append(bindings, binding) + // set the order key reverse of the name so that the order is different from the name + mc := generateCluster(clusterName, map[string]string{"group": "canary", sortingKey: strconv.Itoa(numTargetCluster + numCanaryCluster - i)}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + } + // generate some not selected clusters + mc := generateCluster("extra-prod-cluster2", map[string]string{"group": "prod"}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + mc = 
generateCluster("extra-canary-cluster3", map[string]string{"group": "canary"}) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed()) + By(fmt.Sprintf("member cluster %s created", mc.Name)) + memberClusters = append(memberClusters, mc) + // create a strategy with a stage that doesn't select the extra cluster + updateStrategy.Spec = placementv1alpha1.StagedUpdateStrategySpec{ + Stages: []placementv1alpha1.StageConfig{ + { + Name: "canaryStage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "canary", + }, + }, + SortingLabelKey: &sortingKey, + }, + { + Name: "prodStage", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "group": "prod", + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, updateStrategy)).Should(Succeed()) + Expect(k8sClient.Create(ctx, updateRun)).Should(Succeed()) + stagesStatus := []placementv1alpha1.StageUpdatingStatus{ + { + StageName: updateStrategy.Spec.Stages[0].Name, + }, + { + StageName: updateStrategy.Spec.Stages[1].Name, + }, + } + deletionStageStatus := &placementv1alpha1.StageUpdatingStatus{ + StageName: placementv1alpha1.UpdateRunDeleteStageName, + } + i := 0 + // add the first numTargetCluster clusters to the prod stage + for i = 0; i < numTargetCluster; i++ { + stagesStatus[1].Clusters = append(stagesStatus[1].Clusters, placementv1alpha1.ClusterUpdatingStatus{ + ClusterName: memberClusters[i].Name, + }) + } + // add the next batch of the cluster to the deletion stage + for ; i < numTargetCluster+numDeletingCluster; i++ { + deletionStageStatus.Clusters = append(deletionStageStatus.Clusters, placementv1alpha1.ClusterUpdatingStatus{ + ClusterName: memberClusters[i].Name, + }) + } + // add the rest of the cluster to the canary stage in the reverse order as the sorting key is reversed + for i = numTargetCluster + numDeletingCluster + numCanaryCluster - 1; i >= numTargetCluster+numDeletingCluster; i-- { + stagesStatus[0].Clusters = append(stagesStatus[0].Clusters, placementv1alpha1.ClusterUpdatingStatus{ + ClusterName: memberClusters[i].Name, + }) + } + Eventually(func() error { + return validateSuccessfulInitStatus(updateRun, policySnapshot.Name, numberOfClustersAnnotation, crp.Spec.Strategy.ApplyStrategy, &updateStrategy.Spec, stagesStatus, deletionStageStatus) + }, timeout, interval).Should(Succeed()) + }) + }) + }) +}) + +func verifyFailedInitCondition(updateRun *placementv1alpha1.ClusterStagedUpdateRun, message string) error { + var latestUpdateRun placementv1alpha1.ClusterStagedUpdateRun + if err := k8sClient.Get(ctx, types.NamespacedName{Name: updateRun.Name, Namespace: updateRun.Namespace}, &latestUpdateRun); err != nil { + return err + } + expectedCondition := []metav1.Condition{ + { + Type: string(placementv1alpha1.StagedUpdateRunConditionInitialized), + Status: metav1.ConditionFalse, + ObservedGeneration: latestUpdateRun.Generation, + Reason: condition.UpdateRunInitializeFailedReason, + }, + } + diff := cmp.Diff(expectedCondition, latestUpdateRun.Status.Conditions, cmpOptions...) 
+ if diff != "" { + return fmt.Errorf("condition mismatch (-want +got):\n%s", diff) + } + initializedCondition := meta.FindStatusCondition(latestUpdateRun.Status.Conditions, string(placementv1alpha1.StagedUpdateRunConditionInitialized)) + if !strings.Contains(initializedCondition.Message, message) { + return fmt.Errorf("message mismatch: %s", initializedCondition.Message) + } + return nil +} + +func validateSuccessfulInitStatus(updateRun *placementv1alpha1.ClusterStagedUpdateRun, policySnapshotIndexUsed string, + policyObservedClusterCount int, applyStrategy *placementv1beta1.ApplyStrategy, stagedUpdateStrategySnapshot *placementv1alpha1.StagedUpdateStrategySpec, + stagesStatus []placementv1alpha1.StageUpdatingStatus, deletionStageStatus *placementv1alpha1.StageUpdatingStatus) error { + var latestUpdateRun placementv1alpha1.ClusterStagedUpdateRun + if err := k8sClient.Get(ctx, types.NamespacedName{Name: updateRun.Name, Namespace: updateRun.Namespace}, &latestUpdateRun); err != nil { + return err + } + expectedStatus := placementv1alpha1.StagedUpdateRunStatus{ + PolicySnapshotIndexUsed: policySnapshotIndexUsed, + PolicyObservedClusterCount: policyObservedClusterCount, + ApplyStrategy: applyStrategy, + StagedUpdateStrategySnapshot: stagedUpdateStrategySnapshot, + StagesStatus: stagesStatus, + DeletionStageStatus: deletionStageStatus, + Conditions: []metav1.Condition{ + { + Type: string(placementv1alpha1.StagedUpdateRunConditionInitialized), + Status: metav1.ConditionTrue, + ObservedGeneration: latestUpdateRun.Generation, + Reason: condition.UpdateRunInitializeSucceededReason, + }, + }, + } + diff := cmp.Diff(expectedStatus, latestUpdateRun.Status, cmpOptions...) + if diff != "" { + return fmt.Errorf("condition mismatch (-want +got):\n%s", diff) + } + return nil +} + +func generateCluster(clusterName string, labels map[string]string) *clusterv1beta1.MemberCluster { + return &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Labels: labels, + }, + Spec: clusterv1beta1.MemberClusterSpec{ + Identity: rbacv1.Subject{ + Name: "testUser", + Kind: "ServiceAccount", + Namespace: utils.FleetSystemNamespace, + }, + HeartbeatPeriodSeconds: 60, + }, + } +} + +func generateClusterResourceBinding(state placementv1beta1.BindingState, testCRPName, resourceSnapshotName, + policySnapshotName, targetCluster string) *placementv1beta1.ClusterResourceBinding { + binding := &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "binding-" + resourceSnapshotName + "-" + targetCluster, + Labels: map[string]string{ + placementv1beta1.CRPTrackingLabel: testCRPName, + }, + }, + Spec: placementv1beta1.ResourceBindingSpec{ + State: state, + TargetCluster: targetCluster, + SchedulingPolicySnapshotName: policySnapshotName, + }, + } + if binding.Spec.State == placementv1beta1.BindingStateBound { + binding.Spec.ResourceSnapshotName = resourceSnapshotName + } + return binding +} + +func generateResourceSnapshot(testCRPName string, resourceIndex int, isLatest bool) *placementv1beta1.ClusterResourceSnapshot { + clusterResourceSnapshot := &placementv1beta1.ClusterResourceSnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(placementv1beta1.ResourceSnapshotNameFmt, testCRPName, resourceIndex), + Labels: map[string]string{ + placementv1beta1.CRPTrackingLabel: testCRPName, + placementv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(isLatest), + }, + Annotations: map[string]string{ + placementv1beta1.ResourceGroupHashAnnotation: "hash", + }, + }, + } + 
rawContents := [][]byte{ + testResourceCRD, testNameSpace, testResource, testConfigMap, testDeployment, testService, + } + for _, rawContent := range rawContents { + clusterResourceSnapshot.Spec.SelectedResources = append(clusterResourceSnapshot.Spec.SelectedResources, + placementv1beta1.ResourceContent{ + RawExtension: runtime.RawExtension{Raw: rawContent}, + }, + ) + } + return clusterResourceSnapshot +} + +func generateDeletingClusterResourceBinding(testCRPName, targetCluster string) *placementv1beta1.ClusterResourceBinding { + binding := generateClusterResourceBinding(placementv1beta1.BindingStateUnscheduled, testCRPName, "resourcesnapshotname", "policysnapshotname", targetCluster) + binding.DeletionTimestamp = &metav1.Time{ + Time: time.Now(), + } + return binding +} diff --git a/pkg/controllers/updaterun/suite_test.go b/pkg/controllers/updaterun/suite_test.go new file mode 100644 index 000000000..8ab7f5a9c --- /dev/null +++ b/pkg/controllers/updaterun/suite_test.go @@ -0,0 +1,188 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. +*/ + +package updaterun + +import ( + "context" + "encoding/json" + "flag" + "os" + "path/filepath" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/yaml" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" + "k8s.io/klog/v2/textlogger" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" + placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1" + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/test/utils/informer" +) + +var ( + cfg *rest.Config + mgr manager.Manager + k8sClient client.Client + testEnv *envtest.Environment + ctx context.Context + cancel context.CancelFunc + + // pre loaded test manifests + testResourceCRD, testNameSpace, testResource, testConfigMap, testDeployment, testService []byte +) + +func TestAPIs(t *testing.T) { + RegisterFailHandler(Fail) + + RunSpecs(t, "Work generator Controller Suite") +} + +var _ = BeforeSuite(func() { + ctx, cancel = context.WithCancel(context.TODO()) + + By("Setup klog") + var err error + fs := flag.NewFlagSet("klog", flag.ContinueOnError) + klog.InitFlags(fs) + Expect(fs.Parse([]string{"--v", "5", "-add_dir_header", "true"})).Should(Succeed()) + + // load test manifests + readTestManifests() + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("../../../", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + } + cfg, err = testEnv.Start() + Expect(err).Should(Succeed()) + Expect(cfg).NotTo(BeNil()) + + //+kubebuilder:scaffold:scheme + By("Set all the customized scheme") + Expect(placementv1beta1.AddToScheme(scheme.Scheme)).Should(Succeed()) + Expect(placementv1alpha1.AddToScheme(scheme.Scheme)).Should(Succeed()) + Expect(clusterv1beta1.AddToScheme(scheme.Scheme)).Should(Succeed()) + + By("starting the controller manager") + klog.InitFlags(flag.CommandLine) + flag.Parse() + + mgr, err = ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme.Scheme, + Metrics: metricsserver.Options{ + BindAddress: "0", + }, + Logger: 
textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + }) + Expect(err).Should(Succeed()) + + // make sure the k8s client is same as the controller client, or we can have cache delay + By("set k8s client same as the controller manager") + k8sClient = mgr.GetClient() + + // setup our main reconciler + fakeInformer := &informer.FakeManager{ + APIResources: map[schema.GroupVersionKind]bool{ + { + Group: "", + Version: "v1", + Kind: "Service", + }: true, + { + Group: "", + Version: "v1", + Kind: "Deployment", + }: true, + { + Group: "", + Version: "v1", + Kind: "Secret", + }: true, + }, + IsClusterScopedResource: false, + } + err = (&Reconciler{ + Client: k8sClient, + InformerManager: fakeInformer, + }).SetupWithManager(mgr) + Expect(err).Should(Succeed()) + + go func() { + defer GinkgoRecover() + err = mgr.Start(ctx) + Expect(err).Should(Succeed(), "failed to run manager") + }() +}) + +var _ = AfterSuite(func() { + defer klog.Flush() + + cancel() + By("tearing down the test environment") + err := testEnv.Stop() + Expect(err).Should(Succeed()) +}) + +func readTestManifests() { + By("Read testResource CRD") + rawByte, err := os.ReadFile("../../../test/manifests/test_testresources_crd.yaml") + Expect(err).Should(Succeed()) + testResourceCRD, err = yaml.ToJSON(rawByte) + Expect(err).Should(Succeed()) + + By("Read testResource CR") + rawByte, err = os.ReadFile("../../../test/manifests/test-resource.yaml") + Expect(err).Should(Succeed()) + testResource, err = yaml.ToJSON(rawByte) + Expect(err).Should(Succeed()) + + By("Read testConfigMap resource") + rawByte, err = os.ReadFile("../../../test/e2e/resources/test-configmap.yaml") + Expect(err).Should(Succeed()) + testConfigMap, err = yaml.ToJSON(rawByte) + Expect(err).Should(Succeed()) + + By("Read testDeployment") + rawByte, err = os.ReadFile("../../../test/e2e/resources/test-deployment.yaml") + Expect(err).Should(Succeed()) + testDeployment, err = yaml.ToJSON(rawByte) + Expect(err).Should(Succeed()) + + By("Read testService") + rawByte, err = os.ReadFile("../../../test/e2e/resources/test-service.yaml") + Expect(err).Should(Succeed()) + testService, err = yaml.ToJSON(rawByte) + Expect(err).Should(Succeed()) + + By("Create testNameSpace") + testNameSpace, err = json.Marshal(corev1.Namespace{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Namespace", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "app", + Labels: map[string]string{ + "fleet.azure.com/name": "test", + }, + }, + }) + Expect(err).Should(Succeed()) +} diff --git a/pkg/controllers/updaterun/validating.go b/pkg/controllers/updaterun/validating.go new file mode 100644 index 000000000..2c02640ff --- /dev/null +++ b/pkg/controllers/updaterun/validating.go @@ -0,0 +1,246 @@ +package updaterun + +import ( + "context" + "fmt" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + + placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1" + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/utils/condition" + "go.goms.io/fleet/pkg/utils/controller" +) + +var errStagedUpdatedAborted = fmt.Errorf("can not continue the StagedUpdateRun") + +// validateUpdateRunStatus validates the stagedUpdateRun status and ensures the update can be continued. +// The function returns the index of the stage that is updating, the list of clusters that are scheduled to be deleted. +// if the updating stage index is -1, it means all stages are finished, and the updateRun should be marked as finished. 
+// if the updating stage index is 0, the next stage to be updated will be the first stage. +// if the updating stage index is len(updateRun.Status.StagesStatus), the next stage to be updated will be the delete stage. +func (r *Reconciler) validateUpdateRunStatus(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun) (int, []*placementv1beta1.ClusterResourceBinding, []*placementv1beta1.ClusterResourceBinding, error) { + // some of the validating function changes the object, so we need to make a copy of the object + updateRunRef := klog.KObj(updateRun) + updateRunCopy := updateRun.DeepCopy() + klog.V(2).InfoS("start to validate the stage update run", "stagedUpdateRun", updateRunRef) + // Validate the ClusterResourcePlacement object referenced by the ClusterStagedUpdateRun + placementName, err := r.validateCRP(ctx, updateRunCopy) + if err != nil { + return -1, nil, nil, err + } + // Record the latest policy snapshot associated with the ClusterResourcePlacement + latestPolicySnapshot, nodeCount, err := r.determinePolicySnapshot(ctx, placementName, updateRunCopy) + if err != nil { + return -1, nil, nil, err + } + // make sure the policy snapshot index used in the stagedUpdateRun is still valid + if updateRun.Status.PolicySnapshotIndexUsed != latestPolicySnapshot.Name { + misMatchErr := fmt.Errorf("the policy snapshot index used in the stagedUpdateRun is outdated, latest: %s, existing: %s", latestPolicySnapshot.Name, updateRun.Status.PolicySnapshotIndexUsed) + klog.ErrorS(misMatchErr, "there is a new latest policy snapshot", "clusterResourcePlacement", placementName, "stagedUpdateRun", updateRunRef) + return -1, nil, nil, fmt.Errorf("%w: %s", errStagedUpdatedAborted, misMatchErr.Error()) + } + // make sure the node count used in the stagedUpdateRun has not changed + if updateRun.Status.PolicyObservedClusterCount != nodeCount { + misMatchErr := fmt.Errorf("the node count used in the stagedUpdateRun is outdated, latest: %d, existing: %d", nodeCount, updateRun.Status.PolicyObservedClusterCount) + klog.ErrorS(misMatchErr, "The pick N node count has changed", "clusterResourcePlacement", placementName, "stagedUpdateRun", updateRunRef) + return -1, nil, nil, fmt.Errorf("%w: %s", errStagedUpdatedAborted, misMatchErr.Error()) + } + // Collect the scheduled clusters by the corresponding ClusterResourcePlacement with the latest policy snapshot + scheduledBinding, tobeDeleted, err := r.collectScheduledClusters(ctx, placementName, latestPolicySnapshot, updateRunCopy) + if err != nil { + return -1, nil, nil, err + } + // validate the applyStrategy and stagedUpdateStrategySnapshot + if updateRun.Status.ApplyStrategy == nil { + missingErr := fmt.Errorf("the updateRun has no applyStrategy") + klog.ErrorS(controller.NewUnexpectedBehaviorError(missingErr), "Failed to find the applyStrategy", "clusterResourcePlacement", placementName, "stagedUpdateRun", updateRunRef) + return -1, nil, nil, fmt.Errorf("%w: %s", errStagedUpdatedAborted, missingErr.Error()) + } + if updateRun.Status.StagedUpdateStrategySnapshot == nil { + missingErr := fmt.Errorf("the updateRun has no stagedUpdateStrategySnapshot") + klog.ErrorS(controller.NewUnexpectedBehaviorError(missingErr), "Failed to find the stagedUpdateStrategySnapshot", "clusterResourcePlacement", placementName, "stagedUpdateRun", updateRunRef) + return -1, nil, nil, fmt.Errorf("%w: %s", errStagedUpdatedAborted, missingErr.Error()) + } + if condition.IsConditionStatusFalse(meta.FindStatusCondition(updateRun.Status.Conditions, 
string(placementv1alpha1.StagedUpdateRunConditionProgressing)), updateRun.Generation) { + // the updateRun has not started + klog.V(2).InfoS("start the stage update run from the beginning", "stagedUpdateRun", updateRunRef) + return 0, scheduledBinding, tobeDeleted, nil + } + // validate the stageStatus and deleteStageStatus in the updateRun + updatingStageIndex, err := r.validateStagesStatus(ctx, scheduledBinding, updateRun) + if err != nil { + return -1, nil, nil, err + } + return updatingStageIndex, scheduledBinding, tobeDeleted, err +} + +// validateStagesStatus validates the both the update and delete stage in the updateRun. +// The function returns the stage index that is updating, and any error that is encountered. +// if the updating stage index is -1, it means all stages are finished, and the updateRun should be marked as finished. +// if the updating stage index is 0, the next stage to be updated will be the first stage. +// if the updating stage index is len(updateRun.Status.StagesStatus), the next stage to be updated will be the delete stage. +func (r *Reconciler) validateStagesStatus(ctx context.Context, scheduledBinding []*placementv1beta1.ClusterResourceBinding, updateRun *placementv1alpha1.ClusterStagedUpdateRun) (int, error) { + // take a copy of the existing updateRun + existingStageStatus := updateRun.Status.StagesStatus + existingDeleteStageStatus := updateRun.Status.DeletionStageStatus + updateRunCopy := updateRun.DeepCopy() + updateRunRef := klog.KObj(updateRun) + // compute the stage status which does not include the delete stage + if err := r.computeRunStageStatus(ctx, scheduledBinding, updateRunCopy); err != nil { + return -1, err + } + // validate the stages in the updateRun and return the updating stage index + updatingStageIndex, lastFinishedStageIndex, validateErr := validateUpdateStageStatus(existingStageStatus, updateRunCopy) + if validateErr != nil { + return -1, validateErr + } + deleteStageFinishedCond := meta.FindStatusCondition(existingDeleteStageStatus.Conditions, string(placementv1alpha1.StageUpdatingConditionSucceeded)) + deleteStageProgressingCond := meta.FindStatusCondition(existingDeleteStageStatus.Conditions, string(placementv1alpha1.StageUpdatingConditionProgressing)) + // check if the there is any active updating stage + if updatingStageIndex != -1 || lastFinishedStageIndex < len(existingDeleteStageStatus.Clusters) { + // there are still stages updating before the delete staging, make sure the delete stage is not active/finished + if condition.IsConditionStatusTrue(deleteStageFinishedCond, updateRun.Generation) || condition.IsConditionStatusFalse(deleteStageFinishedCond, updateRun.Generation) || + condition.IsConditionStatusTrue(deleteStageProgressingCond, updateRun.Generation) { + updateErr := fmt.Errorf("the delete stage is active, but there are still stages updating, updatingStageIndex: %d, lastFinishedStageIndex: %d", updatingStageIndex, lastFinishedStageIndex) + klog.ErrorS(controller.NewUnexpectedBehaviorError(updateErr), "There are more than one stage active", "stagedUpdateRun", updateRunRef) + return -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, updateErr.Error()) + } + // if no stage is updating, continue from the last finished stage (which will result it start from 0) + if updatingStageIndex == -1 { + updatingStageIndex = lastFinishedStageIndex + 1 + } + return updatingStageIndex, nil + } + klog.InfoS("All stages are finished, continue from the delete stage", "stagedUpdateRun", updateRunRef) + // check if the delete stage has 
finished successfully + if condition.IsConditionStatusTrue(deleteStageFinishedCond, updateRun.Generation) { + klog.InfoS("The delete stage has finished successfully, no more stage to update", "stagedUpdateRun", updateRunRef) + return -1, nil + } + // check if the delete stage has failed + if condition.IsConditionStatusFalse(deleteStageFinishedCond, updateRun.Generation) { + failedErr := fmt.Errorf("the delete stage has failed, err: %s", deleteStageFinishedCond.Message) + klog.ErrorS(failedErr, "The delete stage has failed", "stageCond", deleteStageFinishedCond, "stagedUpdateRun", updateRunRef) + return -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, failedErr.Error()) + } + if condition.IsConditionStatusTrue(deleteStageProgressingCond, updateRun.Generation) { + klog.InfoS("The delete stage is updating", "stagedUpdateRun", updateRunRef) + return len(existingDeleteStageStatus.Clusters), nil + } + // all stages are finished but the delete stage is not active or finished + updateErr := fmt.Errorf("the delete stage is not active, but all stages finished, updatingStageIndex: %d, lastFinishedStageIndex: %d", updatingStageIndex, lastFinishedStageIndex) + klog.ErrorS(controller.NewUnexpectedBehaviorError(updateErr), "There is no stage active", "stagedUpdateRun", updateRunRef) + return -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, updateErr.Error()) +} + +// validateUpdateStageStatus is a helper function to validate the updating stages in the updateRun. +// It compares the existing stage status with the latest list of clusters to be updated. +// It returns the index of the updating stage, the index of the last finished stage, and any error that is encountered. +func validateUpdateStageStatus(existingStageStatus []placementv1alpha1.StageUpdatingStatus, updateRun *placementv1alpha1.ClusterStagedUpdateRun) (int, int, error) { + updatingStageIndex := -1 + lastFinishedStageIndex := -1 + // remember the newly computed stage status + newStageStatus := updateRun.Status.StagesStatus + // make sure the number of stages in the updateRun is still the same + if len(existingStageStatus) != len(newStageStatus) { + misMatchErr := fmt.Errorf("the number of stages in the stagedUpdateRun has changed, latest: %d, existing: %d", len(newStageStatus), len(existingStageStatus)) + klog.ErrorS(misMatchErr, "The number of stages has changed", "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, misMatchErr.Error()) + } + // make sure the stages in the updateRun are still the same + for curStage := range existingStageStatus { + if existingStageStatus[curStage].StageName != newStageStatus[curStage].StageName { + misMatchErr := fmt.Errorf("the `%d` stage in the stagedUpdateRun has changed, latest: %s, existing: %s", curStage, newStageStatus[curStage].StageName, existingStageStatus[curStage].StageName) + klog.ErrorS(misMatchErr, "The stage has changed", "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, misMatchErr.Error()) + } + if len(existingStageStatus[curStage].Clusters) != len(newStageStatus[curStage].Clusters) { + misMatchErr := fmt.Errorf("the number of clusters in the stage `%s` has changed, latest: %d, existing: %d", existingStageStatus[curStage].StageName, len(newStageStatus[curStage].Clusters), len(existingStageStatus[curStage].Clusters)) + klog.ErrorS(misMatchErr, "The number of clusters in a stage has changed", "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, misMatchErr.Error()) + } + // check 
that the clusters in the stage are still the same + for j := range existingStageStatus[curStage].Clusters { + if existingStageStatus[curStage].Clusters[j].ClusterName != newStageStatus[curStage].Clusters[j].ClusterName { + misMatchErr := fmt.Errorf("the `%d`th cluster in the stage `%s` has changed, latest: %s, existing: %s", j, existingStageStatus[curStage].StageName, newStageStatus[curStage].Clusters[j].ClusterName, existingStageStatus[curStage].Clusters[j].ClusterName) + klog.ErrorS(misMatchErr, "The cluster in a stage has changed", "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, misMatchErr.Error()) + } + } + stageSucceedCond := meta.FindStatusCondition(existingStageStatus[curStage].Conditions, string(placementv1alpha1.StageUpdatingConditionSucceeded)) + stageStartedCond := meta.FindStatusCondition(existingStageStatus[curStage].Conditions, string(placementv1alpha1.StageUpdatingConditionProgressing)) + if condition.IsConditionStatusTrue(stageSucceedCond, updateRun.Generation) { // the stage has finished + if updatingStageIndex != -1 && curStage > updatingStageIndex { + // the finished stage is after the updating stage + unExpectedErr := fmt.Errorf("the finished stage `%d` is after the updating stage `%d`", curStage, updatingStageIndex) + klog.ErrorS(controller.NewUnexpectedBehaviorError(unExpectedErr), "The finished stage is after the updating stage", "currentStage", existingStageStatus[curStage].StageName, "updatingStage", existingStageStatus[updatingStageIndex].StageName, "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unExpectedErr.Error()) + } + // record the last finished stage so we can continue from the next stage if no stage is updating + lastFinishedStageIndex = curStage + // make sure that all the clusters are upgraded + for curCluster := range existingStageStatus[curStage].Clusters { + // check if the cluster is updating + if condition.IsConditionStatusFalse(meta.FindStatusCondition(existingStageStatus[curStage].Clusters[curCluster].Conditions, string(placementv1alpha1.ClusterUpdatingConditionSucceeded)), updateRun.Generation) { + // the clusters in the finished stage should all be finished too + unExpectedErr := fmt.Errorf("there is an updating cluster in finished stage `%s` , the updating clusrter: %s, the updating clusrter index: %d", existingStageStatus[curStage].StageName, existingStageStatus[curStage].Clusters[curCluster].ClusterName, curCluster) + klog.ErrorS(controller.NewUnexpectedBehaviorError(unExpectedErr), "Detected updating clusters in finished stage", "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unExpectedErr.Error()) + } + } + } else if condition.IsConditionStatusFalse(stageSucceedCond, updateRun.Generation) { // the stage is failed + failedErr := fmt.Errorf("the stage `%s` has failed, err: %s", existingStageStatus[curStage].StageName, stageSucceedCond.Message) + klog.ErrorS(failedErr, "The stage has failed", "stageCond", stageSucceedCond, "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, failedErr.Error()) + } else if stageStartedCond != nil { // the stage is updating + // check this is the only stage that is updating + if updatingStageIndex != -1 { + dupErr := fmt.Errorf("more than one updating stage, previous updating stage: %s, new updating stage: %s", existingStageStatus[updatingStageIndex].StageName, 
existingStageStatus[curStage].StageName) + klog.ErrorS(controller.NewUnexpectedBehaviorError(dupErr), "Detected more than one updating stage", "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, dupErr.Error()) + } + if curStage != lastFinishedStageIndex+1 { + // the previous stages are not all finished + unexpectedErr := fmt.Errorf("the updating stage `%d` is not immediately after the last finished stage `%d`", curStage, lastFinishedStageIndex) + klog.ErrorS(controller.NewUnexpectedBehaviorError(unexpectedErr), "There is an unstarted stage before the updating stage", "currentStage", existingStageStatus[curStage].StageName, "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unexpectedErr.Error()) + } + updatingStageIndex = curStage + // collect the updating clusters + var updatingClusters []string + for j := range existingStageStatus[curStage].Clusters { + // check if the cluster is updating + if condition.IsConditionStatusTrue(meta.FindStatusCondition(existingStageStatus[curStage].Clusters[j].Conditions, string(placementv1alpha1.ClusterUpdatingConditionStarted)), updateRun.Generation) && + condition.IsConditionStatusFalse(meta.FindStatusCondition(existingStageStatus[curStage].Clusters[j].Conditions, string(placementv1alpha1.ClusterUpdatingConditionSucceeded)), updateRun.Generation) { + updatingClusters = append(updatingClusters, existingStageStatus[curStage].Clusters[j].ClusterName) + } + } + // We don't allow more than one cluster to be updating at the same time for now + if len(updatingClusters) > 1 { + dupErr := fmt.Errorf("more than one updating cluster in stage `%s`, updating clusters: %v", existingStageStatus[curStage].StageName, updatingClusters) + klog.ErrorS(controller.NewUnexpectedBehaviorError(dupErr), "Detected more than one updating cluster", "stagedUpdateRun", klog.KObj(updateRun)) + return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, dupErr.Error()) + } + } + } + return updatingStageIndex, lastFinishedStageIndex, nil +} + +// recordUpdateRunFailed records the failed update run in the ClusterStagedUpdateRun status. 
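+// It mirrors recordInitializationFailed but marks the Succeeded condition instead; as a rough sketch (exact strings come
+// from the API and condition packages), the resulting condition looks like
+// {Type: StagedUpdateRunConditionSucceeded, Status: False, Reason: UpdateRunFailedReason, Message: <the error that aborted the run>}.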
+func (r *Reconciler) recordUpdateRunFailed(ctx context.Context, updateRun *placementv1alpha1.ClusterStagedUpdateRun, message string) error { + meta.SetStatusCondition(&updateRun.Status.Conditions, metav1.Condition{ + Type: string(placementv1alpha1.StagedUpdateRunConditionSucceeded), + Status: metav1.ConditionFalse, + ObservedGeneration: updateRun.Generation, + Reason: condition.UpdateRunFailedReason, + Message: message, + }) + if updateErr := r.Client.Status().Update(ctx, updateRun); updateErr != nil { + klog.ErrorS(updateErr, "Failed to update the ClusterStagedUpdateRun status as failed", "stagedUpdateRun", klog.KObj(updateRun)) + return updateErr + } + return nil +} diff --git a/pkg/resourcewatcher/change_dector.go b/pkg/resourcewatcher/change_dector.go index 0fbea75d2..957ce97aa 100644 --- a/pkg/resourcewatcher/change_dector.go +++ b/pkg/resourcewatcher/change_dector.go @@ -221,7 +221,7 @@ func (d *ChangeDetector) dynamicResourceFilter(obj interface{}) bool { } if unstructuredObj, ok := obj.(*unstructured.Unstructured); ok { - shouldPropagate, err := utils.ShouldPropagateObj(d.InformerManager, unstructuredObj.DeepCopy()) + shouldPropagate, err := controller.ShouldPropagateObj(d.InformerManager, unstructuredObj.DeepCopy()) if err != nil || !shouldPropagate { klog.V(5).InfoS("Skip watching resource in namespace", "namespace", cwKey.Namespace, "group", cwKey.Group, "version", cwKey.Version, "kind", cwKey.Kind, "object", cwKey.Name) diff --git a/pkg/utils/common.go b/pkg/utils/common.go index 4fb1086f7..deb02aefd 100644 --- a/pkg/utils/common.go +++ b/pkg/utils/common.go @@ -16,14 +16,10 @@ import ( appv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" - discoveryv1 "k8s.io/api/discovery/v1" rbacv1 "k8s.io/api/rbac/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/api/equality" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/discovery" "k8s.io/client-go/util/retry" @@ -37,8 +33,6 @@ import ( placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" fleetv1alpha1 "go.goms.io/fleet/apis/v1alpha1" "go.goms.io/fleet/pkg/utils/condition" - "go.goms.io/fleet/pkg/utils/controller" - "go.goms.io/fleet/pkg/utils/informer" ) const ( @@ -67,9 +61,10 @@ const ( ) const ( - PlacementFieldManagerName = "cluster-placement-controller" - MCControllerFieldManagerName = "member-cluster-controller" - OverrideControllerFieldManagerName = "override-controller" + PlacementFieldManagerName = "cluster-placement-controller" + MCControllerFieldManagerName = "member-cluster-controller" + OverrideControllerFieldManagerName = "override-controller" + UpdateRunControllerFieldManagerName = "updaterun-controller" ) // TODO(ryanzhang): move this to the api directory @@ -433,52 +428,6 @@ func CheckCRDInstalled(discoveryClient discovery.DiscoveryInterface, gvk schema. 
return err } -// ShouldPropagateObj decides if one should propagate the object -func ShouldPropagateObj(informerManager informer.Manager, uObj *unstructured.Unstructured) (bool, error) { - // TODO: add more special handling for different resource kind - switch uObj.GroupVersionKind() { - case corev1.SchemeGroupVersion.WithKind(ConfigMapKind): - // Skip the built-in custom CA certificate created in the namespace - if uObj.GetName() == "kube-root-ca.crt" { - return false, nil - } - case corev1.SchemeGroupVersion.WithKind("ServiceAccount"): - // Skip the default service account created in the namespace - if uObj.GetName() == "default" { - return false, nil - } - case corev1.SchemeGroupVersion.WithKind("Secret"): - // The secret, with type 'kubernetes.io/service-account-token', is created along with `ServiceAccount` should be - // prevented from propagating. - var secret corev1.Secret - if err := runtime.DefaultUnstructuredConverter.FromUnstructured(uObj.Object, &secret); err != nil { - return false, controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to convert a secret object %s in namespace %s: %w", uObj.GetName(), uObj.GetNamespace(), err)) - } - if secret.Type == corev1.SecretTypeServiceAccountToken { - return false, nil - } - case corev1.SchemeGroupVersion.WithKind("Endpoints"): - // we assume that all endpoints with the same name of a service is created by the service controller - if _, err := informerManager.Lister(ServiceGVR).ByNamespace(uObj.GetNamespace()).Get(uObj.GetName()); err != nil { - if apierrors.IsNotFound(err) { - // there is no service of the same name as the end point, - // we assume that this endpoint is created by the user - return true, nil - } - return false, controller.NewAPIServerError(true, fmt.Errorf("failed to get the service %s in namespace %s: %w", uObj.GetName(), uObj.GetNamespace(), err)) - } - // we find a service of the same name as the endpoint, we assume it's created by the service - return false, nil - case discoveryv1.SchemeGroupVersion.WithKind("EndpointSlice"): - // all EndpointSlice created by the EndpointSlice controller has a managed by label - if _, exist := uObj.GetLabels()[discoveryv1.LabelManagedBy]; exist { - // do not propagate hub cluster generated endpoint slice - return false, nil - } - } - return true, nil -} - // IsReservedNamespace indicates if an argued namespace is reserved. func IsReservedNamespace(namespace string) bool { return strings.HasPrefix(namespace, fleetPrefix) || strings.HasPrefix(namespace, kubePrefix) diff --git a/pkg/utils/condition/condition.go b/pkg/utils/condition/condition.go index f6a901e77..5951a9e80 100644 --- a/pkg/utils/condition/condition.go +++ b/pkg/utils/condition/condition.go @@ -97,6 +97,54 @@ const ( AllWorkAvailableReason = "AllWorkAreAvailable" ) +// A group of condition reason string which is used to populate the staged update run. +const ( + // UpdateRunInitializeSucceededReason is the reason string of condition if the update run is initialized successfully. + UpdateRunInitializeSucceededReason = "UpdateRunInitializedSuccessfully" + + // UpdateRunInitializeFailedReason is the reason string of condition if the update run is failed to initialize. + UpdateRunInitializeFailedReason = "UpdateRunInitializedFailed" + + // UpdateRunStartedReason is the reason string of condition if the staged update run has started. + UpdateRunStartedReason = "UpdateRunStarted" + + // UpdateRunFailedReason is the reason string of condition if the staged update run failed. 
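+ // It is used, for example, by recordUpdateRunFailed in the updaterun controller, which sets this reason on the Succeeded condition when a run is aborted.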
+ UpdateRunFailedReason = "UpdateRunFailed" + + // UpdateRunSucceededReason is the reason string of condition if the staged update run succeeded. + UpdateRunSucceededReason = "UpdateRunSucceeded" + + // StageUpdatingStartedReason is the reason string of condition if the stage updating has started. + StageUpdatingStartedReason = "StageUpdatingStarted" + + // StageUpdatingWaitingReason is the reason string of condition if the stage updating is waiting. + StageUpdatingWaitingReason = "StageUpdatingWaiting" + + // StageUpdatingFailedReason is the reason string of condition if the stage updating failed. + StageUpdatingFailedReason = "StageUpdatingFailed" + + // StageUpdatingSucceededReason is the reason string of condition if the stage updating succeeded. + StageUpdatingSucceededReason = "StageUpdatingSucceeded" + + // ClusterUpdatingStartedReason is the reason string of condition if the cluster updating has started. + ClusterUpdatingStartedReason = "ClusterUpdatingStarted" + + // ClusterUpdatingFailedReason is the reason string of condition if the cluster updating failed. + ClusterUpdatingFailedReason = "ClusterUpdatingFailed" + + // ClusterUpdatingSucceededReason is the reason string of condition if the cluster updating succeeded. + ClusterUpdatingSucceededReason = "ClusterUpdatingSucceeded" + + // AfterStageTaskApprovalRequestApprovedReason is the reason string of condition if the approval request for after stage task has been approved. + AfterStageTaskApprovalRequestApprovedReason = "AfterStageTaskApprovalRequestApproved" + + // AfterStageTaskApprovalRequestCreatedReason is the reason string of condition if the approval request for after stage task has been created. + AfterStageTaskApprovalRequestCreatedReason = "AfterStageTaskApprovalRequestCreated" + + // AfterStageTaskWaitTimeElapsedReason is the reason string of condition if the wait time for after stage task has elapsed. + AfterStageTaskWaitTimeElapsedReason = "AfterStageTaskWaitTimeElapsed" +) + // EqualCondition compares one condition with another; it ignores the LastTransitionTime and Message fields, // and will consider the ObservedGeneration values from the two conditions a match if the current // condition is newer. diff --git a/pkg/utils/controller/controller.go b/pkg/utils/controller/controller.go index 5ef85cfef..1f55fd76b 100644 --- a/pkg/utils/controller/controller.go +++ b/pkg/utils/controller/controller.go @@ -14,7 +14,11 @@ import ( "sync" "time" + v1 "k8s.io/api/core/v1" + v12 "k8s.io/api/discovery/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" @@ -22,7 +26,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" fleetv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/utils" "go.goms.io/fleet/pkg/utils/controller/metrics" + "go.goms.io/fleet/pkg/utils/informer" "go.goms.io/fleet/pkg/utils/keys" "go.goms.io/fleet/pkg/utils/labels" ) @@ -351,7 +357,53 @@ type MemberController interface { // Join describes the process of joining the fleet as a member. Join(ctx context.Context) error - // Leaves describes the process of leaving the fleet as a member. + // Leave describes the process of leaving the fleet as a member. // For example, delete all the resources created by the member controller. 
Leave(ctx context.Context) error } + +// ShouldPropagateObj decides if one should propagate the object +func ShouldPropagateObj(informerManager informer.Manager, uObj *unstructured.Unstructured) (bool, error) { + // TODO: add more special handling for different resource kind + switch uObj.GroupVersionKind() { + case v1.SchemeGroupVersion.WithKind(utils.ConfigMapKind): + // Skip the built-in custom CA certificate created in the namespace + if uObj.GetName() == "kube-root-ca.crt" { + return false, nil + } + case v1.SchemeGroupVersion.WithKind("ServiceAccount"): + // Skip the default service account created in the namespace + if uObj.GetName() == "default" { + return false, nil + } + case v1.SchemeGroupVersion.WithKind("Secret"): + // The secret, with type 'kubernetes.io/service-account-token', is created along with `ServiceAccount` should be + // prevented from propagating. + var secret v1.Secret + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(uObj.Object, &secret); err != nil { + return false, NewUnexpectedBehaviorError(fmt.Errorf("failed to convert a secret object %s in namespace %s: %w", uObj.GetName(), uObj.GetNamespace(), err)) + } + if secret.Type == v1.SecretTypeServiceAccountToken { + return false, nil + } + case v1.SchemeGroupVersion.WithKind("Endpoints"): + // we assume that all endpoints with the same name of a service is created by the service controller + if _, err := informerManager.Lister(utils.ServiceGVR).ByNamespace(uObj.GetNamespace()).Get(uObj.GetName()); err != nil { + if apierrors.IsNotFound(err) { + // there is no service of the same name as the end point, + // we assume that this endpoint is created by the user + return true, nil + } + return false, NewAPIServerError(true, fmt.Errorf("failed to get the service %s in namespace %s: %w", uObj.GetName(), uObj.GetNamespace(), err)) + } + // we find a service of the same name as the endpoint, we assume it's created by the service + return false, nil + case v12.SchemeGroupVersion.WithKind("EndpointSlice"): + // all EndpointSlice created by the EndpointSlice controller has a managed by label + if _, exist := uObj.GetLabels()[v12.LabelManagedBy]; exist { + // do not propagate hub cluster generated endpoint slice + return false, nil + } + } + return true, nil +} diff --git a/pkg/utils/controller/controller_test.go b/pkg/utils/controller/controller_test.go index 60247dd36..db959ff16 100644 --- a/pkg/utils/controller/controller_test.go +++ b/pkg/utils/controller/controller_test.go @@ -10,7 +10,6 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -222,14 +221,6 @@ func TestNewCreateIgnoreAlreadyExistError(t *testing.T) { } } -func serviceScheme(t *testing.T) *runtime.Scheme { - scheme := runtime.NewScheme() - if err := fleetv1beta1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - return scheme -} - func TestFetchAllClusterResourceSnapshots(t *testing.T) { crp := "my-test-crp" tests := []struct { diff --git a/pkg/controllers/rollout/override_test.go b/pkg/utils/controller/override_test.go similarity index 97% rename from pkg/controllers/rollout/override_test.go rename to pkg/utils/controller/override_test.go index 4d23f5974..4507b6a7e 100644 --- a/pkg/controllers/rollout/override_test.go +++ 
b/pkg/utils/controller/override_test.go @@ -3,7 +3,7 @@ Copyright (c) Microsoft Corporation. Licensed under the MIT license. */ -package rollout +package controller import ( "context" @@ -22,7 +22,6 @@ import ( clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" - "go.goms.io/fleet/pkg/utils/controller" "go.goms.io/fleet/test/utils/informer" "go.goms.io/fleet/test/utils/resource" ) @@ -780,11 +779,7 @@ func TestFetchAllMatchingOverridesForResourceSnapshot(t *testing.T) { WithScheme(scheme). WithObjects(objects...). Build() - r := Reconciler{ - Client: fakeClient, - InformerManager: &fakeInformer, - } - gotCRO, gotRO, err := r.fetchAllMatchingOverridesForResourceSnapshot(context.Background(), crpName, tc.master) + gotCRO, gotRO, err := FetchAllMatchOverridesForResourceSnapshot(context.Background(), fakeClient, &fakeInformer, crpName, tc.master) if err != nil { t.Fatalf("fetchAllMatchingOverridesForResourceSnapshot() failed, got err %v, want no err", err) } @@ -812,6 +807,7 @@ func TestFetchAllMatchingOverridesForResourceSnapshot(t *testing.T) { } func TestPickFromResourceMatchedOverridesForTargetCluster(t *testing.T) { + clusterName := "cluster-1" tests := []struct { name string cluster *clusterv1beta1.MemberCluster @@ -825,7 +821,7 @@ func TestPickFromResourceMatchedOverridesForTargetCluster(t *testing.T) { name: "empty overrides", cluster: &clusterv1beta1.MemberCluster{ ObjectMeta: metav1.ObjectMeta{ - Name: "cluster-1", + Name: clusterName, }, }, wantCRO: nil, @@ -835,7 +831,7 @@ func TestPickFromResourceMatchedOverridesForTargetCluster(t *testing.T) { name: "non-latest override snapshots", cluster: &clusterv1beta1.MemberCluster{ ObjectMeta: metav1.ObjectMeta{ - Name: "cluster-1", + Name: clusterName, }, }, croList: []*placementv1alpha1.ClusterResourceOverrideSnapshot{ @@ -966,13 +962,13 @@ func TestPickFromResourceMatchedOverridesForTargetCluster(t *testing.T) { Name: "cluster-not-exist", }, }, - wantErr: controller.ErrExpectedBehavior, + wantErr: ErrExpectedBehavior, }, { name: "matched overrides with empty cluster label", cluster: &clusterv1beta1.MemberCluster{ ObjectMeta: metav1.ObjectMeta{ - Name: "cluster-1", + Name: clusterName, }, }, croList: []*placementv1alpha1.ClusterResourceOverrideSnapshot{ @@ -1090,7 +1086,7 @@ func TestPickFromResourceMatchedOverridesForTargetCluster(t *testing.T) { name: "matched overrides with non-empty cluster label", cluster: &clusterv1beta1.MemberCluster{ ObjectMeta: metav1.ObjectMeta{ - Name: "cluster-1", + Name: clusterName, Labels: map[string]string{ "key1": "value1", "key2": "value2", @@ -1235,7 +1231,7 @@ func TestPickFromResourceMatchedOverridesForTargetCluster(t *testing.T) { name: "no matched overrides with non-empty cluster label", cluster: &clusterv1beta1.MemberCluster{ ObjectMeta: metav1.ObjectMeta{ - Name: "cluster-1", + Name: clusterName, Labels: map[string]string{ "key1": "value1", "key2": "value2", @@ -1317,18 +1313,7 @@ func TestPickFromResourceMatchedOverridesForTargetCluster(t *testing.T) { WithScheme(scheme). WithObjects(objects...). 
 				Build()
-			r := Reconciler{
-				Client: fakeClient,
-			}
-			binding := &placementv1beta1.ClusterResourceBinding{
-				ObjectMeta: metav1.ObjectMeta{
-					Name: "binding-1",
-				},
-				Spec: placementv1beta1.ResourceBindingSpec{
-					TargetCluster: "cluster-1",
-				},
-			}
-			gotCRO, gotRO, err := r.pickFromResourceMatchedOverridesForTargetCluster(context.Background(), binding, tc.croList, tc.roList)
+			gotCRO, gotRO, err := PickFromResourceMatchedOverridesForTargetCluster(context.Background(), fakeClient, clusterName, tc.croList, tc.roList)
 			if gotErr, wantErr := err != nil, tc.wantErr != nil; gotErr != wantErr || !errors.Is(err, tc.wantErr) {
 				t.Fatalf("pickFromResourceMatchedOverridesForTargetCluster() got error %v, want error %v", err, tc.wantErr)
 			}
diff --git a/pkg/controllers/rollout/override.go b/pkg/utils/controller/overrider.go
similarity index 69%
rename from pkg/controllers/rollout/override.go
rename to pkg/utils/controller/overrider.go
index fed2af39c..133f5c3b7 100644
--- a/pkg/controllers/rollout/override.go
+++ b/pkg/utils/controller/overrider.go
@@ -1,9 +1,4 @@
-/*
-Copyright (c) Microsoft Corporation.
-Licensed under the MIT license.
-*/
-
-package rollout
+package controller
 
 import (
 	"context"
@@ -11,33 +6,34 @@ import (
 	"sort"
 	"strconv"
 
-	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	errors2 "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/klog/v2"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
-	clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1"
+	"go.goms.io/fleet/apis/cluster/v1beta1"
 	placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1"
 	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 	"go.goms.io/fleet/pkg/utils"
-	"go.goms.io/fleet/pkg/utils/controller"
+	"go.goms.io/fleet/pkg/utils/informer"
 	"go.goms.io/fleet/pkg/utils/overrider"
 )
 
-// fetchAllMatchingOverridesForResourceSnapshot fetches all the matching overrides which are attached to the selected resources.
-func (r *Reconciler) fetchAllMatchingOverridesForResourceSnapshot(ctx context.Context, crp string, masterResourceSnapshot *placementv1beta1.ClusterResourceSnapshot) ([]*placementv1alpha1.ClusterResourceOverrideSnapshot, []*placementv1alpha1.ResourceOverrideSnapshot, error) {
+// FetchAllMatchOverridesForResourceSnapshot finds all the overrides that select any of the resources in the given resource snapshots.
+func FetchAllMatchOverridesForResourceSnapshot(ctx context.Context, c client.Client, manager informer.Manager, crp string,
+	masterResourceSnapshot *placementv1beta1.ClusterResourceSnapshot) ([]*placementv1alpha1.ClusterResourceOverrideSnapshot, []*placementv1alpha1.ResourceOverrideSnapshot, error) {
 	// fetch the cro and ro snapshot list first before finding the matched ones.
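+	// Only the override snapshots labeled as the latest (IsLatestSnapshotLabel set to true) are listed here;
+	// the matching below then walks every selected resource in the CRP's resource snapshots.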
 	latestSnapshotLabelMatcher := client.MatchingLabels{
 		placementv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true),
 	}
 	croList := &placementv1alpha1.ClusterResourceOverrideSnapshotList{}
-	if err := r.Client.List(ctx, croList, latestSnapshotLabelMatcher); err != nil {
+	if err := c.List(ctx, croList, latestSnapshotLabelMatcher); err != nil {
 		klog.ErrorS(err, "Failed to list all the clusterResourceOverrideSnapshots")
 		return nil, nil, err
 	}
 	roList := &placementv1alpha1.ResourceOverrideSnapshotList{}
-	if err := r.Client.List(ctx, roList, latestSnapshotLabelMatcher); err != nil {
+	if err := c.List(ctx, roList, latestSnapshotLabelMatcher); err != nil {
 		klog.ErrorS(err, "Failed to list all the resourceOverrideSnapshots")
 		return nil, nil, err
 	}
@@ -46,7 +42,7 @@ func (r *Reconciler) fetchAllMatchingOverridesForResourceSnapshot(ctx context.Co
 		return nil, nil, nil // no overrides and nothing to do
 	}
 
-	resourceSnapshots, err := controller.FetchAllClusterResourceSnapshots(ctx, r.Client, crp, masterResourceSnapshot)
+	resourceSnapshots, err := FetchAllClusterResourceSnapshots(ctx, c, crp, masterResourceSnapshot)
 	if err != nil {
 		return nil, nil, err
 	}
@@ -59,11 +55,11 @@ func (r *Reconciler) fetchAllMatchingOverridesForResourceSnapshot(ctx context.Co
 			var uResource unstructured.Unstructured
 			if err := uResource.UnmarshalJSON(res.Raw); err != nil {
 				klog.ErrorS(err, "Resource has invalid content", "snapshot", klog.KObj(snapshot), "selectedResource", res.Raw)
-				return nil, nil, controller.NewUnexpectedBehaviorError(err)
+				return nil, nil, NewUnexpectedBehaviorError(err)
 			}
 			// If the resource is namespaced scope resource, the resource could be selected by the namespace or selected
 			// by the object itself.
-			if !r.InformerManager.IsClusterScopedResources(uResource.GroupVersionKind()) {
+			if !manager.IsClusterScopedResources(uResource.GroupVersionKind()) {
 				croKey := placementv1beta1.ResourceIdentifier{
 					Group:   utils.NamespaceMetaGVK.Group,
 					Version: utils.NamespaceMetaGVK.Version,
@@ -125,24 +121,21 @@ func (r *Reconciler) fetchAllMatchingOverridesForResourceSnapshot(ctx context.Co
 	return filteredCRO, filteredRO, nil
 }
 
-// pickFromResourceMatchedOverridesForTargetCluster will look for any overrides associated with the "Bound" or "Scheduled" binding.
-// croList is a list of clusterResourceOverrides attached to the selected resources.
-// roList is a list of resourceOverrides attached to the selected resources.
-// It returns names of cro and ro attached to the target cluster, and they're ordered by its namespace (if present) and
-// then name.
-func (r *Reconciler) pickFromResourceMatchedOverridesForTargetCluster(ctx context.Context, binding *placementv1beta1.ClusterResourceBinding, croList []*placementv1alpha1.ClusterResourceOverrideSnapshot, roList []*placementv1alpha1.ResourceOverrideSnapshot) ([]string, []placementv1beta1.NamespacedName, error) {
+// PickFromResourceMatchedOverridesForTargetCluster filters the resource-matched overrides down to those that apply to the target cluster.
+func PickFromResourceMatchedOverridesForTargetCluster(ctx context.Context, c client.Client, targetCluster string,
+	croList []*placementv1alpha1.ClusterResourceOverrideSnapshot, roList []*placementv1alpha1.ResourceOverrideSnapshot) ([]string, []placementv1beta1.NamespacedName, error) {
+	// the common case where there is no override for the resources.
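+	// The early return below covers it; otherwise the target member cluster is fetched by name
+	// and a missing cluster is surfaced as an expected-behavior error.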
 	if len(croList) == 0 && len(roList) == 0 {
 		return nil, nil, nil
 	}
-
-	cluster := clusterv1beta1.MemberCluster{}
-	if err := r.Client.Get(ctx, types.NamespacedName{Name: binding.Spec.TargetCluster}, &cluster); err != nil {
-		if apierrors.IsNotFound(err) {
-			klog.V(2).InfoS("MemberCluster has been deleted and we expect that scheduler will update the spec of binding to unscheduled", "memberCluster", binding.Spec.TargetCluster, "clusterResourceBinding", klog.KObj(binding))
-			return nil, nil, controller.NewExpectedBehaviorError(err)
+	cluster := v1beta1.MemberCluster{}
+	if err := c.Get(ctx, types.NamespacedName{Name: targetCluster}, &cluster); err != nil {
+		if errors2.IsNotFound(err) {
+			klog.V(2).InfoS("MemberCluster has been deleted and we expect that scheduler will update the spec of binding to unscheduled", "memberCluster", targetCluster)
+			return nil, nil, NewExpectedBehaviorError(err)
 		}
-		klog.ErrorS(err, "Failed to get the memberCluster", "memberCluster", binding.Spec.TargetCluster, "clusterResourceBinding", klog.KObj(binding))
-		return nil, nil, controller.NewAPIServerError(true, err)
+		klog.ErrorS(err, "Failed to get the memberCluster", "memberCluster", targetCluster)
+		return nil, nil, NewAPIServerError(true, err)
 	}
 
 	croFiltered := make([]*placementv1alpha1.ClusterResourceOverrideSnapshot, 0, len(croList))
@@ -150,7 +143,7 @@ func (r *Reconciler) pickFromResourceMatchedOverridesForTargetCluster(ctx contex
 		matched, err := isClusterMatched(cluster, cro.Spec.OverrideSpec.Policy)
 		if err != nil {
 			klog.ErrorS(err, "Invalid clusterResourceOverride", "clusterResourceOverride", klog.KObj(cro))
-			return nil, nil, controller.NewUnexpectedBehaviorError(err)
+			return nil, nil, NewUnexpectedBehaviorError(err)
 		}
 		if matched {
 			croFiltered = append(croFiltered, croList[i])
@@ -166,7 +159,7 @@ func (r *Reconciler) pickFromResourceMatchedOverridesForTargetCluster(ctx contex
 		matched, err := isClusterMatched(cluster, ro.Spec.OverrideSpec.Policy)
 		if err != nil {
 			klog.ErrorS(err, "Invalid resourceOverride", "resourceOverride", klog.KObj(ro))
-			return nil, nil, controller.NewUnexpectedBehaviorError(err)
+			return nil, nil, NewUnexpectedBehaviorError(err)
 		}
 		if matched {
 			roFiltered = append(roFiltered, roList[i])
@@ -187,11 +180,11 @@ func (r *Reconciler) pickFromResourceMatchedOverridesForTargetCluster(ctx contex
 	for i, o := range roFiltered {
 		roNames[i] = placementv1beta1.NamespacedName{Name: o.Name, Namespace: o.Namespace}
 	}
-	klog.V(2).InfoS("Found matched overrides for the binding", "binding", klog.KObj(binding), "matchedCROCount", len(croNames), "matchedROCount", len(roNames))
+	klog.V(2).InfoS("Found matched overrides for the target cluster", "memberCluster", targetCluster, "matchedCROCount", len(croNames), "matchedROCount", len(roNames))
 	return croNames, roNames, nil
 }
 
-func isClusterMatched(cluster clusterv1beta1.MemberCluster, policy *placementv1alpha1.OverridePolicy) (bool, error) {
+func isClusterMatched(cluster v1beta1.MemberCluster, policy *placementv1alpha1.OverridePolicy) (bool, error) {
 	if policy == nil {
 		return false, errors.New("policy is nil")
 	}
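A minimal sketch (not part of the patch itself) of how a caller outside the controller package, for example the new updaterun initialization, could combine the two relocated helpers now that they are exported. The function name resolveOverridesForCluster, the package placement, and the surrounding wiring are illustrative assumptions; only the two exported helpers and their signatures come from the patch above.

package updaterun

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"

	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
	"go.goms.io/fleet/pkg/utils/controller"
	"go.goms.io/fleet/pkg/utils/informer"
)

// resolveOverridesForCluster is a hypothetical helper: it gathers the override snapshots that
// touch the resources selected by a CRP and then narrows them down to a single target member cluster.
func resolveOverridesForCluster(ctx context.Context, c client.Client, im informer.Manager,
	crpName, clusterName string, master *placementv1beta1.ClusterResourceSnapshot) ([]string, []placementv1beta1.NamespacedName, error) {
	// All latest CRO/RO snapshots that match any resource selected by the CRP.
	croList, roList, err := controller.FetchAllMatchOverridesForResourceSnapshot(ctx, c, im, crpName, master)
	if err != nil {
		return nil, nil, err
	}
	// Only the overrides whose policy matches the target member cluster.
	return controller.PickFromResourceMatchedOverridesForTargetCluster(ctx, c, clusterName, croList, roList)
}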