From 5832461e2b3782e758c80b5c79c43d0b5c29a670 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Date: Fri, 24 May 2024 15:09:38 -0700 Subject: [PATCH] docs: update troubleshooting guide with latest conditions (#798) --- docs/troubleshooting/README.md | 631 +----------------- .../clusterResourcePlacementApplied.md | 241 +++++++ .../clusterResourcePlacementAvailable.md | 206 ++++++ .../clusterResourcePlacementOverridden.md | 159 +++++ .../clusterResourcePlacementRolloutStarted.md | 351 ++++++++++ .../clusterResourcePlacementScheduled.md | 199 ++++++ ...lusterResourcePlacementWorkSynchronized.md | 115 ++++ 7 files changed, 1290 insertions(+), 612 deletions(-) create mode 100644 docs/troubleshooting/clusterResourcePlacementApplied.md create mode 100644 docs/troubleshooting/clusterResourcePlacementAvailable.md create mode 100644 docs/troubleshooting/clusterResourcePlacementOverridden.md create mode 100644 docs/troubleshooting/clusterResourcePlacementRolloutStarted.md create mode 100644 docs/troubleshooting/clusterResourcePlacementScheduled.md create mode 100644 docs/troubleshooting/clusterResourcePlacementWorkSynchronized.md diff --git a/docs/troubleshooting/README.md b/docs/troubleshooting/README.md index 797fd357a..c38bb9b11 100644 --- a/docs/troubleshooting/README.md +++ b/docs/troubleshooting/README.md @@ -13,616 +13,23 @@ Internal Objects to keep in mind when troubleshooting CRP related errors on the - `Work` Please read the API reference for more details about each object https://github.com/Azure/fleet/blob/main/docs/api-references.md. - -## How can I debug when my CRP status is ClusterResourcePlacementScheduled condition status is set to false? - -### Common scenarios: - -Instances where this condition may arise: - -- When the placement policy is set to `PickFixed`, but the specified cluster names do not match any joined member cluster name in the fleet, or the specified cluster is no longer connected to the fleet. -- When the placement policy is set to `PickN`, and N clusters are specified, but there are fewer than N clusters that have joined the fleet or satisfy the placement policy. - ->>Note: When the placement policy is set to `PickAll`, the `ClusterResourcePlacementScheduled` condition is always set to `true`. - -### Example Scenario: - -The example output below demonstrates a `ClusterResourcePlacement` with a `PickN` placement policy attempting to propagate resources to two clusters labeled `env:prod`. In this instance, two clusters, namely `kind-cluster-1` and `kind-cluster-2`, are joined to the fleet, with only one member cluster, `kind-cluster-1`, having the label `env:prod`. - -### CRP spec: -``` -spec: - policy: - affinity: - clusterAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - clusterSelectorTerms: - - labelSelector: - matchLabels: - env: prod - numberOfClusters: 2 - placementType: PickN - resourceSelectors: - ... 
- revisionHistoryLimit: 10 - strategy: - type: RollingUpdate -``` - -### CRP status: -``` -status: - conditions: - - lastTransitionTime: "2023-11-27T20:25:19Z" - message: could not find all the clusters needed as specified by the scheduling - policy - observedGeneration: 2 - reason: SchedulingPolicyUnfulfilled - status: "False" - type: ClusterResourcePlacementScheduled - - lastTransitionTime: "2023-11-27T20:25:24Z" - message: All 1 cluster(s) are synchronized to the latest resources on the hub - cluster - observedGeneration: 2 - reason: SynchronizeSucceeded - status: "True" - type: ClusterResourcePlacementSynchronized - - lastTransitionTime: "2023-11-27T20:25:24Z" - message: Successfully applied resources to 1 member clusters - observedGeneration: 2 - reason: ApplySucceeded - status: "True" - type: ClusterResourcePlacementApplied - placementStatuses: - - clusterName: kind-cluster-1 - conditions: - - lastTransitionTime: "2023-11-27T20:25:19Z" - message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity - score: 0, topology spread score: 0): picked by scheduling policy' - observedGeneration: 2 - reason: ScheduleSucceeded - status: "True" - type: ResourceScheduled - - lastTransitionTime: "2023-11-27T20:25:24Z" - message: Successfully Synchronized work(s) for placement - observedGeneration: 2 - reason: WorkSynchronizeSucceeded - status: "True" - type: WorkSynchronized - - lastTransitionTime: "2023-11-27T20:25:24Z" - message: Successfully applied resources - observedGeneration: 2 - reason: ApplySucceeded - status: "True" - type: ResourceApplied - - conditions: - - lastTransitionTime: "2023-11-27T20:25:40Z" - message: 'kind-cluster-2 is not selected: ClusterUnschedulable, none of the - nonempty required cluster affinity term (total number: 1) is matched' - observedGeneration: 2 - reason: ScheduleFailed - status: "False" - type: ResourceScheduled - selectedResources: - ... -``` - -The `ClusterResourcePlacementScheduled` condition is set to `false`, the goal is to select two clusters with the label `env:prod`, but only one member cluster possesses the correct label as specified in `clusterAffinity`. - -We can also take a look at the `ClusterSchedulingPolicySnapshot` status to figure out why the scheduler could not schedule the resource for the placement policy specified. - -The corresponding `ClusterSchedulingPolicySnapshot` spec and status gives us even more information on why scheduling failed. Please refer to this [section](#how-to-find--verify-the-latest-clusterschedulingpolicysnapshot-for-a-crp) to learn how to get the latest `ClusterSchedulingPolicySnapshot`. - -### Latest ClusterSchedulingPolicySnapshot: -``` -apiVersion: placement.kubernetes-fleet.io/v1beta1 -kind: ClusterSchedulingPolicySnapshot -metadata: - annotations: - kubernetes-fleet.io/CRP-generation: "2" - kubernetes-fleet.io/number-of-clusters: "2" - creationTimestamp: "2023-11-27T21:33:01Z" - generation: 1 - labels: - kubernetes-fleet.io/is-latest-snapshot: "true" - kubernetes-fleet.io/parent-CRP: crp-4 - kubernetes-fleet.io/policy-index: "0" - name: ... - ownerReferences: - - apiVersion: placement.kubernetes-fleet.io/v1beta1 - blockOwnerDeletion: true - controller: true - kind: ClusterResourcePlacement - name: ... 
- uid: 37e83327-26e0-4c48-8276-e62cc6aa067f - resourceVersion: "10085" - uid: f2a3d0ea-c9fa-455d-be09-51b5d090e5d6 -spec: - policy: - affinity: - clusterAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - clusterSelectorTerms: - - labelSelector: - matchLabels: - env: prod - placementType: PickN - policyHash: ZjE0Yjk4YjYyMTVjY2U3NzQ1MTZkNWRhZjRiNjQ1NzQ4NjllNTUyMzZkODBkYzkyYmRkMGU3OTI3MWEwOTkyNQ== -status: - conditions: - - lastTransitionTime: "2023-11-27T21:33:01Z" - message: could not find all the clusters needed as specified by the scheduling - policy - observedGeneration: 1 - reason: SchedulingPolicyUnfulfilled - status: "False" - type: Scheduled - observedCRPGeneration: 2 - targetClusters: - - clusterName: kind-cluster-1 - clusterScore: - affinityScore: 0 - priorityScore: 0 - reason: picked by scheduling policy - selected: true - - clusterName: kind-cluster-2 - reason: 'ClusterUnschedulable, none of the nonempty required cluster affinity - term (total number: 1) is matched' - selected: false -``` - -### Resolution: -The solution here is to add the `env:prod` label to the member cluster resource for `kind-cluster-2` as well, so that the scheduler can select the cluster to propagate resources. - -## How can I debug when my CRP status is ClusterResourcePlacementSynchronized condition status is set to false? - -The `ClusterResourcePlacementSynchronized` condition status is set to `false` under the following circumstances: when the `Work` is not created or updated for a new `ClusterResourceSnapshot`. - -### Investigation Steps: - -- In the `ClusterResourcePlacement` status section, examine the `placementStatuses` to identify clusters with the `WorkSynchronized` status set to `false`. -- Locate the corresponding `ClusterResourceBinding` for the identified cluster. Please check this [section](#how-to-find-the-latest-clusterresourcebinding-resource) to learn how to get the latest `ClusterResourceBinding`. This resource should indicate the status of the `Work` whether it was created or updated. -- A common scenario leading to this issue is the user input for the `rollingUpdate` configuration being too strict. Verify the values for `maxUnavailable` and `maxSurge` to ensure they align with your expectations. - -### Example Scenario: - -In the following example, an attempt is made to propagate a namespace to three member clusters. However, during the initial creation of the `ClusterResourcePlacement`, the namespace doesn't exist on the hub cluster, and the fleet currently comprises two member clusters named `kind-cluster-1` and `kind-cluster-2`. 
- -### CRP spec: -``` -spec: - policy: - numberOfClusters: 3 - placementType: PickN - resourceSelectors: - - group: "" - kind: Namespace - name: test-ns - version: v1 - revisionHistoryLimit: 10 - strategy: - type: RollingUpdate -``` - -### CRP status: -``` -status: - conditions: - - lastTransitionTime: "2023-11-29T21:36:49Z" - message: could not find all the clusters needed as specified by the scheduling - policy - observedGeneration: 2 - reason: SchedulingPolicyUnfulfilled - status: "False" - type: ClusterResourcePlacementScheduled - - lastTransitionTime: "2023-11-29T21:36:54Z" - message: All 2 cluster(s) are synchronized to the latest resources on the hub - cluster - observedGeneration: 2 - reason: SynchronizeSucceeded - status: "True" - type: ClusterResourcePlacementSynchronized - - lastTransitionTime: "2023-11-29T21:36:54Z" - message: Successfully applied resources to 2 member clusters - observedGeneration: 2 - reason: ApplySucceeded - status: "True" - type: ClusterResourcePlacementApplied - placementStatuses: - - clusterName: kind-cluster-2 - conditions: - - lastTransitionTime: "2023-11-29T21:36:49Z" - message: 'Successfully scheduled resources for placement in kind-cluster-2 (affinity - score: 0, topology spread score: 0): picked by scheduling policy' - observedGeneration: 2 - reason: ScheduleSucceeded - status: "True" - type: ResourceScheduled - - lastTransitionTime: "2023-11-29T21:36:49Z" - message: Successfully Synchronized work(s) for placement - observedGeneration: 2 - reason: WorkSynchronizeSucceeded - status: "True" - type: WorkSynchronized - - lastTransitionTime: "2023-11-29T21:36:54Z" - message: Successfully applied resources - observedGeneration: 2 - reason: ApplySucceeded - status: "True" - type: ResourceApplied - - clusterName: kind-cluster-1 - conditions: - - lastTransitionTime: "2023-11-29T21:36:49Z" - message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity - score: 0, topology spread score: 0): picked by scheduling policy' - observedGeneration: 2 - reason: ScheduleSucceeded - status: "True" - type: ResourceScheduled - - lastTransitionTime: "2023-11-29T21:36:54Z" - message: Successfully Synchronized work(s) for placement - observedGeneration: 2 - reason: WorkSynchronizeSucceeded - status: "True" - type: WorkSynchronized - - lastTransitionTime: "2023-11-29T21:36:54Z" - message: Successfully applied resources - observedGeneration: 2 - reason: ApplySucceeded - status: "True" - type: ResourceApplied -``` - -Given that the resource `test-ns` namespace never existed on the hub cluster, the `ClusterResourcePlacement` status reflects the following: -- `ClusterResourcePlacementScheduled` is set to `false`, as the specified policy aims to pick three clusters, but the scheduler can only accommodate placement in two currently available and joined clusters. -- `ClusterResourcePlacementSynchronized` is set to `true`. -- `ClusterResourcePlacementApplied` is set to `true`. - -Subsequently, we proceed to create the `test-ns` namespace on the hub cluster. We anticipate the seamless propagation of the namespace across the relevant clusters. 
- -### CRP status after namespace test-ns is created on the hub cluster: -``` -status: - conditions: - - lastTransitionTime: "2023-11-29T21:36:49Z" - message: could not find all the clusters needed as specified by the scheduling - policy - observedGeneration: 2 - reason: SchedulingPolicyUnfulfilled - status: "False" - type: ClusterResourcePlacementScheduled - - lastTransitionTime: "2023-11-29T21:49:43Z" - message: There are still 2 cluster(s) pending to be sychronized on the hub cluster - observedGeneration: 2 - reason: SynchronizePending - status: "False" - type: ClusterResourcePlacementSynchronized - - lastTransitionTime: "2023-11-29T21:49:43Z" - message: 'Works need to be synchronized on the hub cluster or there are still - manifests pending to be processed by the 2 member clusters ' - observedGeneration: 2 - reason: ApplyPending - status: Unknown - type: ClusterResourcePlacementApplied - placementStatuses: - - clusterName: kind-cluster-2 - conditions: - - lastTransitionTime: "2023-11-29T21:36:49Z" - message: 'Successfully scheduled resources for placement in kind-cluster-2 (affinity - score: 0, topology spread score: 0): picked by scheduling policy' - observedGeneration: 2 - reason: ScheduleSucceeded - status: "True" - type: ResourceScheduled - - lastTransitionTime: "2023-11-29T21:49:43Z" - message: 'In the process of synchronizing or operation is blocked by the rollout - strategy ' - observedGeneration: 2 - reason: WorkSynchronizePending - status: "False" - type: WorkSynchronized - - lastTransitionTime: "2023-11-29T21:49:43Z" - message: Works need to be synchronized on the hub cluster or there are still - manifests pending to be processed by the member cluster - observedGeneration: 2 - reason: ApplyPending - status: Unknown - type: ResourceApplied - - clusterName: kind-cluster-1 - conditions: - - lastTransitionTime: "2023-11-29T21:36:49Z" - message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity - score: 0, topology spread score: 0): picked by scheduling policy' - observedGeneration: 2 - reason: ScheduleSucceeded - status: "True" - type: ResourceScheduled - - lastTransitionTime: "2023-11-29T21:49:43Z" - message: 'In the process of synchronizing or operation is blocked by the rollout - strategy ' - observedGeneration: 2 - reason: WorkSynchronizePending - status: "False" - type: WorkSynchronized - - lastTransitionTime: "2023-11-29T21:49:43Z" - message: Works need to be synchronized on the hub cluster or there are still - manifests pending to be processed by the member cluster - observedGeneration: 2 - reason: ApplyPending - status: Unknown - type: ResourceApplied - selectedResources: - - kind: Namespace - name: test-ns - version: v1 -``` - -Upon examination, the `ClusterResourcePlacementSynchronized` status is found to be `false`, and in the `placementStatus` section for `WorkSynchronized` condition we see a message indicating that `Works need to be synchronized on the hub cluster, or there are still manifests pending to be processed by the 2 member clusters` - -Let's check the latest `ClusterResourceSnapshot`. Please refer to this [section](#how-to-find-the-latest-clusterresourcesnapshot-resource) to learn how to get the latest `ClusterResourceSnapshot`. 
- -### Latest ClusterResourceSnapshot: -``` -metadata: - annotations: - kubernetes-fleet.io/number-of-enveloped-object: "0" - kubernetes-fleet.io/number-of-resource-snapshots: "1" - kubernetes-fleet.io/resource-hash: 72344be6e268bc7af29d75b7f0aad588d341c228801aab50d6f9f5fc33dd9c7c - creationTimestamp: "2023-12-05T04:36:24Z" - generation: 1 - labels: - kubernetes-fleet.io/is-latest-snapshot: "true" - kubernetes-fleet.io/parent-CRP: test-crp - kubernetes-fleet.io/resource-index: "1" - name: test-crp-1-snapshot - ownerReferences: - - apiVersion: placement.kubernetes-fleet.io/v1beta1 - blockOwnerDeletion: true - controller: true - kind: ClusterResourcePlacement - name: test-crp - uid: 1c474983-cda0-49cb-bf60-3d2a42f122ba - resourceVersion: "4489" - uid: a520f775-14cc-4bf5-b8cd-c4efc0e2be34 -spec: - selectedResources: - - apiVersion: v1 - kind: Namespace - metadata: - labels: - kubernetes.io/metadata.name: test-ns - name: test-ns - spec: - finalizers: - - kubernetes -``` - -Upon inspecting `ClusterResourceSnapshot` spec, we observe that the `selectedResources` section now has the namespace `test-ns`. - -Let's check the `ClusterResourceBinding` for `kind-cluster-1` to see if it got updated after the namespace `test-ns` was created. Please check this [section](#how-to-find-the-latest-clusterresourcebinding-resource) to learn how to get the latest `ClusterResourceBinding`. - -### ClusterResourceBinding for kind-cluster-1: -``` -apiVersion: placement.kubernetes-fleet.io/v1beta1 -kind: ClusterResourceBinding -metadata: - creationTimestamp: "2023-12-05T04:17:49Z" - finalizers: - - kubernetes-fleet.io/work-cleanup - generation: 2 - labels: - kubernetes-fleet.io/parent-CRP: test-crp - name: test-crp-kind-cluster-1-4e5c873b - resourceVersion: "2572" - uid: 8ae9741d-e95c-44f8-b36a-29d73f6b833c -spec: - clusterDecision: - clusterName: kind-cluster-1 - clusterScore: - affinityScore: 0 - priorityScore: 0 - reason: picked by scheduling policy - selected: true - resourceSnapshotName: test-crp-0-snapshot - schedulingPolicySnapshotName: test-crp-0 - state: Bound - targetCluster: kind-cluster-1 -status: - conditions: - - lastTransitionTime: "2023-12-05T04:17:50Z" - message: "" - observedGeneration: 2 - reason: AllWorkSynced - status: "True" - type: Bound - - lastTransitionTime: "2023-12-05T04:17:50Z" - message: "" - observedGeneration: 2 - reason: AllWorkHasBeenApplied - status: "True" - type: Applied -``` - -Upon inspection, it is observed that the `ClusterResourceBinding` remains unchanged. Notably, in the spec, the `resourceSnapshotName` still references the old `ClusterResourceSnapshot` name. - -This scenario arises due to the absence of explicit `rollingUpdate` input from the user. Consequently, the default values are applied: - -- `maxUnavailable` is configured to `25% * 3 (desired number), rounded to 1` -- `maxSurge` is configured to `25% * 3 (desired number), rounded to 1` - -### Summary of Events: -1. Initially, when the CRP was created, two `ClusterResourceBindings` were generated. However, since the `test-ns` namespace did not exist on the hub cluster, the `Work` object was created with an empty list of manifests, and `ClusterResourcePlacementSynchronized` was set to `true`. -2. Upon creating the `test-ns` namespace on the hub, the rollout controller attempted to update the two existing `ClusterResourceBindings`. However, the `rollingUpdate` configuration was too strict: `maxUnavailable` was set to 1, which was already the case due to a missing member cluster. 
If, during the update, even one of the bindings failed to apply, it would violate the `rollingUpdate` configuration since `maxUnavailable` was set to 1. - -### Resolution: -- To address this specific issue, consider manually setting `maxUnavailable` to a value greater than 2 to relax the `rollingUpdate` configuration. -- Alternatively, you can also join a third member cluster. - -## How can I debug when my CRP ClusterResourcePlacementApplied condition is set to false? - -### Investigation steps: - -1. Check `placementStatuses`: In the `ClusterResourcePlacement` status section, inspect the `placementStatuses` to identify which clusters have the `ResourceApplied` condition set to `false` and note down their `clusterName`. -2. Locate `Work` Object in Hub Cluster: Use the identified `clusterName` to locate the `Work` object associated with the member cluster. Please refer to this [section](#how-and-where-to-find-the-correct-work-resource) to learn how to get the correct `Work` resource. -3. Check `Work` object status: Inspect the status of the `Work` object to understand the specific issues preventing successful resource application. - -### Example Scenario: -In this example, the `ClusterResourcePlacement` is attempting to propagate a namespace containing a deployment to two member clusters. However, the namespace already exists on one member cluster, specifically named `kind-cluster-1`. - -### CRP spec: -``` - policy: - clusterNames: - - kind-cluster-1 - - kind-cluster-2 - placementType: PickFixed - resourceSelectors: - - group: "" - kind: Namespace - name: test-ns - version: v1 - revisionHistoryLimit: 10 - strategy: - type: RollingUpdate -``` - -### CRP status: -``` - conditions: - - lastTransitionTime: "2023-11-28T20:56:15Z" - message: found all the clusters needed as specified by the scheduling policy - observedGeneration: 2 - reason: SchedulingPolicyFulfilled - status: "True" - type: ClusterResourcePlacementScheduled - - lastTransitionTime: "2023-11-28T20:56:21Z" - message: All 2 cluster(s) are synchronized to the latest resources on the hub - cluster - observedGeneration: 2 - reason: SynchronizeSucceeded - status: "True" - type: ClusterResourcePlacementSynchronized - - lastTransitionTime: "2023-11-28T20:56:21Z" - message: Failed to apply manifests to 1 clusters, please check the `failedResourcePlacements` - status - observedGeneration: 2 - reason: ApplyFailed - status: "False" - type: ClusterResourcePlacementApplied - placementStatuses: - - clusterName: kind-cluster-1 - conditions: - - lastTransitionTime: "2023-11-28T20:56:15Z" - message: 'Successfully scheduled resources for placement in kind-cluster-1: - picked by scheduling policy' - observedGeneration: 2 - reason: ScheduleSucceeded - status: "True" - type: ResourceScheduled - - lastTransitionTime: "2023-11-28T20:56:21Z" - message: Successfully Synchronized work(s) for placement - observedGeneration: 2 - reason: WorkSynchronizeSucceeded - status: "True" - type: WorkSynchronized - - lastTransitionTime: "2023-11-28T20:56:21Z" - message: Failed to apply manifests, please check the `failedResourcePlacements` - status - observedGeneration: 2 - reason: ApplyFailed - status: "False" - type: ResourceApplied - failedPlacements: - - condition: - lastTransitionTime: "2023-11-28T20:56:16Z" - message: 'Failed to apply manifest: resource is not managed by the work controller' - reason: AppliedManifestFailedReason - status: "False" - type: Applied - kind: Namespace - name: test-ns - version: v1 - - clusterName: kind-cluster-2 - conditions: - - 
lastTransitionTime: "2023-11-28T20:56:15Z" - message: 'Successfully scheduled resources for placement in kind-cluster-2: - picked by scheduling policy' - observedGeneration: 2 - reason: ScheduleSucceeded - status: "True" - type: ResourceScheduled - - lastTransitionTime: "2023-11-28T20:56:15Z" - message: Successfully Synchronized work(s) for placement - observedGeneration: 2 - reason: WorkSynchronizeSucceeded - status: "True" - type: WorkSynchronized - - lastTransitionTime: "2023-11-28T20:56:21Z" - message: Successfully applied resources - observedGeneration: 2 - reason: ApplySucceeded - status: "True" - type: ResourceApplied - selectedResources: - - group: apps - kind: Deployment - name: test-nginx - namespace: test-ns - version: v1 - - kind: Namespace - name: test-ns - version: v1 -``` - -In the `ClusterResourcePlacement` status, `placementStatuses` for `kind-cluster-1` in the `failedPlacements` section, we get a clear message as to why the resource failed to apply on the member cluster. - -To gain more insights also take a look at the `work` object, please check this [section](#how-and-where-to-find-the-correct-work-resource) for more details, - -### Work status: -``` - status: - conditions: - - lastTransitionTime: "2023-11-28T21:07:15Z" - message: Failed to apply work - observedGeneration: 1 - reason: AppliedWorkFailed - status: "False" - type: Applied - manifestConditions: - - conditions: - - lastTransitionTime: "2023-11-28T20:56:16Z" - message: ManifestNoChange - observedGeneration: 1 - reason: ManifestNoChange - status: "True" - type: Applied - identifier: - group: apps - kind: Deployment - name: test-nginx - namespace: test-ns - ordinal: 0 - resource: deployments - version: v1 - - conditions: - - lastTransitionTime: "2023-11-28T20:56:16Z" - message: 'Failed to apply manifest: resource is not managed by the work controller' - reason: AppliedManifestFailedReason - status: "False" - type: Applied - identifier: - kind: Namespace - name: test-ns - ordinal: 1 - resource: namespaces - version: v1 -``` - -From looking at the `Work` status and specifically the `manifestConditions` section, we could see that the namespace could not be applied but the deployment within the namespace got propagated from hub to the member cluster. - -### Resolution: -In this scenario, a potential solution is to delete the existing namespace on the member cluster. However, it's essential to note that this decision rests with the user, as the namespace might already contain resources. - +____ +The order in which the conditions are updated is important for understanding the status of a cluster resource placement and failures encountered. +The order is as follows: +1. `ClusterResourcePlacementScheduled` condition is updated to indicate that a resource has been scheduled for placement. + - If this condition is false, refer to [How can I debug when my CRP status is ClusterResourcePlacementScheduled condition status is set to false?](./clusterResourcePlacementScheduled.md). +2. `ClusterResourcePlacementRolloutStarted` condition is updated to indicate that the rollout process has begun. + - If this condition is false refer to [How can I debug when my CRP status is ClusterResourcePlacementRolloutStarted condition status is set to false?](./clusterResourcePlacementRolloutStarted.md) +3. `ClusterResourcePlacementOverridden` condition is updated to indicate that the resource has been overridden. 
+ - If this condition is false, refer to [How can I debug when my CRP status is ClusterResourcePlacementOverridden condition status is set to false?](./clusterResourcePlacementOverridden.md).
+4. `ClusterResourcePlacementWorkSynchronized` condition is updated to indicate that the work objects have been synchronized.
+ - If this condition is false, refer to [How can I debug when my CRP status is ClusterResourcePlacementWorkSynchronized condition status is set to false?](./clusterResourcePlacementWorkSynchronized.md).
+5. `ClusterResourcePlacementApplied` condition is updated to indicate that the resource has been applied.
+ - If this condition is false, refer to [How can I debug when my CRP ClusterResourcePlacementApplied condition is set to false?](./clusterResourcePlacementApplied.md).
+6. `ClusterResourcePlacementAvailable` condition is updated to indicate that the resource is available.
+ - If this condition is false, refer to [How can I debug when my CRP ClusterResourcePlacementAvailable condition is set to false?](./clusterResourcePlacementAvailable.md).
+
+___
 ## How can I debug when some clusters are not selected as expected?
 Check the status of the `ClusterSchedulingPolicySnapshot` to determine which clusters were selected along with the reason.
@@ -630,8 +37,8 @@ Check the status of the `ClusterSchedulingPolicySnapshot` to determine which clu
 ## How can I debug when a selected cluster does not have the expected resources on it or if CRP doesn't pick up the latest changes?
 Please check the following cases,
-- Check to see if `ClusterResourcePlacementSynchronized` condition in CRP status is set to `true` or `false`.
-- If it's set to `false` check this [question](#how-can-i-debug-when-my-crp-status-is-clusterresourceplacementsynchronized-condition-status-is-set-to-false).
+- Check to see if the `ClusterResourcePlacementRolloutStarted` condition in the CRP status is set to `true` or `false`.
+- If it's set to `false`, check this [question](./clusterResourcePlacementRolloutStarted.md).
 - If it's set to `true`,
 - Check to see if `ClusterResourcePlacementApplied` condition is set to `unknown`, `false` or `true`.
 - If it's set to `unknown`, please wait as the resources are still being applied to the member cluster (if it's stuck in unknown state for a while, please raise a github issue as it's an unexpected behavior).
diff --git a/docs/troubleshooting/clusterResourcePlacementApplied.md b/docs/troubleshooting/clusterResourcePlacementApplied.md
new file mode 100644
index 000000000..f2e3225e5
--- /dev/null
+++ b/docs/troubleshooting/clusterResourcePlacementApplied.md
@@ -0,0 +1,241 @@
+# How can I debug when my CRP ClusterResourcePlacementApplied condition is set to false?
+> Note: In addition, it may be helpful to look into the logs for the [apply work controller](https://github.com/Azure/fleet/blob/main/pkg/controllers/work/apply_controller.go) to get more information on why the resources failed to apply.
+
+### Common scenarios:
+- When the CRP is unable to propagate resources to a selected cluster because the resource already exists on that cluster and is not managed by the fleet controller.
+To remedy this, set `AllowCoOwnership` in the CRP `ApplyStrategy` so that the resource can be managed by the fleet controller.
+- When the CRP is unable to propagate resources to a selected cluster because another CRP already manages the resource for that cluster with a different apply strategy.
+- When the CRP fails to apply a manifest because of syntax errors (which can happen when a resource is propagated through an envelope object) or invalid resource configurations.
+
+### Investigation steps:
+
+1. Check `placementStatuses`: In the `ClusterResourcePlacement` status section, inspect the `placementStatuses` to identify which clusters have the `Applied` condition set to `false` and note down their `clusterName`.
+2. Locate `Work` Object in Hub Cluster: Use the identified `clusterName` to locate the `Work` object associated with the member cluster. Please refer to this [section](#how-and-where-to-find-the-correct-work-resource) to learn how to get the correct `Work` resource.
+3. Check `Work` object status: Inspect the status of the `Work` object to understand the specific issues preventing successful resource application.
+
+### Example Scenario:
+In this example, the `ClusterResourcePlacement` is attempting to propagate a namespace containing a deployment to two member clusters. However, the namespace already exists on one member cluster, specifically named `kind-cluster-1`.
+
+### CRP spec:
+```
+  policy:
+    clusterNames:
+    - kind-cluster-1
+    - kind-cluster-2
+    placementType: PickFixed
+  resourceSelectors:
+  - group: ""
+    kind: Namespace
+    name: test-ns
+    version: v1
+  revisionHistoryLimit: 10
+  strategy:
+    type: RollingUpdate
+```
+
+### CRP status:
+```
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-07T23:32:40Z"
+    message: could not find all the clusters needed as specified by the scheduling
+      policy
+    observedGeneration: 1
+    reason: SchedulingPolicyUnfulfilled
+    status: "False"
+    type: ClusterResourcePlacementScheduled
+  - lastTransitionTime: "2024-05-07T23:32:40Z"
+    message: All 2 cluster(s) start rolling out the latest resource
+    observedGeneration: 1
+    reason: RolloutStarted
+    status: "True"
+    type: ClusterResourcePlacementRolloutStarted
+  - lastTransitionTime: "2024-05-07T23:32:40Z"
+    message: No override rules are configured for the selected resources
+    observedGeneration: 1
+    reason: NoOverrideSpecified
+    status: "True"
+    type: ClusterResourcePlacementOverridden
+  - lastTransitionTime: "2024-05-07T23:32:40Z"
+    message: Works(s) are succcesfully created or updated in the 2 target clusters'
+      namespaces
+    observedGeneration: 1
+    reason: WorkSynchronized
+    status: "True"
+    type: ClusterResourcePlacementWorkSynchronized
+  - lastTransitionTime: "2024-05-07T23:32:40Z"
+    message: Failed to apply resources to 1 clusters, please check the `failedPlacements`
+      status
+    observedGeneration: 1
+    reason: ApplyFailed
+    status: "False"
+    type: ClusterResourcePlacementApplied
+  observedResourceIndex: "0"
+  placementStatuses:
+  - clusterName: kind-cluster-2
+    conditions:
+    - lastTransitionTime: "2024-05-07T23:32:40Z"
+      message: 'Successfully scheduled resources for placement in kind-cluster-2 (affinity
+        score: 0, topology spread score: 0): picked by scheduling policy'
+      observedGeneration: 1
+      reason: Scheduled
+      status: "True"
+      type: Scheduled
+    - lastTransitionTime: "2024-05-07T23:32:40Z"
+      message: Detected the new changes on the resources and started the rollout process
+      observedGeneration: 1
+      reason: RolloutStarted
+      status: "True"
+      type: RolloutStarted
+    - lastTransitionTime: "2024-05-07T23:32:40Z"
+      message: No override rules are configured for the selected resources
+      observedGeneration: 1
+      reason: NoOverrideSpecified
+      status: "True"
+      type: Overridden
+    - lastTransitionTime: "2024-05-07T23:32:40Z"
+      message: All
of the works are synchronized to the latest + observedGeneration: 1 + reason: AllWorkSynced + status: "True" + type: WorkSynchronized + - lastTransitionTime: "2024-05-07T23:32:40Z" + message: All corresponding work objects are applied + observedGeneration: 1 + reason: AllWorkHaveBeenApplied + status: "True" + type: Applied + - lastTransitionTime: "2024-05-07T23:32:49Z" + message: The availability of work object crp-4-work is not trackable + observedGeneration: 1 + reason: WorkNotTrackable + status: "True" + type: Available + - clusterName: kind-cluster-1 + conditions: + - lastTransitionTime: "2024-05-07T23:32:40Z" + message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity + score: 0, topology spread score: 0): picked by scheduling policy' + observedGeneration: 1 + reason: Scheduled + status: "True" + type: Scheduled + - lastTransitionTime: "2024-05-07T23:32:40Z" + message: Detected the new changes on the resources and started the rollout process + observedGeneration: 1 + reason: RolloutStarted + status: "True" + type: RolloutStarted + - lastTransitionTime: "2024-05-07T23:32:40Z" + message: No override rules are configured for the selected resources + observedGeneration: 1 + reason: NoOverrideSpecified + status: "True" + type: Overridden + - lastTransitionTime: "2024-05-07T23:32:40Z" + message: All of the works are synchronized to the latest + observedGeneration: 1 + reason: AllWorkSynced + status: "True" + type: WorkSynchronized + - lastTransitionTime: "2024-05-07T23:32:40Z" + message: Work object crp-4-work is not applied + observedGeneration: 1 + reason: NotAllWorkHaveBeenApplied + status: "False" + type: Applied + failedPlacements: + - condition: + lastTransitionTime: "2024-05-07T23:32:40Z" + message: 'Failed to apply manifest: failed to process the request due to a + client error: resource exists and is not managed by the fleet controller + and co-ownernship is disallowed' + reason: ManifestsAlreadyOwnedByOthers + status: "False" + type: Applied + kind: Namespace + name: test-ns + version: v1 + selectedResources: + - kind: Namespace + name: test-ns + version: v1 + - group: apps + kind: Deployment + name: test-nginx + namespace: test-ns + version: v1 +``` + + +In the `ClusterResourcePlacement` status, within the `failedPlacements` section for `kind-cluster-1`, we get a clear message +as to why the resource failed to apply on the member cluster. Immediately preceding this in the conditions section, +the `Applied` condition for `kind-cluster-1` is flagged as false, citing the `NotAllWorkHaveBeenApplied` reason. +This signifies that the Work object intended for the member cluster `kind-cluster-1` has not been applied. 
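+
+To locate that `Work` object on the hub cluster, a minimal lookup sketch follows. It assumes fleet's convention of
+placing `Work` objects in a per-cluster namespace named `fleet-member-<clusterName>` on the hub cluster; verify the
+namespace name in your environment:
+```
+# List the Work objects created for kind-cluster-1 on the hub cluster
+kubectl get works -n fleet-member-kind-cluster-1
+
+# Inspect the Work object named in the placement status above (crp-4-work in this example)
+kubectl get work crp-4-work -n fleet-member-kind-cluster-1 -o yaml
+```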
+
+To gain more insight, take a look at the `Work` object; please check this [section](#how-and-where-to-find-the-correct-work-resource) for more details.
+
+### Work status of kind-cluster-1:
+```
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-07T23:32:40Z"
+    message: 'Apply manifest {Ordinal:0 Group: Version:v1 Kind:Namespace Resource:namespaces
+      Namespace: Name:test-ns} failed'
+    observedGeneration: 1
+    reason: WorkAppliedFailed
+    status: "False"
+    type: Applied
+  - lastTransitionTime: "2024-05-07T23:32:40Z"
+    message: ""
+    observedGeneration: 1
+    reason: WorkAppliedFailed
+    status: Unknown
+    type: Available
+  manifestConditions:
+  - conditions:
+    - lastTransitionTime: "2024-05-07T23:32:40Z"
+      message: 'Failed to apply manifest: failed to process the request due to a client
+        error: resource exists and is not managed by the fleet controller and co-ownernship
+        is disallowed'
+      reason: ManifestsAlreadyOwnedByOthers
+      status: "False"
+      type: Applied
+    - lastTransitionTime: "2024-05-07T23:32:40Z"
+      message: Manifest is not applied yet
+      reason: ManifestApplyFailed
+      status: Unknown
+      type: Available
+    identifier:
+      kind: Namespace
+      name: test-ns
+      ordinal: 0
+      resource: namespaces
+      version: v1
+  - conditions:
+    - lastTransitionTime: "2024-05-07T23:32:40Z"
+      message: Manifest is already up to date
+      observedGeneration: 1
+      reason: ManifestAlreadyUpToDate
+      status: "True"
+      type: Applied
+    - lastTransitionTime: "2024-05-07T23:32:51Z"
+      message: Manifest is trackable and available now
+      observedGeneration: 1
+      reason: ManifestAvailable
+      status: "True"
+      type: Available
+    identifier:
+      group: apps
+      kind: Deployment
+      name: test-nginx
+      namespace: test-ns
+      ordinal: 1
+      resource: deployments
+      version: v1
+```
+
+Looking at the `Work` status, and specifically at the `manifestConditions` section, we can see that the namespace could not be applied, while the deployment within the namespace was successfully propagated from the hub to the member cluster.
+
+### Resolution:
+In this scenario, a potential solution is to delete the existing namespace on the member cluster. However, it's essential to note that this decision rests with the user, as the namespace might already contain resources.
diff --git a/docs/troubleshooting/clusterResourcePlacementAvailable.md b/docs/troubleshooting/clusterResourcePlacementAvailable.md
new file mode 100644
index 000000000..69b45581e
--- /dev/null
+++ b/docs/troubleshooting/clusterResourcePlacementAvailable.md
@@ -0,0 +1,206 @@
+# How can I debug when my CRP ClusterResourcePlacementAvailable condition is set to false?
+The `ClusterResourcePlacementAvailable` condition is `false` when some of the selected resources are not available yet. Detailed failure information is surfaced in the `failedPlacements` section of the status.
+> Note: In addition, it may be helpful to look into the logs for the [apply work controller](https://github.com/Azure/fleet/blob/main/pkg/controllers/work/apply_controller.go) to get more information on why the resources are not available.
+
+### Common scenarios:
+- When the CRP is unable to make resources available on a selected cluster because the member cluster does not have enough capacity.
+- When the CRP is unable to make a resource available on a selected cluster because the deployment has a bad image name.
+
+### Example Scenario:
+The example output below demonstrates a scenario where a deployment propagated by the CRP cannot become available on the member cluster because the deployment has a bad image name; a hypothetical manifest that could trigger this is sketched below.
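+
+For illustration only, a deployment manifest along these lines could produce this scenario; the image tag here is a
+hypothetical, non-existent one, so the pods never become ready and the work is never reported as available:
+```
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-deployment
+  namespace: test-ns
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: my-deployment
+  template:
+    metadata:
+      labels:
+        app: my-deployment
+    spec:
+      containers:
+      - name: nginx
+        # Hypothetical tag that does not exist in the registry, so the image pull fails
+        image: nginx:1.25-bad-tag
+        ports:
+        - containerPort: 80
+```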
+ +#### CRP spec: +``` +spec: + resourceSelectors: + - group: "" + kind: Namespace + name: test-ns + version: v1 + policy: + placementType: PickN + numberOfClusters: 1 + strategy: + type: RollingUpdate +``` + +#### CRP status: +``` +status: + conditions: + - lastTransitionTime: "2024-05-14T18:52:30Z" + message: found all cluster needed as specified by the scheduling policy, found + 1 cluster(s) + observedGeneration: 1 + reason: SchedulingPolicyFulfilled + status: "True" + type: ClusterResourcePlacementScheduled + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: All 1 cluster(s) start rolling out the latest resource + observedGeneration: 1 + reason: RolloutStarted + status: "True" + type: ClusterResourcePlacementRolloutStarted + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: No override rules are configured for the selected resources + observedGeneration: 1 + reason: NoOverrideSpecified + status: "True" + type: ClusterResourcePlacementOverridden + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: Works(s) are succcesfully created or updated in 1 target cluster(s)' + namespaces + observedGeneration: 1 + reason: WorkSynchronized + status: "True" + type: ClusterResourcePlacementWorkSynchronized + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: The selected resources are successfully applied to 1 cluster(s) + observedGeneration: 1 + reason: ApplySucceeded + status: "True" + type: ClusterResourcePlacementApplied + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: The selected resources in 1 cluster(s) are still not available yet + observedGeneration: 1 + reason: ResourceNotAvailableYet + status: "False" + type: ClusterResourcePlacementAvailable + observedResourceIndex: "0" + placementStatuses: + - clusterName: kind-cluster-1 + conditions: + - lastTransitionTime: "2024-05-14T18:52:30Z" + message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity + score: 0, topology spread score: 0): picked by scheduling policy' + observedGeneration: 1 + reason: Scheduled + status: "True" + type: Scheduled + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: Detected the new changes on the resources and started the rollout process + observedGeneration: 1 + reason: RolloutStarted + status: "True" + type: RolloutStarted + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: No override rules are configured for the selected resources + observedGeneration: 1 + reason: NoOverrideSpecified + status: "True" + type: Overridden + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: All of the works are synchronized to the latest + observedGeneration: 1 + reason: AllWorkSynced + status: "True" + type: WorkSynchronized + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: All corresponding work objects are applied + observedGeneration: 1 + reason: AllWorkHaveBeenApplied + status: "True" + type: Applied + - lastTransitionTime: "2024-05-14T18:52:31Z" + message: Work object crp1-work is not available + observedGeneration: 1 + reason: NotAllWorkAreAvailable + status: "False" + type: Available + failedPlacements: + - condition: + lastTransitionTime: "2024-05-14T18:52:31Z" + message: Manifest is trackable but not available yet + observedGeneration: 1 + reason: ManifestNotAvailableYet + status: "False" + type: Available + group: apps + kind: Deployment + name: my-deployment + namespace: test-ns + version: v1 + selectedResources: + - kind: Namespace + name: test-ns + version: v1 + - group: apps + kind: Deployment + name: my-deployment + namespace: test-ns 
+    version: v1
+```
+In the `ClusterResourcePlacement` status, within the `failedPlacements` section for `kind-cluster-1`, we get a clear message
+as to why the resource is not yet available on the member cluster. Immediately preceding this in the conditions section,
+the `Available` condition for `kind-cluster-1` is flagged as false, citing the `NotAllWorkAreAvailable` reason.
+This signifies that the Work object intended for the member cluster `kind-cluster-1` is not yet available.
+
+To gain more insight, take a look at the `Work` object; please check this [section](#how-and-where-to-find-the-correct-work-resource) for more details.
+
+### Work status of kind-cluster-1:
+```
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-14T18:52:31Z"
+    message: Work is applied successfully
+    observedGeneration: 1
+    reason: WorkAppliedCompleted
+    status: "True"
+    type: Applied
+  - lastTransitionTime: "2024-05-14T18:52:31Z"
+    message: Manifest {Ordinal:1 Group:apps Version:v1 Kind:Deployment Resource:deployments
+      Namespace:test-ns Name:my-deployment} is not available yet
+    observedGeneration: 1
+    reason: WorkNotAvailableYet
+    status: "False"
+    type: Available
+  manifestConditions:
+  - conditions:
+    - lastTransitionTime: "2024-05-14T18:52:31Z"
+      message: Manifest is already up to date
+      reason: ManifestAlreadyUpToDate
+      status: "True"
+      type: Applied
+    - lastTransitionTime: "2024-05-14T18:52:31Z"
+      message: Manifest is trackable and available now
+      reason: ManifestAvailable
+      status: "True"
+      type: Available
+    identifier:
+      kind: Namespace
+      name: test-ns
+      ordinal: 0
+      resource: namespaces
+      version: v1
+  - conditions:
+    - lastTransitionTime: "2024-05-14T18:52:31Z"
+      message: Manifest is already up to date
+      observedGeneration: 1
+      reason: ManifestAlreadyUpToDate
+      status: "True"
+      type: Applied
+    - lastTransitionTime: "2024-05-14T18:52:31Z"
+      message: Manifest is trackable but not available yet
+      observedGeneration: 1
+      reason: ManifestNotAvailableYet
+      status: "False"
+      type: Available
+    identifier:
+      group: apps
+      kind: Deployment
+      name: my-deployment
+      namespace: test-ns
+      ordinal: 1
+      resource: deployments
+      version: v1
+```
+Looking at the `Available` condition for `kind-cluster-1`, we see that the deployment `my-deployment` is not yet available on the member cluster,
+which suggests that something is wrong with the deployment manifest.
+
+#### Resolution:
+In this scenario, start by inspecting the deployment on the member cluster; its status and pod events should reveal the root cause, such as a bad image name.
+Once the issue has been identified, rectify the deployment manifest on the hub cluster and update it; the CRP will then automatically propagate the corrected resource to the member cluster.
+
+For all other scenarios, it's crucial to confirm that the propagated resource is configured correctly.
+Additionally, ensure that the selected cluster possesses sufficient available capacity to accommodate the new resources.
diff --git a/docs/troubleshooting/clusterResourcePlacementOverridden.md b/docs/troubleshooting/clusterResourcePlacementOverridden.md
new file mode 100644
index 000000000..736c26fcb
--- /dev/null
+++ b/docs/troubleshooting/clusterResourcePlacementOverridden.md
@@ -0,0 +1,159 @@
+# How can I debug when my CRP status is ClusterResourcePlacementOverridden condition status is set to false?
+
+The status of the `ClusterResourcePlacementOverridden` condition is set to `false` when there is an Override API-related issue.
+> Note: In addition, it may be helpful to look into the logs for the overrider controller (which includes the
+> controllers for [ClusterResourceOverride](https://github.com/Azure/fleet/blob/main/pkg/controllers/overrider/clusterresource_controller.go) and
+> [ResourceOverride](https://github.com/Azure/fleet/blob/main/pkg/controllers/overrider/resource_controller.go)) to get more information on why the override did not succeed.
+
+## Common scenarios:
+
+- The `ClusterResourceOverride` or `ResourceOverride` is created with an invalid field path for the resource.
+
+## Example Scenario:
+In the following example, an attempt is made to override the cluster role `secret-reader` that is being propagated by the `ClusterResourcePlacement` to the selected clusters.
+However, the `ClusterResourceOverride` is created with an invalid path for the resource.
+
+### ClusterRole:
+```
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  annotations:
+    kubectl.kubernetes.io/last-applied-configuration: |
+      {"apiVersion":"rbac.authorization.k8s.io/v1","kind":"ClusterRole","metadata":{"annotations":{},"name":"secret-reader"},"rules":[{"apiGroups":[""],"resources":["secrets"],"verbs":["get","watch","list"]}]}
+  creationTimestamp: "2024-05-14T15:36:48Z"
+  name: secret-reader
+  resourceVersion: "81334"
+  uid: 108e6312-3416-49be-aa3d-a665c5df58b4
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - secrets
+  verbs:
+  - get
+  - watch
+  - list
+```
+This is the `ClusterRole` `secret-reader` that is being propagated to the member clusters by the `ClusterResourcePlacement`. Note that it carries no `labels` field in its metadata.
+
+### ClusterResourceOverride spec:
+```
+spec:
+  clusterResourceSelectors:
+  - group: rbac.authorization.k8s.io
+    kind: ClusterRole
+    name: secret-reader
+    version: v1
+  policy:
+    overrideRules:
+    - clusterSelector:
+        clusterSelectorTerms:
+        - labelSelector:
+            matchLabels:
+              env: canary
+      jsonPatchOverrides:
+      - op: add
+        path: /metadata/labels/new-label
+        value: new-value
+```
+The `ClusterResourceOverride` is created to override the `ClusterRole` `secret-reader` by adding a new label `new-label`
+with the value `new-value` for the clusters with the label `env: canary`.
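+
+Since the override rule only fires for clusters labeled `env: canary`, it can also help to confirm which member
+clusters actually carry that label. A minimal sketch, assuming your kubeconfig context points at the hub cluster:
+```
+# Show the labels on all member clusters registered with the fleet
+kubectl get memberclusters --show-labels
+```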
+
+### CRP Spec:
+```
+spec:
+  resourceSelectors:
+  - group: rbac.authorization.k8s.io
+    kind: ClusterRole
+    name: secret-reader
+    version: v1
+  policy:
+    placementType: PickN
+    numberOfClusters: 1
+    affinity:
+      clusterAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          clusterSelectorTerms:
+          - labelSelector:
+              matchLabels:
+                env: canary
+  strategy:
+    type: RollingUpdate
+    applyStrategy:
+      allowCoOwnership: true
+```
+
+### CRP Status:
+```
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-14T16:16:18Z"
+    message: found all cluster needed as specified by the scheduling policy, found
+      1 cluster(s)
+    observedGeneration: 1
+    reason: SchedulingPolicyFulfilled
+    status: "True"
+    type: ClusterResourcePlacementScheduled
+  - lastTransitionTime: "2024-05-14T16:16:18Z"
+    message: All 1 cluster(s) start rolling out the latest resource
+    observedGeneration: 1
+    reason: RolloutStarted
+    status: "True"
+    type: ClusterResourcePlacementRolloutStarted
+  - lastTransitionTime: "2024-05-14T16:16:18Z"
+    message: Failed to override resources in 1 cluster(s)
+    observedGeneration: 1
+    reason: OverriddenFailed
+    status: "False"
+    type: ClusterResourcePlacementOverridden
+  observedResourceIndex: "0"
+  placementStatuses:
+  - applicableClusterResourceOverrides:
+    - cro-1-0
+    clusterName: kind-cluster-1
+    conditions:
+    - lastTransitionTime: "2024-05-14T16:16:18Z"
+      message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity
+        score: 0, topology spread score: 0): picked by scheduling policy'
+      observedGeneration: 1
+      reason: Scheduled
+      status: "True"
+      type: Scheduled
+    - lastTransitionTime: "2024-05-14T16:16:18Z"
+      message: Detected the new changes on the resources and started the rollout process
+      observedGeneration: 1
+      reason: RolloutStarted
+      status: "True"
+      type: RolloutStarted
+    - lastTransitionTime: "2024-05-14T16:16:18Z"
+      message: 'Failed to apply the override rules on the resources: add operation
+        does not apply: doc is missing path: "/metadata/labels/new-label": missing
+        value'
+      observedGeneration: 1
+      reason: OverriddenFailed
+      status: "False"
+      type: Overridden
+  selectedResources:
+  - group: rbac.authorization.k8s.io
+    kind: ClusterRole
+    name: secret-reader
+    version: v1
+```
+The CRP attempted to override a propagated resource using an applicable `ClusterResourceOverrideSnapshot`.
+However, since the `ClusterResourcePlacementOverridden` condition remains `false`, looking at the placement status for the cluster
+where the `Overridden` condition failed offers insight into the exact cause of the failure.
+The accompanying message highlights that the override failed because the path `/metadata/labels/new-label` and its corresponding value are missing.
+From the earlier `ClusterRole` `secret-reader` output, it's evident that the resource has no `labels` field in its metadata,
+so the path `/metadata/labels/new-label` cannot be resolved and a new label cannot be added.
+
+### Resolution:
+The solution here is to correct the path and value in the `ClusterResourceOverride` so that the `ClusterRole` `secret-reader` is overridden successfully, as shown below:
+```
+jsonPatchOverrides:
+  - op: add
+    path: /metadata/labels
+    value:
+      newlabel: new-value
+```
+This adds the `labels` field with the entry `newlabel: new-value`, successfully applying the new label to the `ClusterRole` `secret-reader`. A quick verification sketch follows.
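+
+As a verification sketch, assuming kubeconfig contexts named `hub` and `kind-cluster-1` (hypothetical names), you can
+confirm whether the target path exists before the fix and whether the label lands after it:
+```
+# On the hub: an empty result confirms the ClusterRole has no labels, so /metadata/labels/new-label cannot resolve
+kubectl --context hub get clusterrole secret-reader -o jsonpath='{.metadata.labels}'
+
+# On a canary member cluster, after the corrected override rolls out, the new label should appear
+kubectl --context kind-cluster-1 get clusterrole secret-reader --show-labels
+```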
\ No newline at end of file
diff --git a/docs/troubleshooting/clusterResourcePlacementRolloutStarted.md b/docs/troubleshooting/clusterResourcePlacementRolloutStarted.md
new file mode 100644
index 000000000..836a1edeb
--- /dev/null
+++ b/docs/troubleshooting/clusterResourcePlacementRolloutStarted.md
@@ -0,0 +1,351 @@
+# How can I debug when my CRP status is ClusterResourcePlacementRolloutStarted condition status is set to false?
+
+The `ClusterResourcePlacementRolloutStarted` condition status is set to `false` when the selected resources have not yet been rolled out to all scheduled clusters.
+> Note: In addition, it may be helpful to look into the logs for the [rollout controller](https://github.com/Azure/fleet/blob/main/pkg/controllers/rollout/controller.go) to get more information on why the rollout did not start.
+
+## Common scenarios:
+
+- The CRP rollout is blocked because the `rollingUpdate` configuration is too strict.
+
+### Investigation Steps:
+
+- In the `ClusterResourcePlacement` status section, examine the `placementStatuses` to identify clusters with the `RolloutStarted` status set to `false`.
+- Locate the corresponding `ClusterResourceBinding` for the identified cluster. Please check this [section](#how-to-find-the-latest-clusterresourcebinding-resource) to learn how to get the latest `ClusterResourceBinding`. This resource indicates whether the `Work` was created or updated.
+- A common cause of this issue is a `rollingUpdate` configuration that is too strict. Verify the values for `maxUnavailable` and `maxSurge` to ensure they align with your expectations; a relaxed example is sketched after the scenario introduction below.
+
+### Example Scenario:
+
+In the following example, an attempt is made to propagate a namespace to three member clusters. However, during the initial creation of the `ClusterResourcePlacement`, the namespace doesn't exist on the hub cluster, and the fleet currently comprises two member clusters, named `kind-cluster-1` and `kind-cluster-2`.
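+
+Before walking through the scenario, here is the relaxed `rollingUpdate` sketch referenced above (the numbers are
+illustrative; tune them to the size of your fleet):
+```
+spec:
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      # Allow up to 2 selected clusters to be unavailable during a rollout
+      maxUnavailable: 2
+      # Allow 1 extra cluster to receive the new resources beyond the target number
+      maxSurge: 1
+```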
+ +### CRP spec: +``` +spec: + policy: + numberOfClusters: 3 + placementType: PickN + resourceSelectors: + - group: "" + kind: Namespace + name: test-ns + version: v1 + revisionHistoryLimit: 10 + strategy: + type: RollingUpdate +``` + +### CRP status: +``` +status: + conditions: + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: could not find all the clusters needed as specified by the scheduling + policy + observedGeneration: 1 + reason: SchedulingPolicyUnfulfilled + status: "False" + type: ClusterResourcePlacementScheduled + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: All 2 cluster(s) start rolling out the latest resource + observedGeneration: 1 + reason: RolloutStarted + status: "True" + type: ClusterResourcePlacementRolloutStarted + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: No override rules are configured for the selected resources + observedGeneration: 1 + reason: NoOverrideSpecified + status: "True" + type: ClusterResourcePlacementOverridden + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: Works(s) are succcesfully created or updated in the 2 target clusters' + namespaces + observedGeneration: 1 + reason: WorkSynchronized + status: "True" + type: ClusterResourcePlacementWorkSynchronized + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: The selected resources are successfully applied to 2 clusters + observedGeneration: 1 + reason: ApplySucceeded + status: "True" + type: ClusterResourcePlacementApplied + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: The selected resources in 2 cluster are available now + observedGeneration: 1 + reason: ResourceAvailable + status: "True" + type: ClusterResourcePlacementAvailable + observedResourceIndex: "0" + placementStatuses: + - clusterName: kind-cluster-2 + conditions: + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: 'Successfully scheduled resources for placement in kind-cluster-2 (affinity + score: 0, topology spread score: 0): picked by scheduling policy' + observedGeneration: 1 + reason: Scheduled + status: "True" + type: Scheduled + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: Detected the new changes on the resources and started the rollout process + observedGeneration: 1 + reason: RolloutStarted + status: "True" + type: RolloutStarted + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: No override rules are configured for the selected resources + observedGeneration: 1 + reason: NoOverrideSpecified + status: "True" + type: Overridden + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: All of the works are synchronized to the latest + observedGeneration: 1 + reason: AllWorkSynced + status: "True" + type: WorkSynchronized + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: All corresponding work objects are applied + observedGeneration: 1 + reason: AllWorkHaveBeenApplied + status: "True" + type: Applied + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: All corresponding work objects are available + observedGeneration: 1 + reason: AllWorkAreAvailable + status: "True" + type: Available + - clusterName: kind-cluster-1 + conditions: + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity + score: 0, topology spread score: 0): picked by scheduling policy' + observedGeneration: 1 + reason: Scheduled + status: "True" + type: Scheduled + - lastTransitionTime: "2024-05-07T23:08:53Z" + message: Detected the new changes on the resources and started the rollout process + 
observedGeneration: 1
+      reason: RolloutStarted
+      status: "True"
+      type: RolloutStarted
+    - lastTransitionTime: "2024-05-07T23:08:53Z"
+      message: No override rules are configured for the selected resources
+      observedGeneration: 1
+      reason: NoOverrideSpecified
+      status: "True"
+      type: Overridden
+    - lastTransitionTime: "2024-05-07T23:08:53Z"
+      message: All of the works are synchronized to the latest
+      observedGeneration: 1
+      reason: AllWorkSynced
+      status: "True"
+      type: WorkSynchronized
+    - lastTransitionTime: "2024-05-07T23:08:53Z"
+      message: All corresponding work objects are applied
+      observedGeneration: 1
+      reason: AllWorkHaveBeenApplied
+      status: "True"
+      type: Applied
+    - lastTransitionTime: "2024-05-07T23:08:53Z"
+      message: All corresponding work objects are available
+      observedGeneration: 1
+      reason: AllWorkAreAvailable
+      status: "True"
+      type: Available
+```
+
+Given that the `test-ns` namespace never existed on the hub cluster, the `ClusterResourcePlacement` status reflects the following:
+- `ClusterResourcePlacementScheduled` is set to `false`, as the specified policy aims to pick three clusters, but the scheduler can only accommodate placement in the two currently available and joined clusters.
+- `ClusterResourcePlacementRolloutStarted` is set to `true`, as the rollout process has started with the 2 selected clusters.
+- `ClusterResourcePlacementOverridden` is set to `true`, as no override rules are configured for the selected resources.
+- `ClusterResourcePlacementWorkSynchronized` is set to `true`.
+- `ClusterResourcePlacementApplied` is set to `true`.
+- `ClusterResourcePlacementAvailable` is set to `true`.
+
+Next, we create the `test-ns` namespace on the hub cluster and expect the namespace to propagate to the selected clusters.
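+
+For instance, the namespace can be created on the hub cluster as follows (a minimal sketch, assuming `kubectl` is pointed at the hub cluster's context):
+```
+kubectl create namespace test-ns
+```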
+
+### CRP status after namespace test-ns is created on the hub cluster:
+```
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-07T23:08:53Z"
+    message: could not find all the clusters needed as specified by the scheduling
+      policy
+    observedGeneration: 1
+    reason: SchedulingPolicyUnfulfilled
+    status: "False"
+    type: ClusterResourcePlacementScheduled
+  - lastTransitionTime: "2024-05-07T23:13:51Z"
+    message: The rollout is being blocked by the rollout strategy in 2 cluster(s)
+    observedGeneration: 1
+    reason: RolloutNotStartedYet
+    status: "False"
+    type: ClusterResourcePlacementRolloutStarted
+  observedResourceIndex: "1"
+  placementStatuses:
+  - clusterName: kind-cluster-2
+    conditions:
+    - lastTransitionTime: "2024-05-07T23:08:53Z"
+      message: 'Successfully scheduled resources for placement in kind-cluster-2 (affinity
+        score: 0, topology spread score: 0): picked by scheduling policy'
+      observedGeneration: 1
+      reason: Scheduled
+      status: "True"
+      type: Scheduled
+    - lastTransitionTime: "2024-05-07T23:13:51Z"
+      message: The rollout is being blocked by the rollout strategy
+      observedGeneration: 1
+      reason: RolloutNotStartedYet
+      status: "False"
+      type: RolloutStarted
+  - clusterName: kind-cluster-1
+    conditions:
+    - lastTransitionTime: "2024-05-07T23:08:53Z"
+      message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity
+        score: 0, topology spread score: 0): picked by scheduling policy'
+      observedGeneration: 1
+      reason: Scheduled
+      status: "True"
+      type: Scheduled
+    - lastTransitionTime: "2024-05-07T23:13:51Z"
+      message: The rollout is being blocked by the rollout strategy
+      observedGeneration: 1
+      reason: RolloutNotStartedYet
+      status: "False"
+      type: RolloutStarted
+  selectedResources:
+  - kind: Namespace
+    name: test-ns
+    version: v1
+```
+
+Upon examination, the `ClusterResourcePlacementScheduled` condition is still `false`, and the
+`ClusterResourcePlacementRolloutStarted` condition now reports `The rollout is being blocked by the rollout strategy in 2 cluster(s)`.
+
+Let's check the latest `ClusterResourceSnapshot`. Please refer to this [section](#how-to-find-the-latest-clusterresourcesnapshot-resource) to learn how to get the latest `ClusterResourceSnapshot`.
+
+### Latest ClusterResourceSnapshot:
+```
+apiVersion: placement.kubernetes-fleet.io/v1
+kind: ClusterResourceSnapshot
+metadata:
+  annotations:
+    kubernetes-fleet.io/number-of-enveloped-object: "0"
+    kubernetes-fleet.io/number-of-resource-snapshots: "1"
+    kubernetes-fleet.io/resource-hash: 72344be6e268bc7af29d75b7f0aad588d341c228801aab50d6f9f5fc33dd9c7c
+  creationTimestamp: "2024-05-07T23:13:51Z"
+  generation: 1
+  labels:
+    kubernetes-fleet.io/is-latest-snapshot: "true"
+    kubernetes-fleet.io/parent-CRP: crp-3
+    kubernetes-fleet.io/resource-index: "1"
+  name: crp-3-1-snapshot
+  ownerReferences:
+  - apiVersion: placement.kubernetes-fleet.io/v1beta1
+    blockOwnerDeletion: true
+    controller: true
+    kind: ClusterResourcePlacement
+    name: crp-3
+    uid: b4f31b9a-971a-480d-93ac-93f093ee661f
+  resourceVersion: "14434"
+  uid: 85ee0e81-92c9-4362-932b-b0bf57d78e3f
+spec:
+  selectedResources:
+  - apiVersion: v1
+    kind: Namespace
+    metadata:
+      labels:
+        kubernetes.io/metadata.name: test-ns
+      name: test-ns
+    spec:
+      finalizers:
+      - kubernetes
+```
+
+Upon inspecting the `ClusterResourceSnapshot` spec, we observe that the `selectedResources` section now includes the namespace `test-ns`.
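+
+To retrieve this latest snapshot yourself, a label-based query along these lines should work (a sketch; the label keys match the snapshot metadata above, and `crp-3` is this example's CRP name):
+```
+kubectl get clusterresourcesnapshots -l kubernetes-fleet.io/parent-CRP=crp-3,kubernetes-fleet.io/is-latest-snapshot=true -o yaml
+```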
+
+Let's check the `ClusterResourceBinding` for `kind-cluster-1` to see if it got updated after the namespace `test-ns` was created. Please check this [section](#how-to-find-the-latest-clusterresourcebinding-resource) to learn how to get the latest `ClusterResourceBinding`.
+
+### ClusterResourceBinding for kind-cluster-1:
+```
+apiVersion: placement.kubernetes-fleet.io/v1
+kind: ClusterResourceBinding
+metadata:
+  creationTimestamp: "2024-05-07T23:08:53Z"
+  finalizers:
+  - kubernetes-fleet.io/work-cleanup
+  generation: 2
+  labels:
+    kubernetes-fleet.io/parent-CRP: crp-3
+  name: crp-3-kind-cluster-1-7114c253
+  resourceVersion: "14438"
+  uid: 0db4e480-8599-4b40-a1cc-f33bcb24b1a7
+spec:
+  applyStrategy:
+    type: ClientSideApply
+  clusterDecision:
+    clusterName: kind-cluster-1
+    clusterScore:
+      affinityScore: 0
+      priorityScore: 0
+    reason: picked by scheduling policy
+    selected: true
+  resourceSnapshotName: crp-3-0-snapshot
+  schedulingPolicySnapshotName: crp-3-0
+  state: Bound
+  targetCluster: kind-cluster-1
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-07T23:13:51Z"
+    message: The resources cannot be updated to the latest because of the rollout
+      strategy
+    observedGeneration: 2
+    reason: RolloutNotStartedYet
+    status: "False"
+    type: RolloutStarted
+  - lastTransitionTime: "2024-05-07T23:08:53Z"
+    message: No override rules are configured for the selected resources
+    observedGeneration: 2
+    reason: NoOverrideSpecified
+    status: "True"
+    type: Overridden
+  - lastTransitionTime: "2024-05-07T23:08:53Z"
+    message: All of the works are synchronized to the latest
+    observedGeneration: 2
+    reason: AllWorkSynced
+    status: "True"
+    type: WorkSynchronized
+  - lastTransitionTime: "2024-05-07T23:08:53Z"
+    message: All corresponding work objects are applied
+    observedGeneration: 2
+    reason: AllWorkHaveBeenApplied
+    status: "True"
+    type: Applied
+  - lastTransitionTime: "2024-05-07T23:08:53Z"
+    message: All corresponding work objects are available
+    observedGeneration: 2
+    reason: AllWorkAreAvailable
+    status: "True"
+    type: Available
+```
+
+Upon inspection, the `ClusterResourceBinding` remains unchanged: in its spec, `resourceSnapshotName` still references the old `ClusterResourceSnapshot` name, `crp-3-0-snapshot`.
+
+This happens because no explicit `rollingUpdate` configuration was provided, so the default values apply:
+
+- `maxUnavailable` is configured to 25% of the desired number of clusters (25% * 3), rounded to 1
+- `maxSurge` is configured to 25% of the desired number of clusters (25% * 3), rounded to 1
+
+### Summary of Events:
+1. Initially, when the CRP was created, two `ClusterResourceBindings` were generated. Since the `test-ns`
+   namespace did not yet exist on the hub cluster, there was effectively nothing to roll out, so the rollout started trivially on the two selected clusters and `ClusterResourcePlacementRolloutStarted` was set to `true`.
+2. Upon creating the `test-ns` namespace on the hub, the rollout controller attempted to update the two existing `ClusterResourceBindings`.
+   However, the `rollingUpdate` configuration was too strict: with three desired clusters and only two joined, one cluster was already counted as unavailable, consuming the entire `maxUnavailable` budget of 1.
+   Updating either binding could make one more cluster unavailable if the apply failed, which would violate the `rollingUpdate` configuration, so neither update could proceed.
+
+### Resolution:
+- To address this specific issue, consider manually setting `maxUnavailable` to a value greater than 2 to relax the `rollingUpdate` configuration (see the sketch below).
+- Alternatively, you can also join a third member cluster.
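+
+For the first option, relaxing the strategy could be done with a patch along these lines (a sketch; the `spec.strategy.rollingUpdate.maxUnavailable` path is assumed from the rollout strategy described above, and `3` is just one value that satisfies the guidance):
+```
+kubectl patch clusterresourceplacement crp-3 --type merge -p '{"spec":{"strategy":{"rollingUpdate":{"maxUnavailable":3}}}}'
+```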
diff --git a/docs/troubleshooting/clusterResourcePlacementScheduled.md b/docs/troubleshooting/clusterResourcePlacementScheduled.md
new file mode 100644
index 000000000..0a69e587f
--- /dev/null
+++ b/docs/troubleshooting/clusterResourcePlacementScheduled.md
@@ -0,0 +1,199 @@
+# How can I debug when my CRP's ClusterResourcePlacementScheduled condition status is set to false?
+The `ClusterResourcePlacementScheduled` condition is set to `false` when the scheduler cannot find all the clusters needed as specified by the scheduling policy.
+> Note: In addition, it may be helpful to look into the logs for the [scheduler](https://github.com/Azure/fleet/blob/main/pkg/scheduler/scheduler.go) to get more information on why the scheduling failed.
+
+## Common scenarios:
+
+Instances where this condition may arise:
+
+- When the placement policy is set to `PickFixed`, but the specified cluster names do not match any joined member cluster name in the fleet, or the specified cluster is no longer connected to the fleet.
+- When the placement policy is set to `PickN`, and N clusters are specified, but fewer than N clusters have joined the fleet or satisfy the placement policy.
+- When the CRP resource selector selects a reserved namespace.
+
+> Note: When the placement policy is set to `PickAll`, the `ClusterResourcePlacementScheduled` condition is always set to `true`.
+
+### Example Scenario:
+
+The example output below demonstrates a `ClusterResourcePlacement` with a `PickN` placement policy attempting to propagate resources to two clusters labeled `env:prod`. In this instance, two clusters, namely `kind-cluster-1` and `kind-cluster-2`, are joined to the fleet, but only one member cluster, `kind-cluster-1`, has the label `env:prod`.
+
+### CRP spec:
+```
+spec:
+  policy:
+    affinity:
+      clusterAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          clusterSelectorTerms:
+          - labelSelector:
+              matchLabels:
+                env: prod
+    numberOfClusters: 2
+    placementType: PickN
+  resourceSelectors:
+    ...
+  revisionHistoryLimit: 10
+  strategy:
+    type: RollingUpdate
+```
+
+### CRP status:
+```
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-07T22:36:33Z"
+    message: could not find all the clusters needed as specified by the scheduling
+      policy
+    observedGeneration: 1
+    reason: SchedulingPolicyUnfulfilled
+    status: "False"
+    type: ClusterResourcePlacementScheduled
+  - lastTransitionTime: "2024-05-07T22:36:33Z"
+    message: All 1 cluster(s) start rolling out the latest resource
+    observedGeneration: 1
+    reason: RolloutStarted
+    status: "True"
+    type: ClusterResourcePlacementRolloutStarted
+  - lastTransitionTime: "2024-05-07T22:36:33Z"
+    message: No override rules are configured for the selected resources
+    observedGeneration: 1
+    reason: NoOverrideSpecified
+    status: "True"
+    type: ClusterResourcePlacementOverridden
+  - lastTransitionTime: "2024-05-07T22:36:33Z"
+    message: Works(s) are succcesfully created or updated in the 1 target clusters'
+      namespaces
+    observedGeneration: 1
+    reason: WorkSynchronized
+    status: "True"
+    type: ClusterResourcePlacementWorkSynchronized
+  - lastTransitionTime: "2024-05-07T22:36:33Z"
+    message: The selected resources are successfully applied to 1 clusters
+    observedGeneration: 1
+    reason: ApplySucceeded
+    status: "True"
+    type: ClusterResourcePlacementApplied
+  - lastTransitionTime: "2024-05-07T22:36:33Z"
+    message: The selected resources in 1 cluster are available now
+    observedGeneration: 1
+    reason: ResourceAvailable
+    status: "True"
+    type: ClusterResourcePlacementAvailable
+  observedResourceIndex: "0"
+  placementStatuses:
+  - clusterName: kind-cluster-1
+    conditions:
+    - lastTransitionTime: "2024-05-07T22:36:33Z"
+      message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity
+        score: 0, topology spread score: 0): picked by scheduling policy'
+      observedGeneration: 1
+      reason: Scheduled
+      status: "True"
+      type: Scheduled
+    - lastTransitionTime: "2024-05-07T22:36:33Z"
+      message: Detected the new changes on the resources and started the rollout process
+      observedGeneration: 1
+      reason: RolloutStarted
+      status: "True"
+      type: RolloutStarted
+    - lastTransitionTime: "2024-05-07T22:36:33Z"
+      message: No override rules are configured for the selected resources
+      observedGeneration: 1
+      reason: NoOverrideSpecified
+      status: "True"
+      type: Overridden
+    - lastTransitionTime: "2024-05-07T22:36:33Z"
+      message: All of the works are synchronized to the latest
+      observedGeneration: 1
+      reason: AllWorkSynced
+      status: "True"
+      type: WorkSynchronized
+    - lastTransitionTime: "2024-05-07T22:36:33Z"
+      message: All corresponding work objects are applied
+      observedGeneration: 1
+      reason: AllWorkHaveBeenApplied
+      status: "True"
+      type: Applied
+    - lastTransitionTime: "2024-05-07T22:36:33Z"
+      message: All corresponding work objects are available
+      observedGeneration: 1
+      reason: AllWorkAreAvailable
+      status: "True"
+      type: Available
+  - conditions:
+    - lastTransitionTime: "2024-05-07T22:36:33Z"
+      message: 'kind-cluster-2 is not selected: ClusterUnschedulable, cluster does not
+        match with any of the required cluster affinity terms'
+      observedGeneration: 1
+      reason: ScheduleFailed
+      status: "False"
+      type: Scheduled
+  selectedResources:
+    ...
+```
+
+The `ClusterResourcePlacementScheduled` condition is set to `false` because the goal is to select two clusters with the label `env:prod`, but only one member cluster possesses the required label, as specified in `clusterAffinity`.
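+
+To verify the member clusters' labels directly, a query like this can be used (a sketch; it assumes `kubectl` is pointed at the hub cluster and that the `MemberCluster` resources are listable as `memberclusters`):
+```
+kubectl get memberclusters --show-labels
+```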
+
+We can also take a look at the `ClusterSchedulingPolicySnapshot` status to figure out why the scheduler could not satisfy the specified placement policy.
+
+The corresponding `ClusterSchedulingPolicySnapshot` spec and status give us even more information on why scheduling failed. Please refer to this [section](#how-to-find--verify-the-latest-clusterschedulingpolicysnapshot-for-a-crp) to learn how to get the latest `ClusterSchedulingPolicySnapshot`.
+
+### Latest ClusterSchedulingPolicySnapshot:
+```
+apiVersion: placement.kubernetes-fleet.io/v1
+kind: ClusterSchedulingPolicySnapshot
+metadata:
+  annotations:
+    kubernetes-fleet.io/CRP-generation: "1"
+    kubernetes-fleet.io/number-of-clusters: "2"
+  creationTimestamp: "2024-05-07T22:36:33Z"
+  generation: 1
+  labels:
+    kubernetes-fleet.io/is-latest-snapshot: "true"
+    kubernetes-fleet.io/parent-CRP: crp-2
+    kubernetes-fleet.io/policy-index: "0"
+  name: crp-2-0
+  ownerReferences:
+  - apiVersion: placement.kubernetes-fleet.io/v1beta1
+    blockOwnerDeletion: true
+    controller: true
+    kind: ClusterResourcePlacement
+    name: crp-2
+    uid: 48bc1e92-a8b9-4450-a2d5-c6905df2cbf0
+  resourceVersion: "10090"
+  uid: 2137887e-45fd-4f52-bbb7-b96f39854625
+spec:
+  policy:
+    affinity:
+      clusterAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          clusterSelectorTerms:
+          - labelSelector:
+              matchLabels:
+                env: prod
+    placementType: PickN
+  policyHash: ZjE0Yjk4YjYyMTVjY2U3NzQ1MTZkNWRhZjRiNjQ1NzQ4NjllNTUyMzZkODBkYzkyYmRkMGU3OTI3MWEwOTkyNQ==
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-07T22:36:33Z"
+    message: could not find all the clusters needed as specified by the scheduling
+      policy
+    observedGeneration: 1
+    reason: SchedulingPolicyUnfulfilled
+    status: "False"
+    type: Scheduled
+  observedCRPGeneration: 1
+  targetClusters:
+  - clusterName: kind-cluster-1
+    clusterScore:
+      affinityScore: 0
+      priorityScore: 0
+    reason: picked by scheduling policy
+    selected: true
+  - clusterName: kind-cluster-2
+    reason: ClusterUnschedulable, cluster does not match with any of the required
+      cluster affinity terms
+    selected: false
+```
+
+### Resolution:
+The solution here is to add the `env:prod` label to the member cluster resource for `kind-cluster-2` as well, so that the scheduler can also select it when propagating resources.
\ No newline at end of file
diff --git a/docs/troubleshooting/clusterResourcePlacementWorkSynchronized.md b/docs/troubleshooting/clusterResourcePlacementWorkSynchronized.md
new file mode 100644
index 000000000..557790364
--- /dev/null
+++ b/docs/troubleshooting/clusterResourcePlacementWorkSynchronized.md
@@ -0,0 +1,115 @@
+# How can I debug when my CRP's ClusterResourcePlacementWorkSynchronized condition status is set to false?
+
+The `ClusterResourcePlacementWorkSynchronized` condition is set to `false` when the CRP has been recently updated but the associated work objects have not yet been synchronized with the changes.
+> Note: In addition, it may be helpful to look into the logs for the [work generator controller](https://github.com/Azure/fleet/blob/main/pkg/controllers/workgenerator/controller.go) to get more information on why the work synchronization failed.
+
+## Common scenarios:
+- A `ClusterResourceOverride` or `ResourceOverride`, if used, was created with an invalid value for the resource.
+- The CRP is unable to propagate resources to a selected cluster because that cluster is being terminated (see the sketch below for how to check this on the hub cluster).
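+
+On the hub cluster, you can check whether the per-cluster namespace is terminating and what state the `Work` objects are in (a sketch; the `fleet-member-<cluster name>` namespace pattern and the `works.placement.kubernetes-fleet.io` resource are taken from the error message shown in the example below):
+```
+kubectl get namespace fleet-member-kind-cluster-1
+kubectl get works.placement.kubernetes-fleet.io -n fleet-member-kind-cluster-1
+```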
+
+### Example Scenario:
+The CRP is attempting to propagate a resource to a selected cluster, but the work object has not been updated to reflect the latest changes because the selected cluster is being terminated.
+
+### CRP spec:
+```
+spec:
+  policy:
+    numberOfClusters: 1
+    placementType: PickN
+  resourceSelectors:
+  - group: ""
+    kind: Namespace
+    name: test-ns
+    version: v1
+  revisionHistoryLimit: 10
+  strategy:
+    type: RollingUpdate
+```
+
+### CRP status:
+```
+spec:
+  policy:
+    numberOfClusters: 1
+    placementType: PickN
+  resourceSelectors:
+  - group: ""
+    kind: Namespace
+    name: test-ns
+    version: v1
+  revisionHistoryLimit: 10
+  strategy:
+    type: RollingUpdate
+status:
+  conditions:
+  - lastTransitionTime: "2024-05-14T18:05:04Z"
+    message: found all cluster needed as specified by the scheduling policy, found
+      1 cluster(s)
+    observedGeneration: 1
+    reason: SchedulingPolicyFulfilled
+    status: "True"
+    type: ClusterResourcePlacementScheduled
+  - lastTransitionTime: "2024-05-14T18:05:05Z"
+    message: All 1 cluster(s) start rolling out the latest resource
+    observedGeneration: 1
+    reason: RolloutStarted
+    status: "True"
+    type: ClusterResourcePlacementRolloutStarted
+  - lastTransitionTime: "2024-05-14T18:05:05Z"
+    message: No override rules are configured for the selected resources
+    observedGeneration: 1
+    reason: NoOverrideSpecified
+    status: "True"
+    type: ClusterResourcePlacementOverridden
+  - lastTransitionTime: "2024-05-14T18:05:05Z"
+    message: There are 1 cluster(s) which have not finished creating or updating work(s)
+      yet
+    observedGeneration: 1
+    reason: WorkNotSynchronizedYet
+    status: "False"
+    type: ClusterResourcePlacementWorkSynchronized
+  observedResourceIndex: "0"
+  placementStatuses:
+  - clusterName: kind-cluster-1
+    conditions:
+    - lastTransitionTime: "2024-05-14T18:05:04Z"
+      message: 'Successfully scheduled resources for placement in kind-cluster-1 (affinity
+        score: 0, topology spread score: 0): picked by scheduling policy'
+      observedGeneration: 1
+      reason: Scheduled
+      status: "True"
+      type: Scheduled
+    - lastTransitionTime: "2024-05-14T18:05:05Z"
+      message: Detected the new changes on the resources and started the rollout process
+      observedGeneration: 1
+      reason: RolloutStarted
+      status: "True"
+      type: RolloutStarted
+    - lastTransitionTime: "2024-05-14T18:05:05Z"
+      message: No override rules are configured for the selected resources
+      observedGeneration: 1
+      reason: NoOverrideSpecified
+      status: "True"
+      type: Overridden
+    - lastTransitionTime: "2024-05-14T18:05:05Z"
+      message: 'Failed to sychronize the work to the latest: works.placement.kubernetes-fleet.io
+        "crp1-work" is forbidden: unable to create new content in namespace fleet-member-kind-cluster-1
+        because it is being terminated'
+      observedGeneration: 1
+      reason: SyncWorkFailed
+      status: "False"
+      type: WorkSynchronized
+  selectedResources:
+  - kind: Namespace
+    name: test-ns
+    version: v1
+```
+The `ClusterResourcePlacementWorkSynchronized` condition in the CRP status is set to `false`. The message shows
+that the work object `crp1-work` is forbidden from creating new content in the namespace `fleet-member-kind-cluster-1`
+because that namespace is being terminated.
+
+### Resolution:
+To address the issue, there are several potential solutions:
+- One option is to update the `ClusterResourcePlacement` (CRP) so that it selects a different cluster.
+- Another option is to delete the CRP so that its work objects are removed through garbage collection (for both options, see the sketch after this list).
+- It's also worth noting that the `fleet-member-kind-cluster-1` namespace is only recreated when the cluster re-joins the fleet, so another potential solution is to re-join the member cluster.
+- In other scenarios, you might opt to wait for the work to finish propagating.
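+
+For the first two options, the commands might look like this (a sketch; the CRP name `crp1` is inferred from the `crp1-work` work object in the example above):
+```
+# Option 1: edit the CRP's placement policy so that it selects a different cluster
+kubectl edit clusterresourceplacement crp1
+# Option 2: delete the CRP; its work objects are then removed by garbage collection
+kubectl delete clusterresourceplacement crp1
+```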