From afce166bd392befe652abebaf1bc7e30445d5357 Mon Sep 17 00:00:00 2001 From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com> Date: Thu, 21 Mar 2024 23:45:52 -0700 Subject: [PATCH] cherrypick(v0.33.x): feat: Add Versioned for NodePool Hash to Prevent Drift on NodePool CRD Upgrade (#1129) --- pkg/apis/v1beta1/labels.go | 1 + pkg/apis/v1beta1/nodepool.go | 6 + pkg/controllers/nodeclaim/disruption/drift.go | 11 +- .../nodeclaim/disruption/drift_test.go | 26 +++- pkg/controllers/nodepool/hash/controller.go | 51 +++++++- pkg/controllers/nodepool/hash/suite_test.go | 111 +++++++++++++++++- .../scheduling/nodeclaimtemplate.go | 7 +- 7 files changed, 202 insertions(+), 11 deletions(-) diff --git a/pkg/apis/v1beta1/labels.go b/pkg/apis/v1beta1/labels.go index 745993e7d1..e8e7902e74 100644 --- a/pkg/apis/v1beta1/labels.go +++ b/pkg/apis/v1beta1/labels.go @@ -44,6 +44,7 @@ const ( ProviderCompatabilityAnnotationKey = CompatabilityGroup + "/provider" ManagedByAnnotationKey = Group + "/managed-by" NodePoolHashAnnotationKey = Group + "/nodepool-hash" + NodePoolHashVersionAnnotationKey = Group + "/nodepool-hash-version" ) // Karpenter specific finalizers diff --git a/pkg/apis/v1beta1/nodepool.go b/pkg/apis/v1beta1/nodepool.go index dd5c48d87e..7804f53e35 100644 --- a/pkg/apis/v1beta1/nodepool.go +++ b/pkg/apis/v1beta1/nodepool.go @@ -139,6 +139,12 @@ type NodePool struct { Status NodePoolStatus `json:"status,omitempty"` } +// We need to bump the NodePoolHashVersion when we make an update to the NodePool CRD under these conditions: +// 1. A field changes its default value for an existing field that is already hashed +// 2. A field is added to the hash calculation with an already-set value +// 3. A field is removed from the hash calculations +const NodePoolHashVersion = "v1" + func (in *NodePool) Hash() string { return fmt.Sprint(lo.Must(hashstructure.Hash(in.Spec.Template, hashstructure.FormatV2, &hashstructure.HashOptions{ SlicesAsSets: true, diff --git a/pkg/controllers/nodeclaim/disruption/drift.go b/pkg/controllers/nodeclaim/disruption/drift.go index 52c6270e57..99cdeef672 100644 --- a/pkg/controllers/nodeclaim/disruption/drift.go +++ b/pkg/controllers/nodeclaim/disruption/drift.go @@ -107,12 +107,19 @@ func (d *Drift) isDrifted(ctx context.Context, nodePool *v1beta1.NodePool, nodeC return driftedReason, nil } -// Eligible fields for static drift are described in the docs +// Eligible fields for drift are described in the docs // https://karpenter.sh/docs/concepts/deprovisioning/#drift func areStaticFieldsDrifted(nodePool *v1beta1.NodePool, nodeClaim *v1beta1.NodeClaim) cloudprovider.DriftReason { nodePoolHash, foundHashNodePool := nodePool.Annotations[v1beta1.NodePoolHashAnnotationKey] + nodePoolVersionHash, foundVersionHashNodePool := nodePool.Annotations[v1beta1.NodePoolHashVersionAnnotationKey] nodeClaimHash, foundHashNodeClaim := nodeClaim.Annotations[v1beta1.NodePoolHashAnnotationKey] - if !foundHashNodePool || !foundHashNodeClaim { + nodeClaimVersionHash, foundVersionHashNodeClaim := nodeClaim.Annotations[v1beta1.NodePoolHashVersionAnnotationKey] + + if !foundHashNodePool || !foundHashNodeClaim || !foundVersionHashNodePool || !foundVersionHashNodeClaim { + return "" + } + // validate that the version of the crd is the same + if nodePoolVersionHash != nodeClaimVersionHash { return "" } return lo.Ternary(nodePoolHash != nodeClaimHash, NodePoolDrifted, "") diff --git a/pkg/controllers/nodeclaim/disruption/drift_test.go b/pkg/controllers/nodeclaim/disruption/drift_test.go index cc76261fe4..6fe67e66a1 100644 --- a/pkg/controllers/nodeclaim/disruption/drift_test.go +++ b/pkg/controllers/nodeclaim/disruption/drift_test.go @@ -120,7 +120,12 @@ var _ = Describe("Drift", func() { It("should detect static drift before cloud provider drift", func() { cp.Drifted = "drifted" nodePool.Annotations = lo.Assign(nodePool.Annotations, map[string]string{ - v1beta1.NodePoolHashAnnotationKey: "123456789", + v1beta1.NodePoolHashAnnotationKey: "test-123456789", + v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion, + }) + nodeClaim.Annotations = lo.Assign(nodeClaim.Annotations, map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "test-123", + v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion, }) ExpectApplied(ctx, env.Client, nodePool, nodeClaim) ExpectReconcileSucceeded(ctx, nodeClaimDisruptionController, client.ObjectKeyFromObject(nodeClaim)) @@ -425,8 +430,9 @@ var _ = Describe("Drift", func() { Template: v1beta1.NodeClaimTemplate{ ObjectMeta: v1beta1.ObjectMeta{ Annotations: map[string]string{ - "keyAnnotation": "valueAnnotation", - "keyAnnotation2": "valueAnnotation2", + "keyAnnotation": "valueAnnotation", + "keyAnnotation2": "valueAnnotation2", + v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion, }, Labels: map[string]string{ "keyLabel": "valueLabel", @@ -493,5 +499,19 @@ var _ = Describe("Drift", func() { nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(nodeClaim.StatusConditions().GetCondition(v1beta1.Drifted)).To(BeNil()) }) + It("should not return drifted if the NodeClaim's karpenter.sh/nodepool-hash-version annotation does not match the NodePool's", func() { + nodePool.ObjectMeta.Annotations = map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "test-hash-1", + v1beta1.NodePoolHashVersionAnnotationKey: "test-version-1", + } + nodeClaim.ObjectMeta.Annotations = map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "test-hash-2", + v1beta1.NodePoolHashVersionAnnotationKey: "test-version-2", + } + ExpectApplied(ctx, env.Client, nodePool, nodeClaim) + ExpectReconcileSucceeded(ctx, nodeClaimDisruptionController, client.ObjectKeyFromObject(nodeClaim)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().GetCondition(v1beta1.Drifted)).To(BeNil()) + }) }) }) diff --git a/pkg/controllers/nodepool/hash/controller.go b/pkg/controllers/nodepool/hash/controller.go index 18dd06577c..bd3fbe3236 100644 --- a/pkg/controllers/nodepool/hash/controller.go +++ b/pkg/controllers/nodepool/hash/controller.go @@ -18,6 +18,7 @@ import ( "context" "github.com/samber/lo" + "go.uber.org/multierr" "k8s.io/apimachinery/pkg/api/equality" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -48,7 +49,16 @@ func NewController(kubeClient client.Client) operatorcontroller.Controller { // Reconcile the resource func (c *Controller) Reconcile(ctx context.Context, np *v1beta1.NodePool) (reconcile.Result, error) { stored := np.DeepCopy() - np.Annotations = lo.Assign(np.Annotations, nodepoolutil.HashAnnotation(np)) + + if np.Annotations[v1beta1.NodePoolHashVersionAnnotationKey] != v1beta1.NodePoolHashVersion { + if err := c.updateNodeClaimHash(ctx, np); err != nil { + return reconcile.Result{}, err + } + } + np.Annotations = lo.Assign(np.Annotations, map[string]string{ + v1beta1.NodePoolHashAnnotationKey: np.Hash(), + v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion, + }) if !equality.Semantic.DeepEqual(stored, np) { if err := nodepoolutil.Patch(ctx, c.kubeClient, stored, np); err != nil { @@ -70,3 +80,42 @@ func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontr WithOptions(controller.Options{MaxConcurrentReconciles: 10}), ) } + +// Updating `nodepool-hash-version` annotation inside the controller means a breaking change has made to the hash function calculating +// `nodepool-hash` on both the NodePool and NodeClaim, automatically making the `nodepool-hash` on the NodeClaim different from +// NodePool. Since we can not rely on the hash on the NodeClaims, we will need to re-calculate the hash and update the annotation. +// Look at designs/drift-hash-version.md for more information. +func (c *Controller) updateNodeClaimHash(ctx context.Context, np *v1beta1.NodePool) error { + ncList := &v1beta1.NodeClaimList{} + if err := c.kubeClient.List(ctx, ncList, client.MatchingLabels(map[string]string{v1beta1.NodePoolLabelKey: np.Name})); err != nil { + return err + } + + errs := make([]error, len(ncList.Items)) + for i := range ncList.Items { + nc := ncList.Items[i] + stored := nc.DeepCopy() + + if nc.Annotations[v1beta1.NodePoolHashVersionAnnotationKey] != v1beta1.NodePoolHashVersion { + nc.Annotations = lo.Assign(nc.Annotations, map[string]string{ + v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion, + }) + + // Any NodeClaim that is already drifted will remain drifted if the karpenter.sh/nodepool-hash-version doesn't match + // Since the hashing mechanism has changed we will not be able to determine if the drifted status of the node has changed + if nc.StatusConditions().GetCondition(v1beta1.Drifted) == nil { + nc.Annotations = lo.Assign(nc.Annotations, map[string]string{ + v1beta1.NodePoolHashAnnotationKey: np.Hash(), + }) + } + + if !equality.Semantic.DeepEqual(stored, nc) { + if err := c.kubeClient.Patch(ctx, &nc, client.MergeFrom(stored)); err != nil { + errs[i] = client.IgnoreNotFound(err) + } + } + } + } + + return multierr.Combine(errs...) +} diff --git a/pkg/controllers/nodepool/hash/suite_test.go b/pkg/controllers/nodepool/hash/suite_test.go index c7f173088c..caac61f2ca 100644 --- a/pkg/controllers/nodepool/hash/suite_test.go +++ b/pkg/controllers/nodepool/hash/suite_test.go @@ -28,9 +28,11 @@ import ( "knative.dev/pkg/ptr" "sigs.k8s.io/controller-runtime/pkg/client" - . "sigs.k8s.io/karpenter/pkg/test/expectations" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/karpenter/pkg/apis" + . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/apis/v1beta1" "sigs.k8s.io/karpenter/pkg/controllers/nodepool/hash" "sigs.k8s.io/karpenter/pkg/operator/controller" @@ -93,7 +95,7 @@ var _ = Describe("Static Drift Hash", func() { }, }) }) - It("should update the static drift hash when NodePool static field is updated", func() { + It("should update the drift hash when NodePool static field is updated", func() { ExpectApplied(ctx, env.Client, nodePool) ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool)) nodePool = ExpectExists(ctx, env.Client, nodePool) @@ -109,7 +111,7 @@ var _ = Describe("Static Drift Hash", func() { expectedHashTwo := nodePool.Hash() Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHashTwo)) }) - It("should not update the static drift hash when NodePool behavior field is updated", func() { + It("should not update the drift hash when NodePool behavior field is updated", func() { ExpectApplied(ctx, env.Client, nodePool) ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool)) nodePool = ExpectExists(ctx, env.Client, nodePool) @@ -132,4 +134,107 @@ var _ = Describe("Static Drift Hash", func() { Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash)) }) + It("should update nodepool hash version when the nodepool hash version is out of sync with the controller hash version", func() { + nodePool.Annotations = map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "abceduefed", + v1beta1.NodePoolHashVersionAnnotationKey: "test", + } + ExpectApplied(ctx, env.Client, nodePool) + + ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool)) + nodePool = ExpectExists(ctx, env.Client, nodePool) + + expectedHash := nodePool.Hash() + Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash)) + Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion)) + }) + It("should update nodepool hash versions on all nodeclaims when the hash versions don't match the controller hash version", func() { + nodePool.Annotations = map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "abceduefed", + v1beta1.NodePoolHashVersionAnnotationKey: "test", + } + nodeClaimOne := test.NodeClaim(v1beta1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name}, + Annotations: map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "123456", + v1beta1.NodePoolHashVersionAnnotationKey: "test", + }, + }, + }) + nodeClaimTwo := test.NodeClaim(v1beta1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name}, + Annotations: map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "123456", + v1beta1.NodePoolHashVersionAnnotationKey: "test", + }, + }, + }) + + ExpectApplied(ctx, env.Client, nodePool, nodeClaimOne, nodeClaimTwo) + + ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool)) + nodePool = ExpectExists(ctx, env.Client, nodePool) + nodeClaimOne = ExpectExists(ctx, env.Client, nodeClaimOne) + nodeClaimTwo = ExpectExists(ctx, env.Client, nodeClaimTwo) + + expectedHash := nodePool.Hash() + Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash)) + Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion)) + Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash)) + Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion)) + }) + It("should not update nodepool hash on all nodeclaims when the hash versions match the controller hash version", func() { + nodePool.Annotations = map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "abceduefed", + v1beta1.NodePoolHashVersionAnnotationKey: "test-version", + } + nodeClaim := test.NodeClaim(v1beta1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name}, + Annotations: map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "1234564654", + v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion, + }, + }, + }) + ExpectApplied(ctx, env.Client, nodePool, nodeClaim) + + ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool)) + nodePool = ExpectExists(ctx, env.Client, nodePool) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + + expectedHash := nodePool.Hash() + + // Expect NodeClaims to have been updated to the original hash + Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash)) + Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion)) + Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, "1234564654")) + Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion)) + }) + It("should not update nodepool hash on the nodeclaim if it's drifted", func() { + nodePool.Annotations = map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "abceduefed", + v1beta1.NodePoolHashVersionAnnotationKey: "test", + } + nodeClaim := test.NodeClaim(v1beta1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name}, + Annotations: map[string]string{ + v1beta1.NodePoolHashAnnotationKey: "123456", + v1beta1.NodePoolHashVersionAnnotationKey: "test", + }, + }, + }) + nodeClaim.StatusConditions().MarkTrue(v1beta1.Drifted) + ExpectApplied(ctx, env.Client, nodePool, nodeClaim) + + ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + + // Expect NodeClaims hash to not have been updated + Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, "123456")) + Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion)) + }) }) diff --git a/pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go b/pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go index 57f6770f10..97db022e5d 100644 --- a/pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go +++ b/pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go @@ -60,8 +60,11 @@ func (i *NodeClaimTemplate) ToNodeClaim(nodePool *v1beta1.NodePool) *v1beta1.Nod nc := &v1beta1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ GenerateName: fmt.Sprintf("%s-", i.NodePoolName), - Annotations: lo.Assign(i.Annotations, map[string]string{v1beta1.NodePoolHashAnnotationKey: nodePool.Hash()}), - Labels: i.Labels, + Annotations: lo.Assign(i.Annotations, map[string]string{ + v1beta1.NodePoolHashAnnotationKey: nodePool.Hash(), + v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion, + }), + Labels: i.Labels, OwnerReferences: []metav1.OwnerReference{ { APIVersion: v1beta1.SchemeGroupVersion.String(),