From afce166bd392befe652abebaf1bc7e30445d5357 Mon Sep 17 00:00:00 2001
From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com>
Date: Thu, 21 Mar 2024 23:45:52 -0700
Subject: [PATCH] cherrypick(v0.33.x): feat: Add Versioned for NodePool Hash to
 Prevent Drift on NodePool CRD Upgrade (#1129)

---
 pkg/apis/v1beta1/labels.go                    |   1 +
 pkg/apis/v1beta1/nodepool.go                  |   6 +
 pkg/controllers/nodeclaim/disruption/drift.go |  11 +-
 .../nodeclaim/disruption/drift_test.go        |  26 +++-
 pkg/controllers/nodepool/hash/controller.go   |  51 +++++++-
 pkg/controllers/nodepool/hash/suite_test.go   | 111 +++++++++++++++++-
 .../scheduling/nodeclaimtemplate.go           |   7 +-
 7 files changed, 202 insertions(+), 11 deletions(-)

diff --git a/pkg/apis/v1beta1/labels.go b/pkg/apis/v1beta1/labels.go
index 745993e7d1..e8e7902e74 100644
--- a/pkg/apis/v1beta1/labels.go
+++ b/pkg/apis/v1beta1/labels.go
@@ -44,6 +44,7 @@ const (
 	ProviderCompatabilityAnnotationKey = CompatabilityGroup + "/provider"
 	ManagedByAnnotationKey             = Group + "/managed-by"
 	NodePoolHashAnnotationKey          = Group + "/nodepool-hash"
+	NodePoolHashVersionAnnotationKey   = Group + "/nodepool-hash-version"
 )
 
 // Karpenter specific finalizers
diff --git a/pkg/apis/v1beta1/nodepool.go b/pkg/apis/v1beta1/nodepool.go
index dd5c48d87e..7804f53e35 100644
--- a/pkg/apis/v1beta1/nodepool.go
+++ b/pkg/apis/v1beta1/nodepool.go
@@ -139,6 +139,12 @@ type NodePool struct {
 	Status NodePoolStatus `json:"status,omitempty"`
 }
 
+// We need to bump the NodePoolHashVersion when we make an update to the NodePool CRD under these conditions:
+// 1. A field changes its default value for an existing field that is already hashed
+// 2. A field is added to the hash calculation with an already-set value
+// 3. A field is removed from the hash calculations
+const NodePoolHashVersion = "v1"
+
 func (in *NodePool) Hash() string {
 	return fmt.Sprint(lo.Must(hashstructure.Hash(in.Spec.Template, hashstructure.FormatV2, &hashstructure.HashOptions{
 		SlicesAsSets:    true,
diff --git a/pkg/controllers/nodeclaim/disruption/drift.go b/pkg/controllers/nodeclaim/disruption/drift.go
index 52c6270e57..99cdeef672 100644
--- a/pkg/controllers/nodeclaim/disruption/drift.go
+++ b/pkg/controllers/nodeclaim/disruption/drift.go
@@ -107,12 +107,19 @@ func (d *Drift) isDrifted(ctx context.Context, nodePool *v1beta1.NodePool, nodeC
 	return driftedReason, nil
 }
 
-// Eligible fields for static drift are described in the docs
+// Eligible fields for drift are described in the docs
 // https://karpenter.sh/docs/concepts/deprovisioning/#drift
 func areStaticFieldsDrifted(nodePool *v1beta1.NodePool, nodeClaim *v1beta1.NodeClaim) cloudprovider.DriftReason {
 	nodePoolHash, foundHashNodePool := nodePool.Annotations[v1beta1.NodePoolHashAnnotationKey]
+	nodePoolVersionHash, foundVersionHashNodePool := nodePool.Annotations[v1beta1.NodePoolHashVersionAnnotationKey]
 	nodeClaimHash, foundHashNodeClaim := nodeClaim.Annotations[v1beta1.NodePoolHashAnnotationKey]
-	if !foundHashNodePool || !foundHashNodeClaim {
+	nodeClaimVersionHash, foundVersionHashNodeClaim := nodeClaim.Annotations[v1beta1.NodePoolHashVersionAnnotationKey]
+
+	if !foundHashNodePool || !foundHashNodeClaim || !foundVersionHashNodePool || !foundVersionHashNodeClaim {
+		return ""
+	}
+	// validate that the version of the crd is the same
+	if nodePoolVersionHash != nodeClaimVersionHash {
 		return ""
 	}
 	return lo.Ternary(nodePoolHash != nodeClaimHash, NodePoolDrifted, "")
diff --git a/pkg/controllers/nodeclaim/disruption/drift_test.go b/pkg/controllers/nodeclaim/disruption/drift_test.go
index cc76261fe4..6fe67e66a1 100644
--- a/pkg/controllers/nodeclaim/disruption/drift_test.go
+++ b/pkg/controllers/nodeclaim/disruption/drift_test.go
@@ -120,7 +120,12 @@ var _ = Describe("Drift", func() {
 	It("should detect static drift before cloud provider drift", func() {
 		cp.Drifted = "drifted"
 		nodePool.Annotations = lo.Assign(nodePool.Annotations, map[string]string{
-			v1beta1.NodePoolHashAnnotationKey: "123456789",
+			v1beta1.NodePoolHashAnnotationKey:        "test-123456789",
+			v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
+		})
+		nodeClaim.Annotations = lo.Assign(nodeClaim.Annotations, map[string]string{
+			v1beta1.NodePoolHashAnnotationKey:        "test-123",
+			v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
 		})
 		ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
 		ExpectReconcileSucceeded(ctx, nodeClaimDisruptionController, client.ObjectKeyFromObject(nodeClaim))
@@ -425,8 +430,9 @@ var _ = Describe("Drift", func() {
 					Template: v1beta1.NodeClaimTemplate{
 						ObjectMeta: v1beta1.ObjectMeta{
 							Annotations: map[string]string{
-								"keyAnnotation":  "valueAnnotation",
-								"keyAnnotation2": "valueAnnotation2",
+								"keyAnnotation":                          "valueAnnotation",
+								"keyAnnotation2":                         "valueAnnotation2",
+								v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
 							},
 							Labels: map[string]string{
 								"keyLabel":  "valueLabel",
@@ -493,5 +499,19 @@ var _ = Describe("Drift", func() {
 			nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
 			Expect(nodeClaim.StatusConditions().GetCondition(v1beta1.Drifted)).To(BeNil())
 		})
+		It("should not return drifted if the NodeClaim's karpenter.sh/nodepool-hash-version annotation does not match the NodePool's", func() {
+			nodePool.ObjectMeta.Annotations = map[string]string{
+				v1beta1.NodePoolHashAnnotationKey:        "test-hash-1",
+				v1beta1.NodePoolHashVersionAnnotationKey: "test-version-1",
+			}
+			nodeClaim.ObjectMeta.Annotations = map[string]string{
+				v1beta1.NodePoolHashAnnotationKey:        "test-hash-2",
+				v1beta1.NodePoolHashVersionAnnotationKey: "test-version-2",
+			}
+			ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
+			ExpectReconcileSucceeded(ctx, nodeClaimDisruptionController, client.ObjectKeyFromObject(nodeClaim))
+			nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+			Expect(nodeClaim.StatusConditions().GetCondition(v1beta1.Drifted)).To(BeNil())
+		})
 	})
 })
diff --git a/pkg/controllers/nodepool/hash/controller.go b/pkg/controllers/nodepool/hash/controller.go
index 18dd06577c..bd3fbe3236 100644
--- a/pkg/controllers/nodepool/hash/controller.go
+++ b/pkg/controllers/nodepool/hash/controller.go
@@ -18,6 +18,7 @@ import (
 	"context"
 
 	"github.com/samber/lo"
+	"go.uber.org/multierr"
 	"k8s.io/apimachinery/pkg/api/equality"
 	controllerruntime "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -48,7 +49,16 @@ func NewController(kubeClient client.Client) operatorcontroller.Controller {
 // Reconcile the resource
 func (c *Controller) Reconcile(ctx context.Context, np *v1beta1.NodePool) (reconcile.Result, error) {
 	stored := np.DeepCopy()
-	np.Annotations = lo.Assign(np.Annotations, nodepoolutil.HashAnnotation(np))
+
+	if np.Annotations[v1beta1.NodePoolHashVersionAnnotationKey] != v1beta1.NodePoolHashVersion {
+		if err := c.updateNodeClaimHash(ctx, np); err != nil {
+			return reconcile.Result{}, err
+		}
+	}
+	np.Annotations = lo.Assign(np.Annotations, map[string]string{
+		v1beta1.NodePoolHashAnnotationKey:        np.Hash(),
+		v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
+	})
 
 	if !equality.Semantic.DeepEqual(stored, np) {
 		if err := nodepoolutil.Patch(ctx, c.kubeClient, stored, np); err != nil {
@@ -70,3 +80,42 @@ func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontr
 		WithOptions(controller.Options{MaxConcurrentReconciles: 10}),
 	)
 }
+
+// Updating `nodepool-hash-version` annotation inside the controller means a breaking change has made to the hash function calculating
+// `nodepool-hash` on both the NodePool and NodeClaim, automatically making the `nodepool-hash` on the NodeClaim different from
+// NodePool. Since we can not rely on the hash on the NodeClaims, we will need to re-calculate the hash and update the annotation.
+// Look at designs/drift-hash-version.md for more information.
+func (c *Controller) updateNodeClaimHash(ctx context.Context, np *v1beta1.NodePool) error {
+	ncList := &v1beta1.NodeClaimList{}
+	if err := c.kubeClient.List(ctx, ncList, client.MatchingLabels(map[string]string{v1beta1.NodePoolLabelKey: np.Name})); err != nil {
+		return err
+	}
+
+	errs := make([]error, len(ncList.Items))
+	for i := range ncList.Items {
+		nc := ncList.Items[i]
+		stored := nc.DeepCopy()
+
+		if nc.Annotations[v1beta1.NodePoolHashVersionAnnotationKey] != v1beta1.NodePoolHashVersion {
+			nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
+				v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
+			})
+
+			// Any NodeClaim that is already drifted will remain drifted if the karpenter.sh/nodepool-hash-version doesn't match
+			// Since the hashing mechanism has changed we will not be able to determine if the drifted status of the node has changed
+			if nc.StatusConditions().GetCondition(v1beta1.Drifted) == nil {
+				nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
+					v1beta1.NodePoolHashAnnotationKey: np.Hash(),
+				})
+			}
+
+			if !equality.Semantic.DeepEqual(stored, nc) {
+				if err := c.kubeClient.Patch(ctx, &nc, client.MergeFrom(stored)); err != nil {
+					errs[i] = client.IgnoreNotFound(err)
+				}
+			}
+		}
+	}
+
+	return multierr.Combine(errs...)
+}
diff --git a/pkg/controllers/nodepool/hash/suite_test.go b/pkg/controllers/nodepool/hash/suite_test.go
index c7f173088c..caac61f2ca 100644
--- a/pkg/controllers/nodepool/hash/suite_test.go
+++ b/pkg/controllers/nodepool/hash/suite_test.go
@@ -28,9 +28,11 @@ import (
 	"knative.dev/pkg/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
-	. "sigs.k8s.io/karpenter/pkg/test/expectations"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	"sigs.k8s.io/karpenter/pkg/apis"
+	. "sigs.k8s.io/karpenter/pkg/test/expectations"
+
 	"sigs.k8s.io/karpenter/pkg/apis/v1beta1"
 	"sigs.k8s.io/karpenter/pkg/controllers/nodepool/hash"
 	"sigs.k8s.io/karpenter/pkg/operator/controller"
@@ -93,7 +95,7 @@ var _ = Describe("Static Drift Hash", func() {
 			},
 		})
 	})
-	It("should update the static drift hash when NodePool static field is updated", func() {
+	It("should update the drift hash when NodePool static field is updated", func() {
 		ExpectApplied(ctx, env.Client, nodePool)
 		ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
 		nodePool = ExpectExists(ctx, env.Client, nodePool)
@@ -109,7 +111,7 @@ var _ = Describe("Static Drift Hash", func() {
 		expectedHashTwo := nodePool.Hash()
 		Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHashTwo))
 	})
-	It("should not update the static drift hash when NodePool behavior field is updated", func() {
+	It("should not update the drift hash when NodePool behavior field is updated", func() {
 		ExpectApplied(ctx, env.Client, nodePool)
 		ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
 		nodePool = ExpectExists(ctx, env.Client, nodePool)
@@ -132,4 +134,107 @@ var _ = Describe("Static Drift Hash", func() {
 
 		Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
 	})
+	It("should update nodepool hash version when the nodepool hash version is out of sync with the controller hash version", func() {
+		nodePool.Annotations = map[string]string{
+			v1beta1.NodePoolHashAnnotationKey:        "abceduefed",
+			v1beta1.NodePoolHashVersionAnnotationKey: "test",
+		}
+		ExpectApplied(ctx, env.Client, nodePool)
+
+		ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+
+		expectedHash := nodePool.Hash()
+		Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
+		Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
+	})
+	It("should update nodepool hash versions on all nodeclaims when the hash versions don't match the controller hash version", func() {
+		nodePool.Annotations = map[string]string{
+			v1beta1.NodePoolHashAnnotationKey:        "abceduefed",
+			v1beta1.NodePoolHashVersionAnnotationKey: "test",
+		}
+		nodeClaimOne := test.NodeClaim(v1beta1.NodeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name},
+				Annotations: map[string]string{
+					v1beta1.NodePoolHashAnnotationKey:        "123456",
+					v1beta1.NodePoolHashVersionAnnotationKey: "test",
+				},
+			},
+		})
+		nodeClaimTwo := test.NodeClaim(v1beta1.NodeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name},
+				Annotations: map[string]string{
+					v1beta1.NodePoolHashAnnotationKey:        "123456",
+					v1beta1.NodePoolHashVersionAnnotationKey: "test",
+				},
+			},
+		})
+
+		ExpectApplied(ctx, env.Client, nodePool, nodeClaimOne, nodeClaimTwo)
+
+		ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		nodeClaimOne = ExpectExists(ctx, env.Client, nodeClaimOne)
+		nodeClaimTwo = ExpectExists(ctx, env.Client, nodeClaimTwo)
+
+		expectedHash := nodePool.Hash()
+		Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
+		Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
+		Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
+		Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
+	})
+	It("should not update nodepool hash on all nodeclaims when the hash versions match the controller hash version", func() {
+		nodePool.Annotations = map[string]string{
+			v1beta1.NodePoolHashAnnotationKey:        "abceduefed",
+			v1beta1.NodePoolHashVersionAnnotationKey: "test-version",
+		}
+		nodeClaim := test.NodeClaim(v1beta1.NodeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name},
+				Annotations: map[string]string{
+					v1beta1.NodePoolHashAnnotationKey:        "1234564654",
+					v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
+				},
+			},
+		})
+		ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
+
+		ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+
+		expectedHash := nodePool.Hash()
+
+		// Expect NodeClaims to have been updated to the original hash
+		Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
+		Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
+		Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, "1234564654"))
+		Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
+	})
+	It("should not update nodepool hash on the nodeclaim if it's drifted", func() {
+		nodePool.Annotations = map[string]string{
+			v1beta1.NodePoolHashAnnotationKey:        "abceduefed",
+			v1beta1.NodePoolHashVersionAnnotationKey: "test",
+		}
+		nodeClaim := test.NodeClaim(v1beta1.NodeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name},
+				Annotations: map[string]string{
+					v1beta1.NodePoolHashAnnotationKey:        "123456",
+					v1beta1.NodePoolHashVersionAnnotationKey: "test",
+				},
+			},
+		})
+		nodeClaim.StatusConditions().MarkTrue(v1beta1.Drifted)
+		ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
+
+		ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+
+		// Expect NodeClaims hash to not have been updated
+		Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, "123456"))
+		Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
+	})
 })
diff --git a/pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go b/pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go
index 57f6770f10..97db022e5d 100644
--- a/pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go
+++ b/pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go
@@ -60,8 +60,11 @@ func (i *NodeClaimTemplate) ToNodeClaim(nodePool *v1beta1.NodePool) *v1beta1.Nod
 	nc := &v1beta1.NodeClaim{
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: fmt.Sprintf("%s-", i.NodePoolName),
-			Annotations:  lo.Assign(i.Annotations, map[string]string{v1beta1.NodePoolHashAnnotationKey: nodePool.Hash()}),
-			Labels:       i.Labels,
+			Annotations: lo.Assign(i.Annotations, map[string]string{
+				v1beta1.NodePoolHashAnnotationKey:        nodePool.Hash(),
+				v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
+			}),
+			Labels: i.Labels,
 			OwnerReferences: []metav1.OwnerReference{
 				{
 					APIVersion:         v1beta1.SchemeGroupVersion.String(),