Skip to content

Commit

Permalink
chore: Update nodeclaim finalization flow to not depend on cloudProvi…
Browse files Browse the repository at this point in the history
…der.Get()
  • Loading branch information
jigisha620 committed Feb 1, 2025
1 parent bb27849 commit c2f4e29
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 78 deletions.
36 changes: 10 additions & 26 deletions pkg/controllers/nodeclaim/lifecycle/termination_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,11 @@ var _ = Describe("Termination", func() {
result := ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // now all the nodes are gone so nodeClaim deletion continues
Expect(result.RequeueAfter).To(BeEquivalentTo(5 * time.Second))
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())

ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // this will call cloudProvider Get to check if the instance is still around
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeFalse())

ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // this will call cloudProvider Delete again to check if the instance is deleted
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())
ExpectMetricHistogramSampleCountValue("karpenter_nodeclaims_instance_termination_duration_seconds", 1, map[string]string{"nodepool": nodePool.Name})
ExpectMetricHistogramSampleCountValue("karpenter_nodeclaims_termination_duration_seconds", 1, map[string]string{"nodepool": nodePool.Name})
ExpectNotFound(ctx, env.Client, nodeClaim, node)
Expand Down Expand Up @@ -146,45 +147,28 @@ var _ = Describe("Termination", func() {

Expect(result.RequeueAfter).To(BeEquivalentTo(5 * time.Second))
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeFalse())

ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // this will call cloudProvider Get to check if the instance is still around
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // this will call cloudProvider Delete to check if the instance is still around
ExpectNotFound(ctx, env.Client, nodeClaim)
})
It("should requeue reconciliation if cloudProvider Get returns an error other than NodeClaimNotFoundError", func() {
It("should requeue reconciliation if cloudProvider Delete returns an error other than NodeClaimNotFoundError", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
_, err := cloudProvider.Get(ctx, nodeClaim.Status.ProviderID)
Expect(err).ToNot(HaveOccurred())
Expect(env.Client.Delete(ctx, nodeClaim)).To(Succeed())
result := ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // trigger nodeClaim Deletion that will set the nodeClaim status as terminating
result := ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // trigger nodeClaim Deletion
Expect(result.RequeueAfter).To(BeEquivalentTo(5 * time.Second))
cloudProvider.NextGetErr = errors.New("fake error")
// trigger nodeClaim Deletion that will make cloudProvider Get and fail due to error
cloudProvider.NextDeleteErr = errors.New("fake error")
// trigger nodeClaim Deletion that will make cloudProvider Delete and fail due to error
Expect(ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim)).To(HaveOccurred())
result = ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // trigger nodeClaim Deletion that will succeed
Expect(result.Requeue).To(BeFalse())
ExpectNotFound(ctx, env.Client, nodeClaim)
})
It("should not remove the finalizer and terminate the NodeClaim if the cloudProvider instance is still around", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
_, err := cloudProvider.Get(ctx, nodeClaim.Status.ProviderID)
Expect(err).ToNot(HaveOccurred())
Expect(env.Client.Delete(ctx, nodeClaim)).To(Succeed())
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
// The delete call that happened first will remove the cloudProvider instance from cloudProvider.CreatedNodeClaims[].
// To model the behavior of having cloudProvider instance not terminated, we add it back here.
cloudProvider.CreatedNodeClaims[nodeClaim.Status.ProviderID] = nodeClaim
result := ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // this will ensure that we call cloudProvider Get to check if the instance is still around
Expect(result.RequeueAfter).To(BeEquivalentTo(5 * time.Second))
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())
})
It("should delete multiple Nodes if multiple Nodes map to the NodeClaim", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
Expand Down
44 changes: 10 additions & 34 deletions pkg/utils/termination/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package termination_test

import (
"context"
"errors"
"fmt"
"testing"

Expand Down Expand Up @@ -67,49 +68,22 @@ var _ = Describe("TerminationUtils", func() {
nodeClaim = test.NodeClaim()
cloudProvider.CreatedNodeClaims[nodeClaim.Status.ProviderID] = nodeClaim
})
It("should not call cloudProvider Delete if the status condition is already Terminating", func() {
nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeInstanceTerminating)
It("should call cloudProvider Delete if the status condition is not Terminating", func() {
ExpectApplied(ctx, env.Client, nodeClaim)
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.DeleteCalls)).To(BeEquivalentTo(0))
Expect(len(cloudProvider.GetCalls)).To(BeEquivalentTo(1))
Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
})
It("should call cloudProvider Delete followed by Get and return true when the cloudProvider instance is terminated", func() {
ExpectApplied(ctx, env.Client, nodeClaim)
// This will call cloudProvider.Delete()
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.DeleteCalls)).To(BeEquivalentTo(1))
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeFalse())
Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())

//This will call cloudProvider.Get(). Instance is terminated at this point
instanceTerminated, err = termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.GetCalls)).To(BeEquivalentTo(1))

Expect(instanceTerminated).To(BeTrue())
Expect(err).NotTo(HaveOccurred())
})
It("should call cloudProvider Delete followed by Get and return false when the cloudProvider instance is not terminated", func() {
It("should not update instance terminating status condition if cloudProvider does not return a not found error", func() {
ExpectApplied(ctx, env.Client, nodeClaim)
cloudProvider.NextDeleteErr = errors.New("fake error")
// This will call cloudProvider.Delete()
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.DeleteCalls)).To(BeEquivalentTo(1))
Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())

// The delete call that happened first will remove the cloudProvider instance from cloudProvider.CreatedNodeClaims[].
// To model the behavior of having cloudProvider instance not terminated, we add it back here.
cloudProvider.CreatedNodeClaims[nodeClaim.Status.ProviderID] = nodeClaim
//This will call cloudProvider.Get(). Instance is not terminated at this point
instanceTerminated, err = termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.GetCalls)).To(BeEquivalentTo(1))

Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(err).To(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeFalse())
})
It("should call cloudProvider Delete and return true if cloudProvider instance is not found", func() {
ExpectApplied(ctx, env.Client, nodeClaim)
Expand All @@ -120,6 +94,7 @@ var _ = Describe("TerminationUtils", func() {

Expect(instanceTerminated).To(BeTrue())
Expect(err).NotTo(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())
})
It("shouldn't mark the root condition of the NodeClaim as unknown when setting the Termination condition", func() {
for _, cond := range []string{
Expand All @@ -133,8 +108,9 @@ var _ = Describe("TerminationUtils", func() {
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().Root().IsTrue())
ExpectApplied(ctx, env.Client, nodeClaim)
cloudProvider.NextDeleteErr = cloudprovider.NewNodeClaimNotFoundError(fmt.Errorf("no nodeclaim exists"))
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(instanceTerminated).To(BeFalse())
Expect(instanceTerminated).To(BeTrue())
Expect(err).NotTo(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())
Expect(nodeClaim.StatusConditions().Root().IsTrue())
Expand Down
19 changes: 1 addition & 18 deletions pkg/utils/termination/termination.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ import (
)

// EnsureTerminated is a helper function that takes a v1.NodeClaim and calls cloudProvider.Delete() if status condition
// on nodeClaim is not terminating. If it is terminating then it will call cloudProvider.Get() to check if the instance
// is terminated or not. It will return an error and a boolean that indicates if the instance is terminated or not. We simply return
// on nodeClaim is not terminating. It will return an error and a boolean that indicates if the instance is terminated or not. We simply return
// conflict or a NotFound error if we encounter it while updating the status on nodeClaim.
func EnsureTerminated(ctx context.Context, c client.Client, nodeClaim *v1.NodeClaim, cloudProvider cloudprovider.CloudProvider) (terminated bool, err error) {
// Check if the status condition on nodeClaim is Terminating
Expand All @@ -49,24 +48,8 @@ func EnsureTerminated(ctx context.Context, c client.Client, nodeClaim *v1.NodeCl
}
return false, fmt.Errorf("terminating cloudprovider instance, %w", err)
}

stored := nodeClaim.DeepCopy()
updateStatusConditionsForDeleting(nodeClaim)
// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch
// can cause races due to the fact that it fully replaces the list on a change
// Here, we are updating the status condition list
if err = c.Status().Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil {
return false, err
}
return false, nil
}
// Call Get on cloudProvider to check if the instance is terminated
if _, err := cloudProvider.Get(ctx, nodeClaim.Status.ProviderID); err != nil {
if cloudprovider.IsNodeClaimNotFoundError(err) {
return true, nil
}
return false, fmt.Errorf("getting cloudprovider instance, %w", err)
}
return false, nil
}

Expand Down

0 comments on commit c2f4e29

Please sign in to comment.