Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: Update nodeclaim finalization flow to not depend on cloudProvider.Get() #1944

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pkg/cloudprovider/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ type CloudProvider interface {
// Create launches a NodeClaim with the given resource requests and requirements and returns a hydrated
// NodeClaim back with resolved NodeClaim labels for the launched NodeClaim
Create(context.Context, *v1.NodeClaim) (*v1.NodeClaim, error)
// Delete removes a NodeClaim from the cloudprovider by its provider id
// Delete removes a NodeClaim from the cloudprovider by its provider id. Delete should return
// NodeClaimNotFoundError if the cloudProvider instance is already terminated and nil if deletion was triggered.
// Karpenter will keep retrying until Delete returns a NodeClaimNotFound error.
Delete(context.Context, *v1.NodeClaim) error
// Get retrieves a NodeClaim from the cloudprovider by its provider id
Get(context.Context, string) (*v1.NodeClaim, error)
Expand Down
6 changes: 3 additions & 3 deletions pkg/controllers/nodeclaim/lifecycle/termination_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ var _ = Describe("Termination", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // this will call cloudProvider Get to check if the instance is still around
ExpectNotFound(ctx, env.Client, nodeClaim)
})
It("should requeue reconciliation if cloudProvider Get returns an error other than NodeClaimNotFoundError", func() {
It("should requeue reconciliation if cloudProvider Delete returns an error other than NodeClaimNotFoundError", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)

Expand All @@ -161,8 +161,8 @@ var _ = Describe("Termination", func() {
Expect(env.Client.Delete(ctx, nodeClaim)).To(Succeed())
result := ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // trigger nodeClaim Deletion that will set the nodeClaim status as terminating
Expect(result.RequeueAfter).To(BeEquivalentTo(5 * time.Second))
cloudProvider.NextGetErr = errors.New("fake error")
// trigger nodeClaim Deletion that will make cloudProvider Get and fail due to error
cloudProvider.NextDeleteErr = errors.New("fake error")
// trigger nodeClaim Deletion that will make cloudProvider Delete and requeue reconciliation due to error
Expect(ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim)).To(HaveOccurred())
result = ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) // trigger nodeClaim Deletion that will succeed
Expect(result.Requeue).To(BeFalse())
Expand Down
47 changes: 11 additions & 36 deletions pkg/utils/termination/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package termination_test

import (
"context"
"errors"
"fmt"
"testing"

Expand Down Expand Up @@ -67,59 +68,32 @@ var _ = Describe("TerminationUtils", func() {
nodeClaim = test.NodeClaim()
cloudProvider.CreatedNodeClaims[nodeClaim.Status.ProviderID] = nodeClaim
})
It("should not call cloudProvider Delete if the status condition is already Terminating", func() {
nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeInstanceTerminating)
It("should return false if cloudProvider Delete does not return any error", func() {
ExpectApplied(ctx, env.Client, nodeClaim)
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.DeleteCalls)).To(BeEquivalentTo(0))
Expect(len(cloudProvider.GetCalls)).To(BeEquivalentTo(1))
Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
})
It("should call cloudProvider Delete followed by Get and return true when the cloudProvider instance is terminated", func() {
ExpectApplied(ctx, env.Client, nodeClaim)
// This will call cloudProvider.Delete()
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.DeleteCalls)).To(BeEquivalentTo(1))
Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())

//This will call cloudProvider.Get(). Instance is terminated at this point
instanceTerminated, err = termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.GetCalls)).To(BeEquivalentTo(1))

Expect(instanceTerminated).To(BeTrue())
Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
})
It("should call cloudProvider Delete followed by Get and return false when the cloudProvider instance is not terminated", func() {
It("should return false if cloudProvider Delete does not return a not found error", func() {
ExpectApplied(ctx, env.Client, nodeClaim)
cloudProvider.NextDeleteErr = errors.New("fake error")
// This will call cloudProvider.Delete()
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.DeleteCalls)).To(BeEquivalentTo(1))
Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())

// The delete call that happened first will remove the cloudProvider instance from cloudProvider.CreatedNodeClaims[].
// To model the behavior of having cloudProvider instance not terminated, we add it back here.
cloudProvider.CreatedNodeClaims[nodeClaim.Status.ProviderID] = nodeClaim
//This will call cloudProvider.Get(). Instance is not terminated at this point
instanceTerminated, err = termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.GetCalls)).To(BeEquivalentTo(1))

Expect(instanceTerminated).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(err).To(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeFalse())
})
It("should call cloudProvider Delete and return true if cloudProvider instance is not found", func() {
It("should call cloudProvider Delete and return true if cloudProvider Delete returns not found error", func() {
ExpectApplied(ctx, env.Client, nodeClaim)

cloudProvider.NextDeleteErr = cloudprovider.NewNodeClaimNotFoundError(fmt.Errorf("no nodeclaim exists"))
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(len(cloudProvider.GetCalls)).To(BeEquivalentTo(0))

Expect(instanceTerminated).To(BeTrue())
Expect(err).NotTo(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())
})
It("shouldn't mark the root condition of the NodeClaim as unknown when setting the Termination condition", func() {
for _, cond := range []string{
Expand All @@ -133,8 +107,9 @@ var _ = Describe("TerminationUtils", func() {
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().Root().IsTrue())
ExpectApplied(ctx, env.Client, nodeClaim)
cloudProvider.NextDeleteErr = cloudprovider.NewNodeClaimNotFoundError(fmt.Errorf("no nodeclaim exists"))
instanceTerminated, err := termination.EnsureTerminated(ctx, env.Client, nodeClaim, cloudProvider)
Expect(instanceTerminated).To(BeFalse())
Expect(instanceTerminated).To(BeTrue())
Expect(err).NotTo(HaveOccurred())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())
Expect(nodeClaim.StatusConditions().Root().IsTrue())
Expand Down
68 changes: 20 additions & 48 deletions pkg/utils/termination/termination.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,62 +20,34 @@ import (
"context"
"fmt"

"k8s.io/apimachinery/pkg/api/equality"

"sigs.k8s.io/controller-runtime/pkg/client"

v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"sigs.k8s.io/karpenter/pkg/cloudprovider"
)

// EnsureTerminated is a helper function that takes a v1.NodeClaim and calls cloudProvider.Delete() if status condition
// on nodeClaim is not terminating. If it is terminating then it will call cloudProvider.Get() to check if the instance
// is terminated or not. It will return an error and a boolean that indicates if the instance is terminated or not. We simply return
// conflict or a NotFound error if we encounter it while updating the status on nodeClaim.
// EnsureTerminated is a helper function that takes a v1.NodeClaim and calls cloudProvider.Delete(). It will return an
// error and a boolean that indicates if the instance is terminated or not. We simply return conflict or a NotFound
// error if we encounter it while updating the status on nodeClaim.
func EnsureTerminated(ctx context.Context, c client.Client, nodeClaim *v1.NodeClaim, cloudProvider cloudprovider.CloudProvider) (terminated bool, err error) {
// Check if the status condition on nodeClaim is Terminating
if !nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue() {
// If not then call Delete on cloudProvider to trigger termination and always requeue reconciliation
if err = cloudProvider.Delete(ctx, nodeClaim); err != nil {
if cloudprovider.IsNodeClaimNotFoundError(err) {
stored := nodeClaim.DeepCopy()
updateStatusConditionsForDeleting(nodeClaim)
// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch
// can cause races due to the fact that it fully replaces the list on a change
// Here, we are updating the status condition list
if err = c.Status().Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil {
return false, err
}
// Instance is terminated
return true, nil
stored := nodeClaim.DeepCopy()
err = cloudProvider.Delete(ctx, nodeClaim)
// Set InstanceTerminating to true when cloudProvider.Delete() returns -
// 1. nodeClaim not found error which indicates the instance was terminated
// 2. no error which indicates that Delete() has been processed but the instance has not terminated so requeue
jigisha620 marked this conversation as resolved.
Show resolved Hide resolved
if err == nil || cloudprovider.IsNodeClaimNotFoundError(err) {
nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeInstanceTerminating)
if !equality.Semantic.DeepEqual(stored, nodeClaim) {
// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch
// can cause races due to the fact that it fully replaces the list on a change
// Here, we are updating the status condition list
if e := c.Status().Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); e != nil {
return false, e
}
return false, fmt.Errorf("terminating cloudprovider instance, %w", err)
}

stored := nodeClaim.DeepCopy()
updateStatusConditionsForDeleting(nodeClaim)
// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch
// can cause races due to the fact that it fully replaces the list on a change
// Here, we are updating the status condition list
if err = c.Status().Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil {
return false, err
}
return false, nil
}
// Call Get on cloudProvider to check if the instance is terminated
if _, err := cloudProvider.Get(ctx, nodeClaim.Status.ProviderID); err != nil {
if cloudprovider.IsNodeClaimNotFoundError(err) {
return true, nil
}
return false, fmt.Errorf("getting cloudprovider instance, %w", err)
}
return false, nil
}

func updateStatusConditionsForDeleting(nc *v1.NodeClaim) {
// perform a no-op for whatever the status condition is currently set to
// so that we bump the observed generation to the latest and prevent the nodeclaim
// root status from entering an `Unknown` state
for _, condition := range nc.Status.Conditions {
nc.StatusConditions().Set(condition)
return cloudprovider.IsNodeClaimNotFoundError(err), nil
}
nc.StatusConditions().SetTrue(v1.ConditionTypeInstanceTerminating)
return false, fmt.Errorf("terminating cloudprovider instance, %w", err)
}