diff --git a/Makefile b/Makefile index b093e9b916..559a978cfd 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ HELM_OPTS ?= --set logLevel=debug \ --set controller.resources.requests.cpu=1 \ --set controller.resources.requests.memory=1Gi \ --set controller.resources.limits.cpu=1 \ - --set controller.resources.limits.memory=1Gi + --set controller.resources.limits.memory=1Gi help: ## Display help @awk 'BEGIN {FS = ":.*##"; printf "Usage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) @@ -23,7 +23,7 @@ build: ## Build the Karpenter KWOK controller images using ko build $(eval IMG_REPOSITORY=$(shell echo $(CONTROLLER_IMG) | cut -d "@" -f 1 | cut -d ":" -f 1)) $(eval IMG_TAG=$(shell echo $(CONTROLLER_IMG) | cut -d "@" -f 1 | cut -d ":" -f 2 -s)) $(eval IMG_DIGEST=$(shell echo $(CONTROLLER_IMG) | cut -d "@" -f 2)) - + # Run make install-kwok to install the kwok controller in your cluster first # Webhooks are currently not supported in the kwok provider. @@ -40,11 +40,11 @@ apply: verify build ## Deploy the kwok controller from the current state of your delete: ## Delete the controller from your ~/.kube/config cluster helm uninstall karpenter --namespace ${KARPENTER_NAMESPACE} - + test: ## Run tests go test ./... \ -race \ - -timeout 10m \ + -timeout 1m \ --ginkgo.focus="${FOCUS}" \ --ginkgo.timeout=10m \ --ginkgo.v \ diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml index d0a19b000b..085e98885c 100644 --- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.14.0 + controller-gen.kubebuilder.io/version: v0.13.0 name: nodeclaims.karpenter.sh spec: group: karpenter.sh @@ -50,19 +50,10 @@ spec: description: NodeClaim is the Schema for the NodeClaims API properties: apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' type: string kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' type: string metadata: type: object @@ -70,15 +61,10 @@ spec: description: NodeClaimSpec describes the desired state of the NodeClaim properties: kubelet: - description: |- - Kubelet defines args to be used when configuring kubelet on provisioned nodes. - They are a subset of the upstream types, recognizing not all options may be supported. - Wherever possible, the types and names should reflect the upstream kubelet types. + description: Kubelet defines args to be used when configuring kubelet on provisioned nodes. They are a subset of the upstream types, recognizing not all options may be supported. Wherever possible, the types and names should reflect the upstream kubelet types. properties: clusterDNS: - description: |- - clusterDNS is a list of IP addresses for the cluster DNS server. - Note that not all providers may use all addresses. + description: clusterDNS is a list of IP addresses for the cluster DNS server. Note that not all providers may use all addresses. items: type: string type: array @@ -95,9 +81,7 @@ spec: - message: valid keys for evictionHard are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) evictionMaxPodGracePeriod: - description: |- - EvictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when terminating pods in - response to soft eviction thresholds being met. + description: EvictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when terminating pods in response to soft eviction thresholds being met. format: int32 type: integer evictionSoft: @@ -118,22 +102,13 @@ spec: - message: valid keys for evictionSoftGracePeriod are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) imageGCHighThresholdPercent: - description: |- - ImageGCHighThresholdPercent is the percent of disk usage after which image - garbage collection is always run. The percent is calculated by dividing this - field value by 100, so this field must be between 0 and 100, inclusive. - When specified, the value must be greater than ImageGCLowThresholdPercent. + description: ImageGCHighThresholdPercent is the percent of disk usage after which image garbage collection is always run. The percent is calculated by dividing this field value by 100, so this field must be between 0 and 100, inclusive. When specified, the value must be greater than ImageGCLowThresholdPercent. format: int32 maximum: 100 minimum: 0 type: integer imageGCLowThresholdPercent: - description: |- - ImageGCLowThresholdPercent is the percent of disk usage before which image - garbage collection is never run. Lowest disk usage to garbage collect to. - The percent is calculated by dividing this field value by 100, - so the field value must be between 0 and 100, inclusive. - When specified, the value must be less than imageGCHighThresholdPercent + description: ImageGCLowThresholdPercent is the percent of disk usage before which image garbage collection is never run. Lowest disk usage to garbage collect to. 
The percent is calculated by dividing this field value by 100, so the field value must be between 0 and 100, inclusive. When specified, the value must be less than imageGCHighThresholdPercent format: int32 maximum: 100 minimum: 0 @@ -153,17 +128,12 @@ spec: - message: kubeReserved value cannot be a negative resource quantity rule: self.all(x, !self[x].startsWith('-')) maxPods: - description: |- - MaxPods is an override for the maximum number of pods that can run on - a worker node instance. + description: MaxPods is an override for the maximum number of pods that can run on a worker node instance. format: int32 minimum: 0 type: integer podsPerCore: - description: |- - PodsPerCore is an override for the number of pods that can run on a worker node - instance based on the number of cpu cores. This value cannot exceed MaxPods, so, if - MaxPods is a lower value, that value will be used. + description: PodsPerCore is an override for the number of pods that can run on a worker node instance based on the number of cpu cores. This value cannot exceed MaxPods, so, if MaxPods is a lower value, that value will be used. format: int32 minimum: 0 type: integer @@ -207,9 +177,7 @@ spec: requirements: description: Requirements are layered with GetLabels and applied to every node. items: - description: |- - A node selector requirement is a selector that contains values, a key, and an operator - that relates the key and values. + description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: The label key that the selector applies to. @@ -226,9 +194,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self != "kubernetes.io/hostname" operator: - description: |- - Represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string enum: - In @@ -238,12 +204,7 @@ spec: - Gt - Lt values: - description: |- - An array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. If the operator is Gt or Lt, the values - array must have a single element, which will be interpreted as an integer. - This array is replaced during a strategic merge patch. + description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. items: type: string type: array @@ -274,21 +235,12 @@ spec: type: object type: object startupTaints: - description: |- - StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically - within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by - daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning - purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. 
+ description: StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. items: - description: |- - The node this Taint is attached to has the "effect" on - any pod that does not tolerate the Taint. + description: The node this Taint is attached to has the "effect" on any pod that does not tolerate the Taint. properties: effect: - description: |- - Required. The effect of the taint on pods - that do not tolerate the taint. - Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string enum: - NoSchedule @@ -300,9 +252,7 @@ spec: minLength: 1 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ timeAdded: - description: |- - TimeAdded represents the time at which the taint was added. - It is only written for NoExecute taints. + description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. format: date-time type: string value: @@ -317,15 +267,10 @@ spec: taints: description: Taints will be applied to the NodeClaim's node. items: - description: |- - The node this Taint is attached to has the "effect" on - any pod that does not tolerate the Taint. + description: The node this Taint is attached to has the "effect" on any pod that does not tolerate the Taint. properties: effect: - description: |- - Required. The effect of the taint on pods - that do not tolerate the taint. - Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string enum: - NoSchedule @@ -337,9 +282,7 @@ spec: minLength: 1 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ timeAdded: - description: |- - TimeAdded represents the time at which the taint was added. - It is only written for NoExecute taints. + description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. format: date-time type: string value: @@ -351,6 +294,10 @@ spec: - key type: object type: array + terminationGracePeriod: + description: "TerminationGracePeriod is the duration the controller will wait before forcefully terminating a node, measured from when deletion is first initiated. Once the GracePeriod has expired, all pods on the node will be shutdown using the official non-graceful shutdown taint. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted up at T = node timeout - pod terminationGracePeriodSeconds. \n Warning: this bypasses any PDB or terminationGracePeriodSeconds value set for a Pod. Requires: K8s 1.26 or higher: https://kubernetes.io/docs/concepts/architecture/nodes/#non-graceful-node-shutdown \n This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. 
It can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained." + pattern: ^(([0-9]+(s|m|h))+)$ + type: string required: - nodeClassRef - requirements @@ -379,15 +326,10 @@ spec: conditions: description: Conditions contains signals for health and readiness items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties + description: 'Condition defines a readiness condition for a Knative resource. See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties' properties: lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). + description: LastTransitionTime is the last time the condition transitioned from one status to another. We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic differences (all other things held constant). type: string message: description: A human readable message indicating details about the transition. @@ -396,9 +338,7 @@ spec: description: The reason for the condition's last transition. type: string severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. + description: Severity with which to treat failures of this type of condition. When this is not specified, it defaults to Error. type: string status: description: Status of the condition, one of True, False, Unknown. diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index da032545dd..c8469594ce 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.14.0 + controller-gen.kubebuilder.io/version: v0.13.0 name: nodepools.karpenter.sh spec: group: karpenter.sh @@ -30,28 +30,15 @@ spec: description: NodePool is the Schema for the NodePools API properties: apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' type: string kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' type: string metadata: type: object spec: - description: |- - NodePoolSpec is the top level nodepool specification. Nodepools - launch nodes in response to pods that are unschedulable. A single nodepool - is capable of managing a diverse set of nodes. Node properties are determined - from a combination of nodepool and pod scheduling constraints. + description: NodePoolSpec is the top level nodepool specification. Nodepools launch nodes in response to pods that are unschedulable. A single nodepool is capable of managing a diverse set of nodes. Node properties are determined from a combination of nodepool and pod scheduling constraints. properties: disruption: default: @@ -62,44 +49,21 @@ spec: budgets: default: - nodes: 10% - description: |- - Budgets is a list of Budgets. - If there are multiple active budgets, Karpenter uses - the most restrictive value. If left undefined, - this will default to one budget with a value to 10%. + description: Budgets is a list of Budgets. If there are multiple active budgets, Karpenter uses the most restrictive value. If left undefined, this will default to one budget with a value to 10%. items: - description: |- - Budget defines when Karpenter will restrict the - number of Node Claims that can be terminating simultaneously. + description: Budget defines when Karpenter will restrict the number of Node Claims that can be terminating simultaneously. properties: duration: - description: |- - Duration determines how long a Budget is active since each Schedule hit. - Only minutes and hours are accepted, as cron does not work in seconds. - If omitted, the budget is always active. - This is required if Schedule is set. - This regex has an optional 0s at the end since the duration.String() always adds - a 0s at the end. + description: Duration determines how long a Budget is active since each Schedule hit. Only minutes and hours are accepted, as cron does not work in seconds. If omitted, the budget is always active. This is required if Schedule is set. This regex has an optional 0s at the end since the duration.String() always adds a 0s at the end. pattern: ^([0-9]+(m|h)+(0s)?)$ type: string nodes: default: 10% - description: |- - Nodes dictates the maximum number of NodeClaims owned by this NodePool - that can be terminating at once. This is calculated by counting nodes that - have a deletion timestamp set, or are actively being deleted by Karpenter. - This field is required when specifying a budget. - This cannot be of type intstr.IntOrString since kubebuilder doesn't support pattern - checking for int nodes for IntOrString nodes. - Ref: https://github.com/kubernetes-sigs/controller-tools/blob/55efe4be40394a288216dab63156b0a64fb82929/pkg/crd/markers/validation.go#L379-L388 + description: 'Nodes dictates the maximum number of NodeClaims owned by this NodePool that can be terminating at once. This is calculated by counting nodes that have a deletion timestamp set, or are actively being deleted by Karpenter. This field is required when specifying a budget. 
This cannot be of type intstr.IntOrString since kubebuilder doesn''t support pattern checking for int nodes for IntOrString nodes. Ref: https://github.com/kubernetes-sigs/controller-tools/blob/55efe4be40394a288216dab63156b0a64fb82929/pkg/crd/markers/validation.go#L379-L388' pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ type: string schedule: - description: |- - Schedule specifies when a budget begins being active, following - the upstream cronjob syntax. If omitted, the budget is always active. - Timezones are not supported. - This field is required if Duration is set. + description: Schedule specifies when a budget begins being active, following the upstream cronjob syntax. If omitted, the budget is always active. Timezones are not supported. This field is required if Duration is set. pattern: ^(@(annually|yearly|monthly|weekly|daily|midnight|hourly))|((.+)\s(.+)\s(.+)\s(.+)\s(.+))$ type: string required: @@ -111,28 +75,19 @@ spec: - message: '''schedule'' must be set with ''duration''' rule: '!self.all(x, (has(x.schedule) && !has(x.duration)) || (!has(x.schedule) && has(x.duration)))' consolidateAfter: - description: |- - ConsolidateAfter is the duration the controller will wait - before attempting to terminate nodes that are underutilized. - Refer to ConsolidationPolicy for how underutilization is considered. + description: ConsolidateAfter is the duration the controller will wait before attempting to terminate nodes that are underutilized. Refer to ConsolidationPolicy for how underutilization is considered. pattern: ^(([0-9]+(s|m|h))+)|(Never)$ type: string consolidationPolicy: default: WhenUnderutilized - description: |- - ConsolidationPolicy describes which nodes Karpenter can disrupt through its consolidation - algorithm. This policy defaults to "WhenUnderutilized" if not specified + description: ConsolidationPolicy describes which nodes Karpenter can disrupt through its consolidation algorithm. This policy defaults to "WhenUnderutilized" if not specified enum: - WhenEmpty - WhenUnderutilized type: string expireAfter: default: 720h - description: |- - ExpireAfter is the duration the controller will wait - before terminating a node, measured from when the node is created. This - is useful to implement features like eventually consistent node upgrade, - memory leak protection, and disruption testing. + description: ExpireAfter is the duration the controller will wait before terminating a node, measured from when the node is created. This is useful to implement features like eventually consistent node upgrade, memory leak protection, and disruption testing. pattern: ^(([0-9]+(s|m|h))+)|(Never)$ type: string type: object @@ -151,31 +106,21 @@ spec: description: Limits define a set of bounds for provisioning capacity. type: object template: - description: |- - Template contains the template of possibilities for the provisioning logic to launch a NodeClaim with. - NodeClaims launched from this NodePool will often be further constrained than the template specifies. + description: Template contains the template of possibilities for the provisioning logic to launch a NodeClaim with. NodeClaims launched from this NodePool will often be further constrained than the template specifies. properties: metadata: properties: annotations: additionalProperties: type: string - description: |- - Annotations is an unstructured key value map stored with a resource that may be - set by external tools to store and retrieve arbitrary metadata. They are not - queryable and should be preserved when modifying objects. 
- More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations + description: 'Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations' type: object labels: additionalProperties: type: string maxLength: 63 pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ - description: |- - Map of string keys and values that can be used to organize and categorize - (scope and select) objects. May match selectors of replication controllers - and services. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels + description: 'Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels' type: object maxProperties: 100 x-kubernetes-validations: @@ -194,15 +139,10 @@ spec: description: NodeClaimSpec describes the desired state of the NodeClaim properties: kubelet: - description: |- - Kubelet defines args to be used when configuring kubelet on provisioned nodes. - They are a subset of the upstream types, recognizing not all options may be supported. - Wherever possible, the types and names should reflect the upstream kubelet types. + description: Kubelet defines args to be used when configuring kubelet on provisioned nodes. They are a subset of the upstream types, recognizing not all options may be supported. Wherever possible, the types and names should reflect the upstream kubelet types. properties: clusterDNS: - description: |- - clusterDNS is a list of IP addresses for the cluster DNS server. - Note that not all providers may use all addresses. + description: clusterDNS is a list of IP addresses for the cluster DNS server. Note that not all providers may use all addresses. items: type: string type: array @@ -219,9 +159,7 @@ spec: - message: valid keys for evictionHard are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) evictionMaxPodGracePeriod: - description: |- - EvictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when terminating pods in - response to soft eviction thresholds being met. + description: EvictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when terminating pods in response to soft eviction thresholds being met. format: int32 type: integer evictionSoft: @@ -242,22 +180,13 @@ spec: - message: valid keys for evictionSoftGracePeriod are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) imageGCHighThresholdPercent: - description: |- - ImageGCHighThresholdPercent is the percent of disk usage after which image - garbage collection is always run. The percent is calculated by dividing this - field value by 100, so this field must be between 0 and 100, inclusive. - When specified, the value must be greater than ImageGCLowThresholdPercent. 
+ description: ImageGCHighThresholdPercent is the percent of disk usage after which image garbage collection is always run. The percent is calculated by dividing this field value by 100, so this field must be between 0 and 100, inclusive. When specified, the value must be greater than ImageGCLowThresholdPercent. format: int32 maximum: 100 minimum: 0 type: integer imageGCLowThresholdPercent: - description: |- - ImageGCLowThresholdPercent is the percent of disk usage before which image - garbage collection is never run. Lowest disk usage to garbage collect to. - The percent is calculated by dividing this field value by 100, - so the field value must be between 0 and 100, inclusive. - When specified, the value must be less than imageGCHighThresholdPercent + description: ImageGCLowThresholdPercent is the percent of disk usage before which image garbage collection is never run. Lowest disk usage to garbage collect to. The percent is calculated by dividing this field value by 100, so the field value must be between 0 and 100, inclusive. When specified, the value must be less than imageGCHighThresholdPercent format: int32 maximum: 100 minimum: 0 @@ -277,17 +206,12 @@ spec: - message: kubeReserved value cannot be a negative resource quantity rule: self.all(x, !self[x].startsWith('-')) maxPods: - description: |- - MaxPods is an override for the maximum number of pods that can run on - a worker node instance. + description: MaxPods is an override for the maximum number of pods that can run on a worker node instance. format: int32 minimum: 0 type: integer podsPerCore: - description: |- - PodsPerCore is an override for the number of pods that can run on a worker node - instance based on the number of cpu cores. This value cannot exceed MaxPods, so, if - MaxPods is a lower value, that value will be used. + description: PodsPerCore is an override for the number of pods that can run on a worker node instance based on the number of cpu cores. This value cannot exceed MaxPods, so, if MaxPods is a lower value, that value will be used. format: int32 minimum: 0 type: integer @@ -331,9 +255,7 @@ spec: requirements: description: Requirements are layered with GetLabels and applied to every node. items: - description: |- - A node selector requirement is a selector that contains values, a key, and an operator - that relates the key and values. + description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: The label key that the selector applies to. @@ -352,9 +274,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self != "kubernetes.io/hostname" operator: - description: |- - Represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string enum: - In @@ -364,12 +284,7 @@ spec: - Gt - Lt values: - description: |- - An array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. If the operator is Gt or Lt, the values - array must have a single element, which will be interpreted as an integer. - This array is replaced during a strategic merge patch. + description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. 
If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. items: type: string type: array @@ -401,21 +316,12 @@ spec: type: object maxProperties: 0 startupTaints: - description: |- - StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically - within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by - daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning - purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. + description: StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. items: - description: |- - The node this Taint is attached to has the "effect" on - any pod that does not tolerate the Taint. + description: The node this Taint is attached to has the "effect" on any pod that does not tolerate the Taint. properties: effect: - description: |- - Required. The effect of the taint on pods - that do not tolerate the taint. - Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string enum: - NoSchedule @@ -427,9 +333,7 @@ spec: minLength: 1 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ timeAdded: - description: |- - TimeAdded represents the time at which the taint was added. - It is only written for NoExecute taints. + description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. format: date-time type: string value: @@ -444,15 +348,10 @@ spec: taints: description: Taints will be applied to the NodeClaim's node. items: - description: |- - The node this Taint is attached to has the "effect" on - any pod that does not tolerate the Taint. + description: The node this Taint is attached to has the "effect" on any pod that does not tolerate the Taint. properties: effect: - description: |- - Required. The effect of the taint on pods - that do not tolerate the taint. - Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string enum: - NoSchedule @@ -464,9 +363,7 @@ spec: minLength: 1 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ timeAdded: - description: |- - TimeAdded represents the time at which the taint was added. - It is only written for NoExecute taints. + description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. 
format: date-time type: string value: @@ -478,6 +375,10 @@ spec: - key type: object type: array + terminationGracePeriod: + description: "TerminationGracePeriod is the duration the controller will wait before forcefully terminating a node, measured from when deletion is first initiated. Once the GracePeriod has expired, all pods on the node will be shutdown using the official non-graceful shutdown taint. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted up at T = node timeout - pod terminationGracePeriodSeconds. \n Warning: this bypasses any PDB or terminationGracePeriodSeconds value set for a Pod. Requires: K8s 1.26 or higher: https://kubernetes.io/docs/concepts/architecture/nodes/#non-graceful-node-shutdown \n This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. It can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained." + pattern: ^(([0-9]+(s|m|h))+)$ + type: string required: - nodeClassRef - requirements @@ -486,11 +387,7 @@ spec: - spec type: object weight: - description: |- - Weight is the priority given to the nodepool during scheduling. A higher - numerical weight indicates that this nodepool will be ordered - ahead of other nodepools with lower weights. A nodepool with no weight - will be treated as if it is a nodepool with a weight of 0. + description: Weight is the priority given to the nodepool during scheduling. A higher numerical weight indicates that this nodepool will be ordered ahead of other nodepools with lower weights. A nodepool with no weight will be treated as if it is a nodepool with a weight of 0. format: int32 maximum: 100 minimum: 1 diff --git a/pkg/apis/v1beta1/nodeclaim.go b/pkg/apis/v1beta1/nodeclaim.go index 41ac8cb59d..3d97fb6a55 100644 --- a/pkg/apis/v1beta1/nodeclaim.go +++ b/pkg/apis/v1beta1/nodeclaim.go @@ -52,6 +52,23 @@ type NodeClaimSpec struct { // NodeClassRef is a reference to an object that defines provider specific configuration // +required NodeClassRef *NodeClassReference `json:"nodeClassRef"` + // TerminationGracePeriod is the duration the controller will wait before forcefully terminating a node, measured from when deletion is first initiated. + // Once the GracePeriod has expired, all pods on the node will be shutdown using the official non-graceful shutdown taint. + // If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, + // that pod will be deleted up at T = node timeout - pod terminationGracePeriodSeconds. + // + // Warning: this bypasses any PDB or terminationGracePeriodSeconds value set for a Pod. + // Requires: K8s 1.26 or higher: https://kubernetes.io/docs/concepts/architecture/nodes/#non-graceful-node-shutdown + // + // This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. + // It can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. + // If left undefined, the controller will wait indefinitely for pods to be drained. 
+ // + // +kubebuilder:validation:Pattern=`^(([0-9]+(s|m|h))+)$` + // +kubebuilder:validation:Type="string" + // +kubebuilder:validation:Schemaless + // +optional + TerminationGracePeriod *metav1.Duration `json:"terminationGracePeriod"` } // ResourceRequirements models the required resources for the NodeClaim to launch diff --git a/pkg/apis/v1beta1/taints.go b/pkg/apis/v1beta1/taints.go index 6a9b932dfb..ec1593c43d 100644 --- a/pkg/apis/v1beta1/taints.go +++ b/pkg/apis/v1beta1/taints.go @@ -22,6 +22,9 @@ import v1 "k8s.io/api/core/v1" const ( DisruptionTaintKey = Group + "/disruption" DisruptingNoScheduleTaintValue = "disrupting" + + DisruptionNonGracefulShutdownKey = "node.kubernetes.io/out-of-service" + DisruptionNonGracefulShutdownValue = "nodeshutdown" ) var ( @@ -32,6 +35,15 @@ var ( Effect: v1.TaintEffectNoSchedule, Value: DisruptingNoScheduleTaintValue, } + + // DisruptionNonGracefulShutdown is used by the deprovisioning controller to forcefully + // shut down a node. This does not respect graceful termination of any pods on the node. + // https://kubernetes.io/docs/concepts/architecture/nodes/#non-graceful-node-shutdown + DisruptionNonGracefulShutdown = v1.Taint{ + Key: DisruptionNonGracefulShutdownKey, + Value: DisruptionNonGracefulShutdownValue, + Effect: v1.TaintEffectNoExecute, + } ) func IsDisruptingTaint(taint v1.Taint) bool { diff --git a/pkg/apis/v1beta1/zz_generated.deepcopy.go b/pkg/apis/v1beta1/zz_generated.deepcopy.go index cabcf7f4d7..69973ddb25 100644 --- a/pkg/apis/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/v1beta1/zz_generated.deepcopy.go @@ -301,6 +301,11 @@ func (in *NodeClaimSpec) DeepCopyInto(out *NodeClaimSpec) { *out = new(NodeClassReference) **out = **in } + if in.TerminationGracePeriod != nil { + in, out := &in.TerminationGracePeriod, &out.TerminationGracePeriod + *out = new(metav1.Duration) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClaimSpec. 
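Note on the validation for the new field: the terminationGracePeriod pattern above, ^(([0-9]+(s|m|h))+)$, accepts duration strings built from s/m/h components and, unlike expireAfter and consolidateAfter in the NodePool CRD, does not accept "Never". The standalone snippet below is illustrative only and is not part of this change; it simply exercises the same regular expression against a few sample values:

    package main

    import (
    	"fmt"
    	"regexp"
    )

    func main() {
    	// Same pattern the new terminationGracePeriod field declares in the CRDs above.
    	pattern := regexp.MustCompile(`^(([0-9]+(s|m|h))+)$`)
    	for _, v := range []string{"30m", "1h30m", "720h", "Never", "1.5h"} {
    		// "30m", "1h30m" and "720h" match; "Never" and the fractional "1.5h" do not.
    		fmt.Printf("%-6s %v\n", v, pattern.MatchString(v))
    	}
    }
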
diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 30388a811f..5eacf7ffe0 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -55,6 +55,7 @@ func NewControllers( p := provisioning.NewProvisioner(kubeClient, kubernetesInterface.CoreV1(), recorder, cloudProvider, cluster) evictionQueue := terminator.NewQueue(kubernetesInterface.CoreV1(), recorder) + deletionQueue := terminator.NewDeletionQueue(kubernetesInterface.CoreV1(), recorder) disruptionQueue := orchestration.NewQueue(kubeClient, recorder, cluster, clock, p) return []controller.Controller{ @@ -67,7 +68,7 @@ func NewControllers( informer.NewPodController(kubeClient, cluster), informer.NewNodePoolController(kubeClient, cluster), informer.NewNodeClaimController(kubeClient, cluster), - termination.NewController(kubeClient, cloudProvider, terminator.NewTerminator(clock, kubeClient, evictionQueue), recorder), + termination.NewController(kubeClient, cloudProvider, terminator.NewTerminator(clock, kubeClient, evictionQueue, deletionQueue), recorder), metricspod.NewController(kubeClient), metricsnodepool.NewController(kubeClient), metricsnode.NewController(cluster), diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go index 7cefe5b2aa..a1543e8825 100644 --- a/pkg/controllers/node/termination/controller.go +++ b/pkg/controllers/node/termination/controller.go @@ -25,6 +25,7 @@ import ( "golang.org/x/time/rate" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/workqueue" "knative.dev/pkg/logging" controllerruntime "sigs.k8s.io/controller-runtime" @@ -77,13 +78,24 @@ func (c *Controller) Finalize(ctx context.Context, node *v1.Node) (reconcile.Res if !controllerutil.ContainsFinalizer(node, v1beta1.TerminationFinalizer) { return reconcile.Result{}, nil } + + nodeGracePeriodExpirationTime, err := c.terminationGracePeriodExpirationTime(ctx, node) + if err != nil { + return reconcile.Result{}, fmt.Errorf("couldn't evaluate node's terminationGracePeriod, %w", err) + } else if nodeGracePeriodExpirationTime != nil && time.Now().After(*nodeGracePeriodExpirationTime) { + logging.FromContext(ctx).Infof("node's terminationGracePeriod has expired, adding non-graceful shutdown taint: %v", node.Name) + if err := c.terminator.Taint(ctx, node, v1beta1.DisruptionNonGracefulShutdown); err != nil { + return reconcile.Result{}, fmt.Errorf("error while tainting node with node.kubernetes.io/out-of-service=nodeshutdown:NoExecute, %w", err) + } + } + if err := c.deleteAllNodeClaims(ctx, node); err != nil { return reconcile.Result{}, fmt.Errorf("deleting nodeclaims, %w", err) } - if err := c.terminator.Taint(ctx, node); err != nil { - return reconcile.Result{}, fmt.Errorf("tainting node, %w", err) + if err := c.terminator.Taint(ctx, node, v1beta1.DisruptionNoScheduleTaint); err != nil { + return reconcile.Result{}, fmt.Errorf("tainting node with karpenter.sh/disruption taint, %w", err) } - if err := c.terminator.Drain(ctx, node); err != nil { + if err := c.terminator.Drain(ctx, node, nodeGracePeriodExpirationTime); err != nil { if !terminator.IsNodeDrainError(err) { return reconcile.Result{}, fmt.Errorf("draining node, %w", err) } @@ -138,6 +150,53 @@ func (c *Controller) removeFinalizer(ctx context.Context, n *v1.Node) error { return nil } +func (c *Controller) terminationGracePeriodExpirationTime(ctx context.Context, node *v1.Node) (*time.Time, error) { + + nodeClaim := 
&v1beta1.NodeClaim{} + + if len(node.OwnerReferences) == 0 { + logging.FromContext(ctx).Errorf("node has no owner, could not find NodeClaim for Node: %v", node.Name) + return nil, nil + } + + // assume the only Node ownerRef is the NodeClaim + nodeClaimName := types.NamespacedName{ + Name: node.OwnerReferences[0].Name, + } + if err := c.kubeClient.Get(ctx, nodeClaimName, nodeClaim); err != nil { + logging.FromContext(ctx).Errorf("could not find NodeClaim for Node: %v", node.Name) + return nil, err + } + + // TODO: remove, holding this here in case we switch the preferred implementation + // nodePool := &v1beta1.NodePool{} + // // assume the only NodeClaim ownerRef is the NodePool + // nodePoolName := types.NamespacedName{ + // Name: nodeClaim.OwnerReferences[0].Name, + // } + // if err := c.kubeClient.Get(ctx, nodePoolName, nodePool); err != nil { + // logging.FromContext(ctx).Errorf("could not find NodePool for NodeClaim: %v", nodeClaim.Name) + // return nil, err + // } + + // if nodePool.Spec.Disruption.TerminationGracePeriod != nil { + // expirationTime := node.DeletionTimestamp.Time.Add(nodePool.Spec.Disruption.TerminationGracePeriod.Duration) + // c.recorder.Publish(terminatorevents.NodeTerminationGracePeriod(node, expirationTime, fmt.Sprintf("%s", nodePool.Spec.Disruption.TerminationGracePeriod))) + // // logging.FromContext(ctx).Infof("node %v will be forcefully terminated at %v (terminationGracePeriod=%v)", node.Name, expirationTime, nodePool.Spec.Disruption.TerminationGracePeriod) + // return &expirationTime, nil + // } + + if nodeClaim.Spec.TerminationGracePeriod != nil { + expirationTime := node.DeletionTimestamp.Time.Add(nodeClaim.Spec.TerminationGracePeriod.Duration) + c.recorder.Publish(terminatorevents.NodeTerminationGracePeriod(node, expirationTime, fmt.Sprintf("%s", nodeClaim.Spec.TerminationGracePeriod))) + if node.DeletionTimestamp.Time.Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Before(time.Now()) { + return &expirationTime, nil + } + } + + return nil, nil +} + func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontroller.Builder { return operatorcontroller.Adapt(controllerruntime. NewControllerManagedBy(m). 
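As the inline comment notes, the lookup above assumes that node.OwnerReferences[0] is the NodeClaim. A more defensive variant would select the owner reference by kind; the sketch below is illustrative only (findNodeClaimOwner is a hypothetical helper, not part of this diff):

    package termination

    import (
    	v1 "k8s.io/api/core/v1"
    	"k8s.io/apimachinery/pkg/types"
    )

    // findNodeClaimOwner returns the name of the Node's NodeClaim owner, if any,
    // by matching on the owner reference kind instead of assuming it is the first entry.
    func findNodeClaimOwner(node *v1.Node) (types.NamespacedName, bool) {
    	for _, ref := range node.OwnerReferences {
    		if ref.Kind == "NodeClaim" {
    			return types.NamespacedName{Name: ref.Name}, true
    		}
    	}
    	return types.NamespacedName{}, false
    }
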
diff --git a/pkg/controllers/node/termination/suite_test.go b/pkg/controllers/node/termination/suite_test.go index 5b1432f3c9..e0eb35e87e 100644 --- a/pkg/controllers/node/termination/suite_test.go +++ b/pkg/controllers/node/termination/suite_test.go @@ -56,6 +56,7 @@ var fakeClock *clock.FakeClock var cloudProvider *fake.CloudProvider var recorder *test.EventRecorder var queue *terminator.Queue +var deletionQueue *terminator.DeletionQueue func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -70,7 +71,8 @@ var _ = BeforeSuite(func() { cloudProvider = fake.NewCloudProvider() recorder = test.NewEventRecorder() queue = terminator.NewQueue(env.KubernetesInterface.CoreV1(), recorder) - terminationController = termination.NewController(env.Client, cloudProvider, terminator.NewTerminator(fakeClock, env.Client, queue), recorder) + deletionQueue = terminator.NewDeletionQueue(env.KubernetesInterface.CoreV1(), recorder) + terminationController = termination.NewController(env.Client, cloudProvider, terminator.NewTerminator(fakeClock, env.Client, queue, deletionQueue), recorder) }) var _ = AfterSuite(func() { diff --git a/pkg/controllers/node/termination/terminator/events/events.go b/pkg/controllers/node/termination/terminator/events/events.go index 84c4b07bc0..a56d0cd4b9 100644 --- a/pkg/controllers/node/termination/terminator/events/events.go +++ b/pkg/controllers/node/termination/terminator/events/events.go @@ -18,6 +18,7 @@ package events import ( "fmt" + "time" v1 "k8s.io/api/core/v1" @@ -34,6 +35,16 @@ func EvictPod(pod *v1.Pod) events.Event { } } +func DeletePod(pod *v1.Pod) events.Event { + return events.Event{ + InvolvedObject: pod, + Type: v1.EventTypeNormal, + Reason: "Deleted", + Message: fmt.Sprintf("Deleted pod regardless of PDBs and lifecycle hooks, %v seconds before node termination to accommodate its terminationGracePeriodSeconds", pod.Spec.TerminationGracePeriodSeconds), + DedupeValues: []string{pod.Name}, + } +} + func NodeFailedToDrain(node *v1.Node, err error) events.Event { return events.Event{ InvolvedObject: node, @@ -43,3 +54,13 @@ func NodeFailedToDrain(node *v1.Node, err error) events.Event { DedupeValues: []string{node.Name}, } } + +func NodeTerminationGracePeriod(node *v1.Node, expirationTime time.Time, terminationGracePeriod string) events.Event { + return events.Event{ + InvolvedObject: node, + Type: v1.EventTypeWarning, + Reason: "TerminationGracePeriodExpiration", + Message: fmt.Sprintf("Node will have the out-of-service taint applied at: %s (TerminationGracePeriod: %s)", expirationTime, terminationGracePeriod), + DedupeValues: []string{node.Name}, + } +} diff --git a/pkg/controllers/node/termination/terminator/eviction.go b/pkg/controllers/node/termination/terminator/eviction.go index c724c7500e..8b93432158 100644 --- a/pkg/controllers/node/termination/terminator/eviction.go +++ b/pkg/controllers/node/termination/terminator/eviction.go @@ -65,15 +65,22 @@ func IsNodeDrainError(err error) bool { type Queue struct { workqueue.RateLimitingInterface set.Set + deleteSet set.Set coreV1Client corev1.CoreV1Interface recorder events.Recorder } +type PodAction struct { + action string + pod types.NamespacedName +} + func NewQueue(coreV1Client corev1.CoreV1Interface, recorder events.Recorder) *Queue { queue := &Queue{ RateLimitingInterface: workqueue.NewRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(evictionQueueBaseDelay, evictionQueueMaxDelay)), Set: set.NewSet(), + deleteSet: set.NewSet(), coreV1Client: coreV1Client, recorder: recorder, } @@ -89,11 
+96,11 @@ func (q *Queue) Builder(_ context.Context, m manager.Manager) controller.Builder } // Add adds pods to the Queue -func (q *Queue) Add(pods ...*v1.Pod) { +func (q *Queue) Add(action string, pods ...*v1.Pod) { for _, pod := range pods { if nn := client.ObjectKeyFromObject(pod); !q.Set.Contains(nn) { - q.Set.Add(nn) - q.RateLimitingInterface.Add(nn) + q.Set.Add(PodAction{action: action, pod: nn}) + q.RateLimitingInterface.Add(PodAction{action: action, pod: nn}) } } } @@ -110,16 +117,25 @@ func (q *Queue) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.R if shutdown { return reconcile.Result{}, fmt.Errorf("EvictionQueue is broken and has shutdown") } - nn := item.(types.NamespacedName) - defer q.RateLimitingInterface.Done(nn) - // Evict pod - if q.Evict(ctx, nn) { - q.RateLimitingInterface.Forget(nn) - q.Set.Remove(nn) - return reconcile.Result{RequeueAfter: controller.Immediately}, nil + podAction := item.(PodAction) + defer q.RateLimitingInterface.Done(podAction) + if podAction.action == "evict" { + // Evict pod + if q.Evict(ctx, podAction.pod) { + q.RateLimitingInterface.Forget(podAction) + q.Set.Remove(podAction) + return reconcile.Result{RequeueAfter: controller.Immediately}, nil + } + } else if podAction.action == "delete" { + // Delete pod + if q.Delete(ctx, podAction.pod) { + q.RateLimitingInterface.Forget(podAction) + q.Set.Remove(podAction) + return reconcile.Result{RequeueAfter: controller.Immediately}, nil + } } // Requeue pod if eviction failed - q.RateLimitingInterface.AddRateLimited(nn) + q.RateLimitingInterface.AddRateLimited(podAction) return reconcile.Result{RequeueAfter: controller.Immediately}, nil } @@ -148,6 +164,20 @@ func (q *Queue) Evict(ctx context.Context, nn types.NamespacedName) bool { return true } +// Delete returns true if successful delete call, and false if there was an error +func (q *Queue) Delete(ctx context.Context, nn types.NamespacedName) bool { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("pod", nn)) + if err := q.coreV1Client.Pods(nn.Namespace).Delete(ctx, nn.Name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { // 404 + return true + } + logging.FromContext(ctx).Errorf("deleting pod, %s", err) + return false + } + q.recorder.Publish(terminatorevents.DeletePod(&v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: nn.Name, Namespace: nn.Namespace}})) + return true +} + func (q *Queue) Reset() { q.RateLimitingInterface = workqueue.NewRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(evictionQueueBaseDelay, evictionQueueMaxDelay)) q.Set = set.NewSet() diff --git a/pkg/controllers/node/termination/terminator/suite_test.go b/pkg/controllers/node/termination/terminator/suite_test.go index 7719932fd8..76a220fa03 100644 --- a/pkg/controllers/node/termination/terminator/suite_test.go +++ b/pkg/controllers/node/termination/terminator/suite_test.go @@ -123,4 +123,12 @@ var _ = Describe("Eviction/Queue", func() { Expect(queue.Evict(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace})).To(BeFalse()) }) }) + + Context("Pod Deletion API", func() { + It("should succeed with no event when the pod is not found", func() { + ExpectApplied(ctx, env.Client) + Expect(queue.Delete(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace})).To(BeTrue()) + Expect(recorder.Events()).To(HaveLen(0)) + }) + }) }) diff --git a/pkg/controllers/node/termination/terminator/terminator.go b/pkg/controllers/node/termination/terminator/terminator.go index ac604b63ef..ab8a7bf8b5 100644 --- 
a/pkg/controllers/node/termination/terminator/terminator.go +++ b/pkg/controllers/node/termination/terminator/terminator.go @@ -28,7 +28,6 @@ import ( "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/karpenter/pkg/apis/v1beta1" podutil "sigs.k8s.io/karpenter/pkg/utils/pod" ) @@ -38,7 +37,7 @@ type Terminator struct { evictionQueue *Queue } -func NewTerminator(clk clock.Clock, kubeClient client.Client, eq *Queue) *Terminator { +func NewTerminator(clk clock.Clock, kubeClient client.Client, eq *Queue, dq *DeletionQueue) *Terminator { return &Terminator{ clock: clk, kubeClient: kubeClient, @@ -46,18 +45,18 @@ func NewTerminator(clk clock.Clock, kubeClient client.Client, eq *Queue) *Termin } } -// Taint idempotently adds the karpenter.sh/disruption taint to a node with a NodeClaim -func (t *Terminator) Taint(ctx context.Context, node *v1.Node) error { +// Taint idempotently adds an arbitrary taint to a node with a NodeClaim +func (t *Terminator) Taint(ctx context.Context, node *v1.Node, taint v1.Taint) error { stored := node.DeepCopy() - // If the taint already has the karpenter.sh/disruption=disrupting:NoSchedule taint, do nothing. + // If the node already has the correct taint (key, value, and effect), do nothing. if _, ok := lo.Find(node.Spec.Taints, func(t v1.Taint) bool { - return v1beta1.IsDisruptingTaint(t) + return t.MatchTaint(&taint) && t.Value == taint.Value && t.Effect == taint.Effect }); !ok { - // If the taint key exists (but with a different value or effect), remove it. + // Otherwise, if the taint key exists (but with a different value or effect), remove it. node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t v1.Taint, _ int) bool { - return t.Key == v1beta1.DisruptionTaintKey + return t.Key == taint.Key }) - node.Spec.Taints = append(node.Spec.Taints, v1beta1.DisruptionNoScheduleTaint) + node.Spec.Taints = append(node.Spec.Taints, taint) } // Adding this label to the node ensures that the node is removed from the load-balancer target group // while it is draining and before it is terminated. 
This prevents 500s coming prior to health check @@ -78,13 +77,30 @@ func (t *Terminator) Taint(ctx context.Context, node *v1.Node) error { // Drain evicts pods from the node and returns true when all pods are evicted // https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown -func (t *Terminator) Drain(ctx context.Context, node *v1.Node) error { +func (t *Terminator) Drain(ctx context.Context, node *v1.Node, nodeGracePeriodExpirationTime *time.Time) error { // Get evictable pods pods := &v1.PodList{} if err := t.kubeClient.List(ctx, pods, client.MatchingFields{"spec.nodeName": node.Name}); err != nil { return fmt.Errorf("listing pods on node, %w", err) } + if nodeGracePeriodExpirationTime != nil { + // preemptively begin deleting pods that would be terminated with the node without receiving their full grace period in termination time + podsToDelete := lo.FilterMap(pods.Items, func(po v1.Pod, _ int) (*v1.Pod, bool) { + p := lo.ToPtr(po) + return p, t.shouldDeletePodToEnsureGracePeriod(nodeGracePeriodExpirationTime, p) + }) + // Enqueue for deletion + + if len(podsToDelete) > 0 { + logging.FromContext(ctx).Infof("pods to delete: %v", len(podsToDelete)) + } + for _, pod := range podsToDelete { + logging.FromContext(ctx).Infof("delete pod: %v/%v", pod.Namespace, pod.Name) + } + t.Delete(podsToDelete) + } + // Skip node due to pods that are not able to be evicted podsToEvict := lo.FilterMap(pods.Items, func(po v1.Pod, _ int) (*v1.Pod, bool) { p := lo.ToPtr(po) @@ -136,19 +152,34 @@ func (t *Terminator) Evict(pods []*v1.Pod) { // c. critical non-daemonsets // d. critical daemonsets if len(nonCriticalNonDaemon) != 0 { - t.evictionQueue.Add(nonCriticalNonDaemon...) + t.evictionQueue.Add("evict", nonCriticalNonDaemon...) } else if len(nonCriticalDaemon) != 0 { - t.evictionQueue.Add(nonCriticalDaemon...) + t.evictionQueue.Add("evict", nonCriticalDaemon...) } else if len(criticalNonDaemon) != 0 { - t.evictionQueue.Add(criticalNonDaemon...) + t.evictionQueue.Add("evict", criticalNonDaemon...) } else if len(criticalDaemon) != 0 { - t.evictionQueue.Add(criticalDaemon...) + t.evictionQueue.Add("evict", criticalDaemon...) } } +// Delete bypasses the eviction API to ensure a pod will begin terminating, regardless of PDBs +func (t *Terminator) Delete(pods []*v1.Pod) { + t.evictionQueue.Add("delete", pods...) +} + func (t *Terminator) isStuckTerminating(pod *v1.Pod) bool { if pod.DeletionTimestamp == nil { return false } return t.clock.Now().After(pod.DeletionTimestamp.Time.Add(1 * time.Minute)) } + +func (t *Terminator) shouldDeletePodToEnsureGracePeriod(nodeGracePeriodExpirationTime *time.Time, pod *v1.Pod) bool { + if nodeGracePeriodExpirationTime == nil || pod.Spec.TerminationGracePeriodSeconds == nil { + return false + } + + // check if the pod would be force terminated before being allowed its full terminationGracePeriodSeconds in time to gracefully terminate + // eg: if a node will be force terminated in 30m, but the current pod has a grace period of 45m, we return true + return time.Now().After(nodeGracePeriodExpirationTime.Add(time.Duration(*pod.Spec.TerminationGracePeriodSeconds) * time.Second * -1)) +}
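
To make the timing rule in shouldDeletePodToEnsureGracePeriod concrete: with a node that will be forcefully terminated 30 minutes from now, a pod with a 45-minute terminationGracePeriodSeconds is already past its deletion cutoff and is deleted immediately, while a pod with a 10-minute grace period can wait another 20 minutes. The standalone example below is illustrative only (the helper and the sample values are made up for this sketch):

    package main

    import (
    	"fmt"
    	"time"
    )

    // cutoff returns the latest time a pod can start terminating and still receive its
    // full grace period before the node's forced-shutdown time.
    func cutoff(nodeShutdown time.Time, podGraceSeconds int64) time.Time {
    	return nodeShutdown.Add(-time.Duration(podGraceSeconds) * time.Second)
    }

    func main() {
    	now := time.Now()
    	nodeShutdown := now.Add(30 * time.Minute) // node terminationGracePeriod expires in 30m

    	// Pod with 45m of grace: its cutoff was 15m ago, so it should be deleted now.
    	fmt.Println(now.After(cutoff(nodeShutdown, 45*60))) // true
    	// Pod with 10m of grace: its cutoff is 20m away, so it can still drain normally.
    	fmt.Println(now.After(cutoff(nodeShutdown, 10*60))) // false
    }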