diff --git a/pkg/controllers/node/health/controller.go b/pkg/controllers/node/health/controller.go index 8cb59b7238..ede8f2725b 100644 --- a/pkg/controllers/node/health/controller.go +++ b/pkg/controllers/node/health/controller.go @@ -136,10 +136,16 @@ func (c *Controller) deleteNodeClaim(ctx context.Context, nodeClaim *v1.NodeClai // The deletion timestamp has successfully been set for the Node, update relevant metrics. log.FromContext(ctx).V(1).Info("deleting unhealthy node") metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ - metrics.ReasonLabel: pretty.ToSnakeCase(string(unhealthyNodeCondition.Type)), + metrics.ReasonLabel: metrics.UnhealthyReason, metrics.NodePoolLabel: node.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: node.Labels[v1.CapacityTypeLabelKey], }) + NodeClaimsUnhealthyDisruptedTotal.Inc(map[string]string{ + Condition: pretty.ToSnakeCase(string(unhealthyNodeCondition.Type)), + metrics.NodePoolLabel: node.Labels[v1.NodePoolLabelKey], + metrics.CapacityTypeLabel: node.Labels[v1.CapacityTypeLabelKey], + ImageID: nodeClaim.Status.ImageID, + }) return reconcile.Result{}, nil } diff --git a/pkg/controllers/node/health/metrics.go b/pkg/controllers/node/health/metrics.go new file mode 100644 index 0000000000..aba68cdcca --- /dev/null +++ b/pkg/controllers/node/health/metrics.go @@ -0,0 +1,46 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package health + +import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" + "github.com/prometheus/client_golang/prometheus" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "sigs.k8s.io/karpenter/pkg/metrics" +) + +const ( + ImageID = "image_id" // ImageID is the metric label key whose value is the NodeClaim's Status.ImageID. + Condition = "condition" // Condition is the metric label key whose value is the snake_cased unhealthy node condition type. +) + +var NodeClaimsUnhealthyDisruptedTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: metrics.NodeClaimSubsystem, + Name: "unhealthy_disrupted_total", + Help: "Number of unhealthy nodeclaims disrupted in total by Karpenter. Labeled by the unhealthy condition that triggered the disruption, the owning nodepool, the capacity type, and the image ID.", + }, + []string{ + Condition, + metrics.NodePoolLabel, + metrics.CapacityTypeLabel, + ImageID, + }, +) diff --git a/pkg/controllers/node/health/suite_test.go b/pkg/controllers/node/health/suite_test.go index 5bfae95547..7418ab723b 100644 --- a/pkg/controllers/node/health/suite_test.go +++ b/pkg/controllers/node/health/suite_test.go @@ -360,7 +360,11 @@ var _ = Describe("Node Health", func() { Expect(nodeClaim.DeletionTimestamp).ToNot(BeNil()) ExpectMetricCounterValue(metrics.NodeClaimsDisruptedTotal, 1, map[string]string{ - metrics.ReasonLabel: pretty.ToSnakeCase(string(cloudProvider.RepairPolicies()[0].ConditionType)), + metrics.ReasonLabel: metrics.UnhealthyReason, + metrics.NodePoolLabel: nodePool.Name, + }) + ExpectMetricCounterValue(health.NodeClaimsUnhealthyDisruptedTotal, 1, map[string]string{ + health.Condition: pretty.ToSnakeCase(string(cloudProvider.RepairPolicies()[0].ConditionType)), metrics.NodePoolLabel: nodePool.Name, }) }) diff --git a/pkg/metrics/constants.go b/pkg/metrics/constants.go index 32d8bdcac8..a958aeb4a8 100644 --- a/pkg/metrics/constants.go +++ b/pkg/metrics/constants.go @@ -33,6 +33,7 @@ const ( // Reasons for CREATE/DELETE shared metrics ProvisionedReason = "provisioned" ExpiredReason = "expired" + UnhealthyReason = "unhealthy" ) // DurationBuckets 
returns a []float64 of default threshold values for duration histograms.