diff --git a/pkg/controllers/node/health/controller.go b/pkg/controllers/node/health/controller.go index 8cb59b723..6a8899f3a 100644 --- a/pkg/controllers/node/health/controller.go +++ b/pkg/controllers/node/health/controller.go @@ -135,10 +135,11 @@ func (c *Controller) deleteNodeClaim(ctx context.Context, nodeClaim *v1.NodeClai } // The deletion timestamp has successfully been set for the Node, update relevant metrics. log.FromContext(ctx).V(1).Info("deleting unhealthy node") - metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ - metrics.ReasonLabel: pretty.ToSnakeCase(string(unhealthyNodeCondition.Type)), + NodeClaimsUnhealthyDisruptedTotal.Inc(map[string]string{ + Condition: pretty.ToSnakeCase(string(unhealthyNodeCondition.Type)), metrics.NodePoolLabel: node.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: node.Labels[v1.CapacityTypeLabelKey], + ImageID: nodeClaim.Status.ImageID, }) return reconcile.Result{}, nil } diff --git a/pkg/controllers/node/health/metrics.go b/pkg/controllers/node/health/metrics.go new file mode 100644 index 000000000..486d14bfa --- /dev/null +++ b/pkg/controllers/node/health/metrics.go @@ -0,0 +1,45 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package health + +import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" + "github.com/prometheus/client_golang/prometheus" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/karpenter/pkg/metrics" +) + +const ( + ImageID = "image_id" + Condition = "condition" +) + +var NodeClaimsUnhealthyDisruptedTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: metrics.NodeClaimSubsystem, + Name: "unhealthy_disrupted_total", + Help: "Number of unhealthy nodeclaims disrupted in total by Karpenter. Labeled by the condition on the node that triggered the disruption, the owning nodepool, the capacity type, and the image ID.", + }, + []string{ + Condition, + metrics.NodePoolLabel, + metrics.CapacityTypeLabel, + ImageID, + }, +) diff --git a/pkg/controllers/node/health/suite_test.go b/pkg/controllers/node/health/suite_test.go index 5bfae9554..1ac2acd0b 100644 --- a/pkg/controllers/node/health/suite_test.go +++ b/pkg/controllers/node/health/suite_test.go @@ -359,8 +359,8 @@ var _ = Describe("Node Health", func() { nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(nodeClaim.DeletionTimestamp).ToNot(BeNil()) - ExpectMetricCounterValue(metrics.NodeClaimsDisruptedTotal, 1, map[string]string{ - metrics.ReasonLabel: pretty.ToSnakeCase(string(cloudProvider.RepairPolicies()[0].ConditionType)), + ExpectMetricCounterValue(health.NodeClaimsUnhealthyDisruptedTotal, 1, map[string]string{ + health.Condition: pretty.ToSnakeCase(string(cloudProvider.RepairPolicies()[0].ConditionType)), metrics.NodePoolLabel: nodePool.Name, }) })