From ca201c35ee812a5157c8bd45d782d2406848a0b0 Mon Sep 17 00:00:00 2001
From: Evan Baker
Date: Tue, 25 Feb 2025 23:29:37 +0000
Subject: [PATCH] feat: add metric for NNC init failures

Signed-off-by: Evan Baker
---
 cns/service/main.go    | 12 ++++-----
 cns/service/metrics.go | 58 +++++++++++++++++++++++++++---------------
 2 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/cns/service/main.go b/cns/service/main.go
index fb1655accd..1846dddf65 100644
--- a/cns/service/main.go
+++ b/cns/service/main.go
@@ -1061,7 +1061,7 @@ func main() {
 				return errors.Wrap(err, "failed to start fsnotify watcher, will retry")
 			}
 			return nil
-		}, retry.DelayType(retry.BackOffDelay), retry.Attempts(0), retry.Context(rootCtx)) // infinite cancellable exponential backoff retrier
+		}, retry.DelayType(retry.BackOffDelay), retry.UntilSucceeded(), retry.Context(rootCtx)) // infinite cancellable exponential backoff retrier
 		}()
 	}
 
@@ -1450,20 +1450,18 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
 	// aks addons to come up so retry a bit more aggresively here.
 	// will retry 10 times maxing out at a minute taking about 8 minutes before it gives up.
 	attempt := 0
-	err = retry.Do(func() error {
+	_ = retry.Do(func() error {
 		attempt++
 		logger.Printf("reconciling initial CNS state attempt: %d", attempt)
 		err = reconcileInitialCNSState(ctx, directscopedcli, httpRestServiceImplementation, podInfoByIPProvider)
 		if err != nil {
 			logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err)
+			nncInitFailure.Inc()
 		}
 		return errors.Wrap(err, "failed to initialize CNS state")
-	}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute))
-	if err != nil {
-		return err
-	}
+	}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute), retry.UntilSucceeded())
 	logger.Printf("reconciled initial CNS state after %d attempts", attempt)
-
+	hasNNCInitialized.Set(1)
 	scheme := kuberuntime.NewScheme()
 	if err := corev1.AddToScheme(scheme); err != nil { //nolint:govet // intentional shadow
 		return errors.Wrap(err, "failed to add corev1 to scheme")
diff --git a/cns/service/metrics.go b/cns/service/metrics.go
index ca4da8a7f4..44f9e9a945 100644
--- a/cns/service/metrics.go
+++ b/cns/service/metrics.go
@@ -5,31 +5,49 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
 )
 
-// managerStartFailures is a monotic counter which tracks the number of times the controller-runtime
-// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
-// of increase over a period of time. A positive rate of change indicates that the CNS is actively
-// failing and retrying.
-var managerStartFailures = prometheus.NewCounter(
-	prometheus.CounterOpts{
-		Name: "cns_ctrlmanager_start_failures_total",
-		Help: "Number of times the controller-runtime manager failed to start.",
-	},
-)
-
-// nncReconcilerStartFailures is a monotic counter which tracks the number of times the NNC reconciler
-// has failed to start within the timeout period. To drive alerting based on this metric, it is
-// recommended to use the rate of increase over a period of time. A positive rate of change indicates
-// that the CNS is actively failing and retrying.
-var nncReconcilerStartFailures = prometheus.NewCounter(
-	prometheus.CounterOpts{
-		Name: "cns_nnc_reconciler_start_failures_total",
-		Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
-	},
+var (
+	// managerStartFailures is a monotonic counter which tracks the number of times the controller-runtime
+	// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
+	// of increase over a period of time. A positive rate of change indicates that the CNS is actively
+	// failing and retrying.
+	managerStartFailures = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_ctrlmanager_start_failures_total",
+			Help: "Number of times the controller-runtime manager failed to start.",
+		},
+	)
+	// nncReconcilerStartFailures is a monotonic counter which tracks the number of times the NNC reconciler
+	// has failed to start within the timeout period. To drive alerting based on this metric, it is
+	// recommended to use the rate of increase over a period of time. A positive rate of change indicates
+	// that the CNS is actively failing and retrying.
+	nncReconcilerStartFailures = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_nnc_reconciler_start_failures_total",
+			Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
+		},
+	)
+	// nncInitFailure is a monotonic counter which tracks the number of times the initial NNC reconcile
+	// has failed.
+	nncInitFailure = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_nnc_init_failures_total",
+			Help: "Number of times the initial NNC reconcile has failed.",
+		},
+	)
+	// hasNNCInitialized is a gauge which tracks whether the initial NNC reconcile has completed.
+	hasNNCInitialized = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "cns_nnc_initialized",
+			Help: "Whether the initial NNC reconcile has completed.",
+		},
+	)
 )
 
 func init() {
 	metrics.Registry.MustRegister(
 		managerStartFailures,
 		nncReconcilerStartFailures,
+		nncInitFailure,
+		hasNNCInitialized,
 	)
 }
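
Note (illustration, not part of the patch): the change pairs an until-succeeded retry loop with a failure counter and a readiness gauge, so alerting can key off the rate of increase of cns_nnc_init_failures_total while cns_nnc_initialized distinguishes "still retrying" from "done". The minimal standalone sketch below shows that pattern; it assumes the avast/retry-go/v4 and prometheus/client_golang packages the diff appears to use, and the example_* metric names, the initialize helper, and the delays are hypothetical, not taken from the CNS code.

// Minimal sketch of the retry-until-succeeded pattern with a failure counter
// and a readiness gauge. All names prefixed with "example" are illustrative.
package main

import (
	"context"
	"errors"
	"log"
	"time"

	"github.com/avast/retry-go/v4"
	"github.com/prometheus/client_golang/prometheus"
)

var (
	// Counts every failed attempt; alert on a sustained positive rate,
	// e.g. rate(example_init_failures_total[5m]) > 0.
	initFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "example_init_failures_total",
		Help: "Number of times initialization has failed.",
	})
	// Stays 0 while retrying, set to 1 once initialization completes.
	initialized = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "example_initialized",
		Help: "Whether initialization has completed.",
	})
)

// initialize stands in for the real reconcile work; here it fails twice before succeeding.
func initialize(attempt int) error {
	if attempt < 3 {
		return errors.New("dependencies not ready yet")
	}
	return nil
}

func main() {
	prometheus.MustRegister(initFailures, initialized)

	ctx := context.Background()
	attempt := 0

	// Retry until success (or context cancellation), backing off to at most a minute.
	// The returned error is ignored on purpose, matching the `_ = retry.Do` in the diff.
	_ = retry.Do(func() error {
		attempt++
		if err := initialize(attempt); err != nil {
			initFailures.Inc() // visible to alerting while the loop keeps retrying
			return err
		}
		return nil
	}, retry.Context(ctx), retry.Delay(time.Second), retry.MaxDelay(time.Minute), retry.UntilSucceeded())

	// Reached only after the loop exits, i.e. on success or a cancelled context.
	initialized.Set(1)
	log.Printf("initialized after %d attempts", attempt)
}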