feat: add metric for NNC init failures #3453

Open · wants to merge 1 commit into base: master
12 changes: 5 additions & 7 deletions cns/service/main.go
@@ -1061,7 +1061,7 @@ func main() {
 				return errors.Wrap(err, "failed to start fsnotify watcher, will retry")
 			}
 			return nil
-		}, retry.DelayType(retry.BackOffDelay), retry.Attempts(0), retry.Context(rootCtx)) // infinite cancellable exponential backoff retrier
+		}, retry.DelayType(retry.BackOffDelay), retry.UntilSucceeded(), retry.Context(rootCtx)) // infinite cancellable exponential backoff retrier
 		}()
 	}
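The swap above replaces the magic `retry.Attempts(0)` with the more explicit `retry.UntilSucceeded()` while keeping the behavior described in the trailing comment. As a point of reference, here is a minimal, runnable sketch of that "infinite cancellable exponential backoff retrier" pattern, assuming github.com/avast/retry-go/v4; `watchSomething` and the five-second timeout are hypothetical stand-ins, not CNS code:

```go
package main

import (
	"context"
	"errors"
	"log"
	"time"

	retry "github.com/avast/retry-go/v4"
)

// watchSomething is a hypothetical dependency that never becomes ready.
func watchSomething() error { return errors.New("watcher not ready") }

func main() {
	// The context is the only thing that stops the retrier.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	err := retry.Do(func() error {
		return watchSomething()
	},
		retry.DelayType(retry.BackOffDelay), // exponential backoff between attempts
		retry.UntilSucceeded(),              // retry forever instead of a fixed attempt count
		retry.Context(ctx),                  // ...but give up as soon as ctx is cancelled
	)
	log.Printf("retrier exited: %v", err)
}
```

When the context expires, `retry.Do` returns with the context error, so the retrier stays cancellable even though it has no attempt limit.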

@@ -1450,20 +1450,18 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
 	// aks addons to come up so retry a bit more aggressively here.
 	// will retry 10 times maxing out at a minute taking about 8 minutes before it gives up.
 	attempt := 0
-	err = retry.Do(func() error {
+	_ = retry.Do(func() error {
 		attempt++
 		logger.Printf("reconciling initial CNS state attempt: %d", attempt)
 		err = reconcileInitialCNSState(ctx, directscopedcli, httpRestServiceImplementation, podInfoByIPProvider)
 		if err != nil {
 			logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err)
+			nncInitFailure.Inc()
 		}
 		return errors.Wrap(err, "failed to initialize CNS state")
-	}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute))
-	if err != nil {
-		return err
-	}
+	}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute), retry.UntilSucceeded())
 	logger.Printf("reconciled initial CNS state after %d attempts", attempt)
 
+	hasNNCInitialized.Set(1)
 	scheme := kuberuntime.NewScheme()
 	if err := corev1.AddToScheme(scheme); err != nil { //nolint:govet // intentional shadow
 		return errors.Wrap(err, "failed to add corev1 to scheme")
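Taken together, the hunk above pairs a counter with a gauge: every failed pass through reconcileInitialCNSState bumps nncInitFailure, and hasNNCInitialized flips to 1 once the retried initialization returns. Below is a compilable sketch of that pairing under stated assumptions: `initState`, `waitForInit`, and the `example_*` metric names are invented for illustration and are not the CNS wiring.

```go
package main

import (
	"context"
	"time"

	retry "github.com/avast/retry-go/v4"
	"github.com/prometheus/client_golang/prometheus"
)

var (
	// initFailures counts failed initialization attempts (cf. nncInitFailure).
	initFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "example_init_failures_total",
		Help: "Number of times initialization has failed.",
	})
	// initialized reports whether initialization has completed (cf. hasNNCInitialized).
	initialized = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "example_initialized",
		Help: "Whether initialization has completed.",
	})
)

// initState is a hypothetical stand-in for the real reconcile call.
func initState(ctx context.Context) error { return nil }

func waitForInit(ctx context.Context) {
	_ = retry.Do(func() error {
		if err := initState(ctx); err != nil {
			initFailures.Inc() // one increment per failed attempt
			return err
		}
		return nil
	}, retry.Context(ctx), retry.Delay(time.Second), retry.MaxDelay(time.Minute), retry.UntilSucceeded())

	// Reached when the retrier returns: on success, or once ctx is cancelled.
	initialized.Set(1)
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	waitForInit(ctx)
}
```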
58 changes: 38 additions & 20 deletions cns/service/metrics.go
@@ -5,31 +5,49 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
 )
 
-// managerStartFailures is a monotonic counter which tracks the number of times the controller-runtime
-// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
-// of increase over a period of time. A positive rate of change indicates that the CNS is actively
-// failing and retrying.
-var managerStartFailures = prometheus.NewCounter(
-	prometheus.CounterOpts{
-		Name: "cns_ctrlmanager_start_failures_total",
-		Help: "Number of times the controller-runtime manager failed to start.",
-	},
-)
-
-// nncReconcilerStartFailures is a monotonic counter which tracks the number of times the NNC reconciler
-// has failed to start within the timeout period. To drive alerting based on this metric, it is
-// recommended to use the rate of increase over a period of time. A positive rate of change indicates
-// that the CNS is actively failing and retrying.
-var nncReconcilerStartFailures = prometheus.NewCounter(
-	prometheus.CounterOpts{
-		Name: "cns_nnc_reconciler_start_failures_total",
-		Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
-	},
-)
+var (
+	// managerStartFailures is a monotonic counter which tracks the number of times the controller-runtime
+	// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
+	// of increase over a period of time. A positive rate of change indicates that the CNS is actively
+	// failing and retrying.
+	managerStartFailures = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_ctrlmanager_start_failures_total",
+			Help: "Number of times the controller-runtime manager failed to start.",
+		},
+	)
+	// nncReconcilerStartFailures is a monotonic counter which tracks the number of times the NNC reconciler
+	// has failed to start within the timeout period. To drive alerting based on this metric, it is
+	// recommended to use the rate of increase over a period of time. A positive rate of change indicates
+	// that the CNS is actively failing and retrying.
+	nncReconcilerStartFailures = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_nnc_reconciler_start_failures_total",
+			Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
+		},
+	)
+	// nncInitFailure is a monotonic counter which tracks the number of times the initial NNC reconcile
+	// has failed.
+	nncInitFailure = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_nnc_init_failures_total",
+			Help: "Number of times the initial NNC reconcile has failed.",
+		},
+	)
+	// hasNNCInitialized is a gauge which tracks whether the initial NNC reconcile has completed.
+	hasNNCInitialized = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "cns_nnc_initialized",
+			Help: "Whether the initial NNC reconcile has completed.",
+		},
+	)
+)

 func init() {
 	metrics.Registry.MustRegister(
 		managerStartFailures,
 		nncReconcilerStartFailures,
+		nncInitFailure,
+		hasNNCInitialized,
 	)
 }
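The collectors are plain package-level vars registered against controller-runtime's metrics.Registry, so they are exposed on the manager's metrics endpoint and are also easy to exercise directly. Below is a sketch of a unit test, assuming it sits in the same package as metrics.go and using the standard client_golang testutil helpers; the test name is illustrative. Alerting would follow the guidance in the metric comments, for example an expression along the lines of `rate(cns_nnc_init_failures_total[5m]) > 0`.

```go
package main

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
)

// TestNNCInitMetrics drives the two new collectors the way the reconcile
// loop does and checks the resulting values.
func TestNNCInitMetrics(t *testing.T) {
	before := testutil.ToFloat64(nncInitFailure)

	nncInitFailure.Inc()     // what the loop does on a failed attempt
	hasNNCInitialized.Set(1) // what the loop does once reconciliation succeeds

	if got := testutil.ToFloat64(nncInitFailure); got != before+1 {
		t.Fatalf("nncInitFailure = %v, want %v", got, before+1)
	}
	if got := testutil.ToFloat64(hasNNCInitialized); got != 1 {
		t.Fatalf("hasNNCInitialized = %v, want 1", got)
	}
}
```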