diff --git a/pkg/k8s/client/cell.go b/pkg/k8s/client/cell.go
index d1868a96470fe..78f7efbaedebf 100644
--- a/pkg/k8s/client/cell.go
+++ b/pkg/k8s/client/cell.go
@@ -60,9 +60,10 @@ var ClientBuilderCell = cell.Module(
 )
 
 var (
-	k8sHeartbeatControllerGroup = controller.NewGroup("k8s-heartbeat")
-	connTimeout                 = time.Minute
-	connRetryInterval           = 5 * time.Second
+	k8sHeartbeatControllerGroup    = controller.NewGroup("k8s-heartbeat")
+	k8sConnRecoveryControllerGroup = controller.NewGroup("k8s-conn-recovery")
+	connTimeout                    = time.Minute
+	connRetryInterval              = 5 * time.Second
 )
 
 // Type aliases for the clientsets to avoid name collision on 'Clientset' when composing them.
@@ -243,7 +244,14 @@ func (c *compositeClientset) onStart(startCtx cell.HookContext) error {
 	}
 
 	if err := c.waitForConn(startCtx); err != nil {
-		return err
+		if !c.config.IgnoreApiserverFailOnStart {
+			return err
+		}
+		c.logger.Warn("Unable to connect to k8s API server on startup; continuing in degraded state",
+			logfields.Error, err,
+		)
+		c.startConnRecovery()
+		return nil
 	}
 
 	c.startHeartbeat()
@@ -309,6 +317,56 @@ func (c *compositeClientset) startHeartbeat() {
 		})
 }
 
+// startConnRecovery registers the "k8s-conn-recovery" controller. onStart
+// calls it when the initial connection to the k8s API server failed and the
+// agent continued in a degraded state. The controller's DoFunc checks
+// isConnReady; once that succeeds it starts the heartbeat that onStart
+// skipped, runs the k8s version check and removes the controller again.
+func (c *compositeClientset) startConnRecovery() {
+	const controllerName = "k8s-conn-recovery"
+	c.controller.UpdateController(controllerName,
+		controller.ControllerParams{
+			Group: k8sConnRecoveryControllerGroup,
+			DoFunc: func(ctx context.Context) error {
+				if err := isConnReady(c); err != nil {
+					c.logger.Debug("k8s API server still unreachable, will retry",
+						logfields.IPAddr, c.restConfigManager.getConfig().Host,
+						logfields.Error, err,
+					)
+					// NOTE(review): returns nil rather than err, presumably so
+					// that an outage that is expected while degraded is not
+					// counted as a failed controller run - confirm.
+					return nil
+				}
+
+				c.logger.Info("Re-established connection to API server. Exiting degraded state",
+					logfields.IPAddr, c.restConfigManager.getConfig().Host,
+				)
+				// onStart returned before reaching startHeartbeat when the
+				// initial connection failed, so start it now.
+				c.startHeartbeat()
+
+				// Run the k8s version check now that the API server answers.
+				// Problems are only logged, so a failed or unsatisfied check
+				// does not keep this controller from finishing.
+				if err := k8sversion.Update(c.logger, c, c.config.EnableK8sAPIDiscovery); err != nil {
+					c.logger.Warn("k8s version check failed after reconnect", logfields.Error, err)
+				} else if !k8sversion.Capabilities().MinimalVersionMet {
+					c.logger.Warn("k8s version does not meet minimal standard",
+						"version", k8sversion.Version(),
+						"minVersion", k8sversion.MinimalVersionConstraint,
+					)
+				}
+
+				// Recovery is complete; this controller has nothing left to do.
+				c.controller.RemoveController(controllerName)
+				return nil
+			},
+			// connRetryInterval is the retry interval declared next to connTimeout.
+			RunInterval: connRetryInterval,
+		})
+}
+
 func (c *compositeClientset) waitForConn(ctx context.Context) error {
 	stop := make(chan struct{})
 	timeout := time.NewTimer(connTimeout)
diff --git a/pkg/k8s/client/config.go b/pkg/k8s/client/config.go
index d51cb6f04fed3..0d4fa423f6690 100644
--- a/pkg/k8s/client/config.go
+++ b/pkg/k8s/client/config.go
@@ -46,6 +46,11 @@ type SharedConfig struct {
 
 	// EnableAPIDiscovery enables Kubernetes API discovery
 	EnableK8sAPIDiscovery bool
+
+	// IgnoreApiserverFailOnStart controls whether a failure to connect to the
+	// k8s API server during startup is treated as fatal. When true, the agent
+	// continues starting in a degraded state instead of exiting.
+	IgnoreApiserverFailOnStart bool
 }
 
 type ClientParams struct {
@@ -75,6 +80,7 @@ var defaultSharedConfig = SharedConfig{
 	K8sClientConnectionKeepAlive: 30 * time.Second,
 	K8sHeartbeatTimeout:          30 * time.Second,
 	EnableK8sAPIDiscovery:        defaults.K8sEnableAPIDiscovery,
+	IgnoreApiserverFailOnStart:   false,
 }
 
 func (def SharedConfig) Flags(flags *pflag.FlagSet) {
@@ -87,6 +93,7 @@ func (def SharedConfig) Flags(flags *pflag.FlagSet) {
 	flags.Duration(option.K8sClientConnectionKeepAlive, def.K8sClientConnectionKeepAlive, "Configures the keep alive duration of K8s client connections. K8 client is disabled if the value is set to 0")
 	flags.Duration(option.K8sHeartbeatTimeout, def.K8sHeartbeatTimeout, "Configures the timeout for api-server heartbeat, set to 0 to disable")
 	flags.Bool(option.K8sEnableAPIDiscovery, def.EnableK8sAPIDiscovery, "Enable discovery of Kubernetes API groups and resources with the discovery API")
+	flags.Bool(option.IgnoreApiserverFailOnStart, def.IgnoreApiserverFailOnStart, "When true, failure to connect to the k8s API server on startup is non-fatal; the agent starts in a degraded state")
 }
 
 func NewClientConfig(cfg SharedConfig, params ClientParams) Config {
diff --git a/pkg/option/config.go b/pkg/option/config.go
index 92f76893084cd..2b4b3dbfa8609 100644
--- a/pkg/option/config.go
+++ b/pkg/option/config.go
@@ -177,6 +177,11 @@ const (
 	// Intended for operating cilium with CNI-compatible orchestrators other than Kubernetes. (default is true)
 	EnableK8s = "enable-k8s"
 
+	// IgnoreApiserverFailOnStart controls whether a failure to connect to the
+	// k8s API server during startup is treated as fatal. When true, the agent
+	// continues starting in a degraded state instead of exiting. (default is false)
+	IgnoreApiserverFailOnStart = "ignore-apiserver-fail-onstart"
+
 	// K8sAPIServer is the kubernetes api address server (for https use --k8s-kubeconfig-path instead)
 	K8sAPIServer = "k8s-api-server"
 