Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: validate credentials and check apiserver connectivity before starting kubelet #5982

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
9 changes: 8 additions & 1 deletion e2e/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"encoding/base64"
"fmt"
"strings"

"github.com/Azure/agentbaker/e2e/config"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -38,6 +39,12 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
stdout := execResult.stdout.String()
require.NotContains(s.T, stdout, "--dynamic-config-dir", "kubelet flag '--dynamic-config-dir' should not be present in /etc/default/kubelet\nContents:\n%s")

kubeletLogs := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not retrieve kubelet logs with journalctl").stdout.String()
validationFailed := strings.Contains(kubeletLogs, "kubelet credential validation failed")
require.False(s.T, validationFailed, "expected kubelet credential validation to have succeeded")
validatedKubeletCredentials := strings.Contains(kubeletLogs, "kubelet client credential is valid") || strings.Contains(kubeletLogs, "kubelet bootstrap token credential is valid")
require.True(s.T, validatedKubeletCredentials, "expected kubelet to have validated its credential or bootstrap token before startup, but seemingly did not")

// the instructions below expect the SSH key to be uploaded to the user pool VM,
// which happens as a side-effect of execCommandOnVMForScenario; it's ugly but works.
// maybe we should use a single ssh key per cluster, but need to be careful with parallel test runs.
Expand All @@ -62,7 +69,7 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
//"cloud-config.txt", // file with UserData
})

execResult = execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo curl http://168.63.129.16:32526/vmSettings", 0, "curl to wireserver failed")
_ = execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo curl http://168.63.129.16:32526/vmSettings", 0, "curl to wireserver failed")

execResult = execOnVMForScenarioOnUnprivilegedPod(ctx, s, "curl https://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4")
require.Equal(s.T, "28", execResult.exitCode, "curl to wireserver should fail")
Expand Down
7 changes: 4 additions & 3 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ import (
"bytes"
"context"
"fmt"
"github.com/tidwall/gjson"
"net"
"os"
"regexp"
"strings"
"time"

"github.com/tidwall/gjson"

"github.com/Azure/agentbaker/e2e/config"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -301,7 +302,7 @@ func ValidateContainerdWASMShims(ctx context.Context, s *Scenario) {

// ValidateKubeletHasNotStopped fetches the kubelet unit's journal through
// execScriptOnVMForScenarioValidateExitCode and asserts that the output contains
// "Started Kubelet" and does not contain "Stopped Kubelet".
func ValidateKubeletHasNotStopped(ctx context.Context, s *Scenario) {
command := "sudo journalctl -u kubelet"
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, "could not retrieve kubelet logs")
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, "could not retrieve kubelet logs with journalctl")
// assert (rather than require) is used for both checks, so a failure of the first does not skip the second
assert.NotContains(s.T, execResult.stdout.String(), "Stopped Kubelet")
assert.Contains(s.T, execResult.stdout.String(), "Started Kubelet")
}
Expand All @@ -314,7 +315,7 @@ func ValidateServicesDoNotRestartKubelet(ctx context.Context, s *Scenario) {

// ValidateKubeletHasFlags checks that kubelet was started with the expected config file:
// it fetches the kubelet journal and requires it to contain the line fragment
// `FLAG: --config="<filePath>"`, where filePath is the expected kubelet config path.
func ValidateKubeletHasFlags(ctx context.Context, s *Scenario, filePath string) {
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not get kubelet logs")
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not retrieve kubelet logs with journalctl")
// the exact text searched for in the journal output, with filePath quoted
configFileFlags := fmt.Sprintf("FLAG: --config=\"%s\"", filePath)
require.Containsf(s.T, execResult.stdout.String(), configFileFlags, "expected to find flag %s, but not found", "config")
}
Expand Down
13 changes: 8 additions & 5 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,6 @@ if [ "${NEEDS_CONTAINERD}" == "true" ] && [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS
logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits
fi

logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet
if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" == "true" ]; then
logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge
fi
Expand Down Expand Up @@ -401,6 +400,13 @@ else
logs_to_events "AKS.CSE.apiserverNC" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 nc -vz ${API_SERVER_NAME} 443" || time nc -vz ${API_SERVER_NAME} 443 || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL
fi

echo "API server connection check code: $VALIDATION_ERR"
if [ $VALIDATION_ERR -ne 0 ]; then
exit $VALIDATION_ERR
fi

logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet

if [[ ${ID} != "mariner" ]] && [[ ${ID} != "azurelinux" ]]; then
echo "Recreating man-db auto-update flag file and kicking off man-db update process at $(date)"
createManDbAutoUpdateFlagFile
Expand Down Expand Up @@ -458,10 +464,7 @@ else
fi
fi

echo "Custom script finished. API server connection check code:" $VALIDATION_ERR
echo "Custom script finished."
echo $(date),$(hostname), endcustomscript>>/opt/m

exit $VALIDATION_ERR


#EOF
2 changes: 2 additions & 0 deletions parts/linux/cloud-init/artifacts/kubelet.service
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ ExecStartPre=/bin/mount --make-shared /var/lib/kubelet
ExecStartPre=-/sbin/ebtables -t nat --list
ExecStartPre=-/sbin/iptables -t nat --numeric --list

ExecStartPre=/bin/bash /opt/azure/containers/validate-kubelet-credentials.sh

ExecStart=/usr/local/bin/kubelet \
--enable-server \
--node-labels="${KUBELET_NODE_LABELS}" \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash
# Validates the kubelet's kubeconfig (or, if that is absent, its bootstrap-kubeconfig)
# with kubectl. Wired into kubelet.service as an ExecStartPre step.

# abort on any failing command, on use of an unset variable, and on a failure anywhere in a pipeline
set -euo pipefail

# this gives us logs_to_events and retry wrappers
source /opt/azure/containers/provision_source.sh

# trace every command from here on (enabled only after the source line above)
set -x

# locations of the two credential files; each can be overridden through the environment
KUBECONFIG_PATH="${KUBECONFIG_PATH:-/var/lib/kubelet/kubeconfig}"
BOOTSTRAP_KUBECONFIG_PATH="${BOOTSTRAP_KUBECONFIG_PATH:-/var/lib/kubelet/bootstrap-kubeconfig}"

# retry settings (defaults: 30 retries, 1s delay, 10s timeout), overridable through the environment.
# NOTE(review): no active code in this script reads these - only commented-out
# retrycmd_if_failure invocations reference them.
VALIDATE_KUBELET_CREDENTIALS_MAX_RETRIES=${VALIDATE_KUBELET_CREDENTIALS_MAX_RETRIES:-30}
VALIDATE_KUBELET_CREDENTIALS_RETRY_DELAY_SECONDS=${VALIDATE_KUBELET_CREDENTIALS_RETRY_DELAY_SECONDS:-1}
VALIDATE_KUBELET_CREDENTIALS_RETRY_TIMEOUT_SECONDS=${VALIDATE_KUBELET_CREDENTIALS_RETRY_TIMEOUT_SECONDS:-10}

# validateKubeconfig checks a single kubeconfig by running `kubectl auth whoami` with it.
# Arguments:
#   $1 - path of the kubeconfig file to validate
# On success the function returns normally. On failure it prints a message and calls
# `exit 0`, which terminates the running shell instead of returning to the caller, so
# nothing the caller would have done after this call (e.g. printing a "credential is
# valid" message) happens for a failed validation.
function validateKubeconfig {
local kubeconfig_path=$1

# NOTE(review): the two commented-out blocks below are inactive alternative variants that
# wrap the kubectl call in retrycmd_if_failure with the VALIDATE_KUBELET_CREDENTIALS_*
# settings - the first uses `kubectl version`, the second `kubectl auth whoami -v 10`.

# if ! retrycmd_if_failure $VALIDATE_KUBELET_CREDENTIALS_MAX_RETRIES \
# $VALIDATE_KUBELET_CREDENTIALS_RETRY_DELAY_SECONDS \
# $VALIDATE_KUBELET_CREDENTIALS_RETRY_TIMEOUT_SECONDS \
# kubectl version --kubeconfig "$kubeconfig_path"; then

# # for now we simply exit 0 here to prevent provisioning failures in cases where the credential
# # doesn't become valid until after we've exhausted our retries - kubelet should still eventually be able to register
# echo "kubelet credential validation failed, will still attempt to start kubelet"
# exit 0
# fi

# if ! retrycmd_if_failure $VALIDATE_KUBELET_CREDENTIALS_MAX_RETRIES \
# $VALIDATE_KUBELET_CREDENTIALS_RETRY_DELAY_SECONDS \
# $VALIDATE_KUBELET_CREDENTIALS_RETRY_TIMEOUT_SECONDS \
# kubectl auth whoami -v 10 --kubeconfig "$kubeconfig_path"; then

# # for now we simply exit 0 here to prevent provisioning failures in cases where the credential
# # doesn't become valid until after we've exhausted our retries - kubelet should still eventually be able to register
# echo "kubelet credential validation failed, will still attempt to start kubelet"
# exit 0
# fi

# active check: a single kubectl invocation (no retry wrapper), run at kubectl log verbosity 10.
# the command sits in an `if !` condition, so its failure does not trip `set -e`.
if ! kubectl auth whoami -v 10 --kubeconfig "$kubeconfig_path"; then

# for now we simply exit 0 here to prevent provisioning failures in cases where the credential
# is not valid yet - kubelet should still eventually be able to register.
# note: this path makes exactly one attempt; retries exist only in the commented-out variants above.
echo "kubelet credential validation failed, will still attempt to start kubelet"
exit 0
fi
}

# validateKubeletCredentials validates whichever kubelet credential is present on disk.
#
# Reads:   KUBECONFIG_PATH, BOOTSTRAP_KUBECONFIG_PATH
# Outcome:
#   - exits 1 when neither file exists (kubelet has no credential to start with)
#   - exits 0 without validating when kubectl is not available
#   - validates KUBECONFIG_PATH when it exists, then exits 0
#   - otherwise validates BOOTSTRAP_KUBECONFIG_PATH and returns
# validateKubeconfig terminates the shell itself (exit 0) when validation fails, so the
# "... credential is valid" messages below are only printed after a successful check.
function validateKubeletCredentials {
    if [ ! -f "$KUBECONFIG_PATH" ] && [ ! -f "$BOOTSTRAP_KUBECONFIG_PATH" ]; then
        echo "both kubeconfig: $KUBECONFIG_PATH and bootstrap-kubeconfig: $BOOTSTRAP_KUBECONFIG_PATH do not exist, unable to start kubelet"
        exit 1
    fi

    # `command -v` is the POSIX shell builtin for locating a command; unlike `which` it does
    # not depend on an external binary being installed on the image.
    if ! command -v kubectl >/dev/null 2>&1; then
        echo "kubectl not found, will skip kubelet credential validation"
        exit 0
    fi

    # an existing client kubeconfig takes precedence over the bootstrap kubeconfig
    if [ -f "$KUBECONFIG_PATH" ]; then
        echo "will validate kubeconfig: $KUBECONFIG_PATH"
        validateKubeconfig "$KUBECONFIG_PATH"
        echo "kubelet client credential is valid"
        # NOTE(review): this exits the shell from inside the function; if the logs_to_events
        # wrapper does any work after running its command, that work is skipped - confirm.
        exit 0
    fi

    echo "will validate bootstrap-kubeconfig: $BOOTSTRAP_KUBECONFIG_PATH"
    validateKubeconfig "$BOOTSTRAP_KUBECONFIG_PATH"
    echo "kubelet bootstrap token credential is valid"
}

# entry point: run the validation through the logs_to_events wrapper (provided by the sourced
# provision_source.sh) under the event name AKS.Runtime.validateKubeletCredentials
logs_to_events "AKS.Runtime.validateKubeletCredentials" validateKubeletCredentials
7 changes: 7 additions & 0 deletions parts/linux/cloud-init/nodecustomdata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,13 @@ write_files:
content: !!binary |
{{GetVariableProperty "cloudInitData" "ensureIMDSRestrictionScript"}}

- path: /opt/azure/containers/validate-kubelet-credentials.sh
permissions: "0755"
encoding: gzip
owner: root
content: !!binary |
{{GetVariableProperty "cloudInitData" "validateKubeletCredentialsScript"}}

- path: /etc/kubernetes/certs/ca.crt
permissions: "0600"
encoding: base64
Expand Down
6 changes: 5 additions & 1 deletion pkg/agent/baker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -787,7 +787,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() {
config.KubeletConfig = map[string]string{}
}, nil),

Entry("AKSUbuntu1804 with kubelet client certificatet", "AKSUbuntu1804+WithKubeletClientCert", "1.18.3",
Entry("AKSUbuntu1804 with kubelet client certificate", "AKSUbuntu1804+WithKubeletClientCert", "1.18.3",
func(config *datamodel.NodeBootstrappingConfiguration) {
config.ContainerService.Properties.CertificateProfile = &datamodel.CertificateProfile{
ClientCertificate: "fooBarBaz",
Expand All @@ -798,12 +798,14 @@ var _ = Describe("Assert generated customData and cseCmd", func() {
etcDefaultKubelet := o.files["/etc/default/kubelet"].value
etcDefaultKubeletService := o.files["/etc/systemd/system/kubelet.service"].value
kubeletSh := o.files["/opt/azure/containers/kubelet.sh"].value
validateCredentials := o.files["/opt/azure/containers/validate-kubelet-credentials.sh"].value
caCRT := o.files["/etc/kubernetes/certs/ca.crt"].value
kubeconfig := o.files["/var/lib/kubelet/kubeconfig"].value

Expect(etcDefaultKubelet).NotTo(BeEmpty())
Expect(etcDefaultKubeletService).NotTo(BeEmpty())
Expect(kubeletSh).NotTo(BeEmpty())
Expect(validateCredentials).ToNot(BeEmpty())
Expect(caCRT).NotTo(BeEmpty())
Expect(kubeconfig).ToNot(BeEmpty())

Expand All @@ -822,13 +824,15 @@ var _ = Describe("Assert generated customData and cseCmd", func() {
etcDefaultKubelet := o.files["/etc/default/kubelet"].value
etcDefaultKubeletService := o.files["/etc/systemd/system/kubelet.service"].value
kubeletSh := o.files["/opt/azure/containers/kubelet.sh"].value
validateCredentials := o.files["/opt/azure/containers/validate-kubelet-credentials.sh"].value
bootstrapKubeconfig := o.files["/var/lib/kubelet/bootstrap-kubeconfig"].value
caCRT := o.files["/etc/kubernetes/certs/ca.crt"].value

Expect(etcDefaultKubelet).NotTo(BeEmpty())
Expect(bootstrapKubeconfig).NotTo(BeEmpty())
Expect(kubeletSh).NotTo(BeEmpty())
Expect(etcDefaultKubeletService).NotTo(BeEmpty())
Expect(validateCredentials).ToNot(BeEmpty())
Expect(caCRT).NotTo(BeEmpty())

Expect(bootstrapKubeconfig).To(ContainSubstring("token"))
Expand Down
1 change: 1 addition & 0 deletions pkg/agent/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ const (
migPartitionScript = "linux/cloud-init/artifacts/mig-partition.sh"
migPartitionSystemdService = "linux/cloud-init/artifacts/mig-partition.service"
ensureIMDSRestrictionScript = "linux/cloud-init/artifacts/ensure_imds_restriction.sh"
validateKubeletCredentialsScript = "linux/cloud-init/artifacts/validate-kubelet-credentials.sh"

// scripts and service for enabling ipv6 dual stack.
dhcpv6SystemdService = "linux/cloud-init/artifacts/dhcpv6.service"
Expand Down
Loading
Loading