Changes from all commits
24 commits
9ed2482
Fix nil pointer panic and spurious Auto Mode updates
demikl Sep 4, 2025
1769a87
Add safety checks for nil configurations in updateComputeConfig
demikl Sep 4, 2025
b7a41aa
Enhance error logging in customUpdate for AutoMode config updates
demikl Sep 4, 2025
71116ff
Refactor Auto Mode configuration validation to ensure all required se…
demikl Sep 4, 2025
de895a2
Add integration tests for EKS Auto-Mode Cluster updates
demikl Sep 6, 2025
c2ae4cc
fix: terminal status assertion
demikl Sep 8, 2025
7725b88
fix timings to detect UPDATING phase
demikl Sep 8, 2025
86928b7
check against eks:DescribeCluster whether EKS Auto-Mode is enabled or…
demikl Sep 8, 2025
a2c5808
unused code
demikl Sep 8, 2025
79d91c2
debug output from eks:DescribeCluster for EKS Auto Mode
demikl Sep 8, 2025
fa8a7d9
more debug
demikl Sep 8, 2025
9db637b
more debugging
demikl Sep 8, 2025
6a5e6d8
fix bad timings + add polling loop to get a better insight when the E…
demikl Sep 8, 2025
994435c
add logging for custom resource and EKS DescribeCluster during auto-m…
demikl Sep 9, 2025
bfd0625
add logging for EKS update status during auto-mode cluster tests
demikl Sep 9, 2025
8d3cde0
replace assertions on eks:DescribeCluster API by eks:DescribeUpdate API
demikl Sep 11, 2025
d5a11c1
merge tests for EKS Auto Mode in a single file
demikl Sep 22, 2025
086fe41
test(e2e): add shorter class-level wait constants for Auto Mode updat…
demikl Sep 22, 2025
273c53c
reuse simple_cluster fixture from test_cluster.py
demikl Sep 25, 2025
bee62c8
no need to wait for status to be updated
demikl Sep 25, 2025
af8a30b
re-use existing timing constants
demikl Sep 25, 2025
2471f9e
some delay is needed for the controller to PATCH the custom resource
demikl Sep 26, 2025
ac549a2
Revert "some delay is needed for the controller to PATCH the custom r…
demikl Sep 26, 2025
424bed0
re: some delay is needed for the controller to PATCH the custom resource
demikl Sep 26, 2025
127 changes: 113 additions & 14 deletions pkg/resource/cluster/hook.go
@@ -184,6 +184,77 @@ func (rm *resourceManager) clusterInUse(ctx context.Context, r *resource) (bool,
	return (nodes != nil && len(nodes.Nodegroups) > 0), nil
}

// isAutoModeCluster returns true if the cluster is configured for EKS Auto Mode.
// According to AWS documentation, compute, block storage, and load balancing capabilities
// must all be enabled or disabled together. Any partial configuration is invalid.
func isAutoModeCluster(r *resource) bool {
	if r == nil || r.ko == nil {
		return false
	}

	// All three Auto Mode configurations must be present for this to be
	// considered an Auto Mode cluster
	hasComputeConfig := r.ko.Spec.ComputeConfig != nil
	hasStorageConfig := r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil
	hasELBConfig := r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil

	if !hasComputeConfig || !hasStorageConfig || !hasELBConfig {
		return false
	}

	computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled
	storageEnabled := r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
	elbEnabled := r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled

	// Auto Mode requires all three capabilities to be in the same state:
	// all true means Auto Mode is enabled, all false means it is being
	// disabled, and any other combination is invalid.
	return computeEnabled == storageEnabled && storageEnabled == elbEnabled
}

// validateAutoModeConfig validates that Auto Mode configuration is consistent.
// Returns an error if the configuration is invalid (partial enablement).
func validateAutoModeConfig(r *resource) error {
	if r == nil || r.ko == nil {
		return nil // Not an Auto Mode configuration
	}

	// Check whether any Auto Mode configuration is present
	hasComputeConfig := r.ko.Spec.ComputeConfig != nil
	hasStorageConfig := r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil
	hasELBConfig := r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil

	// If no Auto Mode configuration is present, the spec is valid (not an Auto Mode cluster)
	if !hasComputeConfig && !hasStorageConfig && !hasELBConfig {
		return nil
	}

	// If any Auto Mode configuration is present, ALL must be present
	if !hasComputeConfig || !hasStorageConfig || !hasELBConfig {
		return fmt.Errorf("invalid Auto Mode configuration: when configuring Auto Mode, all three capabilities must be specified (compute=%v, storage=%v, elb=%v)",
			hasComputeConfig, hasStorageConfig, hasELBConfig)
	}

	// All three configurations must have the same enabled state
	computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled
	storageEnabled := r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
	elbEnabled := r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled

	if computeEnabled == storageEnabled && storageEnabled == elbEnabled {
		return nil // Valid configuration
	}

	return fmt.Errorf("invalid Auto Mode configuration: compute, block storage, and load balancing capabilities must all be enabled or disabled together (compute=%v, storage=%v, elb=%v)",
		computeEnabled, storageEnabled, elbEnabled)
}
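
To make the contract concrete, here is a minimal, self-contained sketch of the rule both helpers encode, using plain booleans instead of the generated API types (autoModeConsistent is an illustrative name, not part of the controller):

package main

import "fmt"

// autoModeConsistent mirrors the rule above: compute, block storage, and
// load balancing must either all be enabled or all be disabled.
func autoModeConsistent(compute, storage, elb bool) bool {
	return compute == storage && storage == elb
}

func main() {
	fmt.Println(autoModeConsistent(true, true, true))    // true: Auto Mode enabled
	fmt.Println(autoModeConsistent(false, false, false)) // true: Auto Mode disabled
	fmt.Println(autoModeConsistent(true, false, true))   // false: partial config, rejected
}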

func customPreCompare(
	a *resource,
	b *resource,
@@ -380,25 +451,39 @@ func (rm *resourceManager) customUpdate(
		return returnClusterUpdating(updatedRes)
	}

	// Handle computeConfig updates - only for Auto Mode clusters
	if delta.DifferentAt("Spec.ComputeConfig") || delta.DifferentAt("Spec.StorageConfig") || delta.DifferentAt("Spec.KubernetesNetworkConfig") {
		// Validate Auto Mode configuration consistency before attempting the update
		if err := validateAutoModeConfig(desired); err != nil {
			return nil, ackerr.NewTerminalError(err)
		}

		// Only proceed with Auto Mode updates if the cluster is actually configured for Auto Mode
		if isAutoModeCluster(desired) {
			if err := rm.updateComputeConfig(ctx, desired); err != nil {
				awsErr, ok := extractAWSError(err)
				var awsErrorCode string
				if ok && awsErr != nil {
					awsErrorCode = awsErr.Code
				}
				rlog.Info("failed to update AutoMode config",
					"error", err,
					"isAWSError", ok,
					"awsErrorCode", awsErrorCode)

				// Check whether we've raced an async update call and need to requeue
				if ok && awsErr != nil && awsErr.Code == "ResourceInUseException" {
					rlog.Info("resource in use, requeueing after async update")
					return nil, requeueAfterAsyncUpdate()
				}

				return nil, fmt.Errorf("failed to update AutoMode config: %w", err)
			}

			return returnClusterUpdating(updatedRes)
		}

		// Not an Auto Mode cluster: ignore the diff, which is most likely a
		// spurious "false vs. absent" mismatch on elasticLoadBalancing
		rlog.Debug("ignoring diff on compute/storage/network config for non-Auto Mode cluster")
	}
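
The rlog.Debug branch above exists because a field-by-field delta treats enabled: false and an absent field as different, even though EKS reports both as disabled. A minimal sketch of that pointer-presence pitfall (illustrative code, not the controller's actual delta logic):

package main

import "fmt"

func main() {
	f := false
	desired := &f    // spec sets elasticLoadBalancing.enabled: false
	var latest *bool // DescribeCluster omits the field entirely

	naiveDiff := (desired == nil) != (latest == nil)                           // compares presence
	semanticDiff := (desired != nil && *desired) != (latest != nil && *latest) // compares meaning

	fmt.Println(naiveDiff)    // true: the delta reports a spurious change
	fmt.Println(semanticDiff) // false: both sides are effectively disabled
}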

	// Handle zonalShiftConfig updates
@@ -666,6 +751,20 @@ func (rm *resourceManager) updateComputeConfig(
	exit := rlog.Trace("rm.updateComputeConfig")
	defer exit(err)

	// Safety check: ensure ComputeConfig is not nil
	if r == nil || r.ko == nil || r.ko.Spec.ComputeConfig == nil {
		rlog.Debug("skipping updateComputeConfig: ComputeConfig is nil")
		return nil
	}

	// Safety check: ensure the other required configurations are not nil
	if r.ko.Spec.StorageConfig == nil || r.ko.Spec.StorageConfig.BlockStorage == nil {
		return fmt.Errorf("invalid Auto Mode configuration: StorageConfig.BlockStorage is required")
	}
	if r.ko.Spec.KubernetesNetworkConfig == nil || r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing == nil {
		return fmt.Errorf("invalid Auto Mode configuration: KubernetesNetworkConfig.ElasticLoadBalancing is required")
	}

	// Convert []*string to []string for NodePools
	nodePools := make([]string, 0, len(r.ko.Spec.ComputeConfig.NodePools))
	for _, nodePool := range r.ko.Spec.ComputeConfig.NodePools {
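
The body of updateComputeConfig is truncated above; the three Auto Mode settings ultimately travel together in a single eks:UpdateClusterConfig call. A hedged sketch of that request shape with aws-sdk-go-v2 (a plausible illustration of the public SDK types, not the controller's actual code):

import (
	"context"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/service/eks"
	ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types"
)

// enableAutoMode flips all three capabilities in one UpdateClusterConfig call.
func enableAutoMode(ctx context.Context, client *eks.Client, clusterName, nodeRoleARN string) error {
	_, err := client.UpdateClusterConfig(ctx, &eks.UpdateClusterConfigInput{
		Name: aws.String(clusterName),
		ComputeConfig: &ekstypes.ComputeConfigRequest{
			Enabled:     aws.Bool(true),
			NodePools:   []string{"general-purpose", "system"},
			NodeRoleArn: aws.String(nodeRoleARN),
		},
		StorageConfig: &ekstypes.StorageConfigRequest{
			BlockStorage: &ekstypes.BlockStorage{Enabled: aws.Bool(true)},
		},
		KubernetesNetworkConfig: &ekstypes.KubernetesNetworkConfigRequest{
			ElasticLoadBalancing: &ekstypes.ElasticLoadBalancing{Enabled: aws.Bool(true)},
		},
	})
	return err
}

Sending the three blocks in one request mirrors the AWS requirement that the capabilities change together, which is what makes the all-or-nothing validation above meaningful.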
212 changes: 208 additions & 4 deletions test/e2e/tests/test_cluster_automode.py
@@ -33,9 +33,10 @@
from e2e.common.types import CLUSTER_RESOURCE_PLURAL
from e2e.common.waiter import wait_until_deleted
from e2e.replacement_values import REPLACEMENT_VALUES
from e2e.tests.test_cluster import simple_cluster

MODIFY_WAIT_AFTER_SECONDS = 30
CHECK_STATUS_WAIT_SECONDS = 30


def wait_for_cluster_active(eks_client, cluster_name):
@@ -93,8 +94,13 @@ def auto_mode_cluster(eks_client):

    yield (ref, cr)

    # Delete the cluster if a test has not already removed it
    try:
        _, deleted = k8s.delete_custom_resource(ref, 3, 10)
        assert deleted
        wait_until_deleted(cluster_name)
    except Exception:
        pass

@service_marker
@pytest.mark.canary
@@ -144,3 +150,201 @@ def test_create_auto_mode_cluster(self, eks_client, auto_mode_cluster):
        _, deleted = k8s.delete_custom_resource(ref, 3, 10)
        assert deleted
        wait_until_deleted(cluster_name)


@service_marker
@pytest.mark.canary
class TestAutoModeClusterUpdates:
    def test_enable_auto_mode_on_standard_cluster(self, eks_client, simple_cluster):
        (ref, cr) = simple_cluster
        cluster_name = cr["spec"]["name"]

        try:
            aws_res = eks_client.describe_cluster(name=cluster_name)
            assert aws_res is not None
        except eks_client.exceptions.ResourceNotFoundException:
            pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")

        # Wait for the cluster to be ACTIVE and let the controller refresh status
        wait_for_cluster_active(eks_client, cluster_name)
        time.sleep(CHECK_STATUS_WAIT_SECONDS)
        get_and_assert_status(ref, "ACTIVE", True)

        # Patch to enable auto-mode
        patch_enable_auto_mode = {
            "spec": {
                "computeConfig": {"enabled": True},
                "storageConfig": {"blockStorage": {"enabled": True}},
                "kubernetesNetworkConfig": {
                    "elasticLoadBalancing": {"enabled": True},
                    "ipFamily": "ipv4",
                },
            }
        }
        k8s.patch_custom_resource(ref, patch_enable_auto_mode)
        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
        get_and_assert_status(ref, "UPDATING", False)

        # Wait for the cluster to become active again after the update
        wait_for_cluster_active(eks_client, cluster_name)
        time.sleep(CHECK_STATUS_WAIT_SECONDS)
        get_and_assert_status(ref, "ACTIVE", True)

        # Verify auto-mode activation via the EKS update history, since
        # DescribeCluster may not reflect the new fields immediately
        updates_summary = eks_client.list_updates(name=cluster_name)

        update_ids = updates_summary.get("updateIds", [])
        assert len(update_ids) == 1, (
            f"Expected exactly 1 update, got {len(update_ids)}: {update_ids}"
        )

        update_id = update_ids[0]
        upd_desc = eks_client.describe_update(name=cluster_name, updateId=update_id)

        update_info = upd_desc["update"]

        # Verify update type and status
        assert update_info["type"] == "AutoModeUpdate", (
            f"Expected AutoModeUpdate, got: {update_info['type']}"
        )
        assert update_info["status"] == "Successful", (
            f"Expected Successful status, got: {update_info['status']}"
        )

        # Verify the update params contain the three Auto Mode configurations
        params = update_info.get("params", [])
        param_types = {param["type"] for param in params}
        expected_types = {"ComputeConfig", "StorageConfig", "KubernetesNetworkConfig"}
        assert param_types == expected_types, (
            f"Expected params {expected_types}, got: {param_types}"
        )

        # Verify each param carries enabled=true
        import json

        for param in params:
            value = json.loads(param["value"])
            if param["type"] == "ComputeConfig":
                assert value.get("enabled") is True, (
                    f"ComputeConfig should have enabled=true, got: {value}"
                )
            elif param["type"] == "StorageConfig":
                block_storage = value.get("blockStorage", {})
                assert block_storage.get("enabled") is True, (
                    f"StorageConfig.blockStorage should have enabled=true, got: {value}"
                )
            elif param["type"] == "KubernetesNetworkConfig":
                elb = value.get("elasticLoadBalancing", {})
                assert elb.get("enabled") is True, (
                    f"KubernetesNetworkConfig.elasticLoadBalancing should have enabled=true, got: {value}"
                )

    def test_disable_auto_mode_incorrectly(self, eks_client, auto_mode_cluster):
        (ref, cr) = auto_mode_cluster
        cluster_name = cr["spec"]["name"]

        try:
            aws_res = eks_client.describe_cluster(name=cluster_name)
            assert aws_res is not None
        except eks_client.exceptions.ResourceNotFoundException:
            pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")

        wait_for_cluster_active(eks_client, cluster_name)
        time.sleep(CHECK_STATUS_WAIT_SECONDS)
        get_and_assert_status(ref, "ACTIVE", True)

        # Patch with inconsistent parameters that should fail to disable auto-mode
        patch_disable_auto_mode_incorrectly = {
            "spec": {
                "computeConfig": {"enabled": False},
                "storageConfig": {
                    "blockStorage": {
                        "enabled": True  # Inconsistent: should be False
                    }
                },
                "kubernetesNetworkConfig": {"elasticLoadBalancing": {"enabled": False}},
            }
        }
        logging.info(
            f"Applying patch with incorrect parameters: {patch_disable_auto_mode_incorrectly}"
        )
        k8s.patch_custom_resource(ref, patch_disable_auto_mode_incorrectly)
        time.sleep(MODIFY_WAIT_AFTER_SECONDS)

        # The controller should detect the invalid configuration and set a terminal condition.
        terminal_condition = "ACK.Terminal"
        cond = k8s.get_resource_condition(ref, terminal_condition)
        if cond is None:
            pytest.fail(
                f"Failed to find {terminal_condition} condition in resource {ref}"
            )

        cond_status = cond.get("status", None)
        if str(cond_status) != str(True):
            pytest.fail(
                f"Expected {terminal_condition} condition to have status True but found {cond_status}"
            )

        # Verify the error message mentions the invalid Auto Mode configuration
        assert "invalid Auto Mode configuration" in cond.get("message", "")

    def test_disable_auto_mode_correctly(self, eks_client, auto_mode_cluster):
        (ref, cr) = auto_mode_cluster
        cluster_name = cr["spec"]["name"]

        try:
            aws_res = eks_client.describe_cluster(name=cluster_name)
            assert aws_res is not None
        except eks_client.exceptions.ResourceNotFoundException:
            pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")

        wait_for_cluster_active(eks_client, cluster_name)
        time.sleep(CHECK_STATUS_WAIT_SECONDS)
        get_and_assert_status(ref, "ACTIVE", True)

        # Patch to disable auto-mode correctly (all three capabilities off)
        patch_disable_auto_mode = {
            "spec": {
                "computeConfig": {"enabled": False},
                "storageConfig": {"blockStorage": {"enabled": False}},
                "kubernetesNetworkConfig": {"elasticLoadBalancing": {"enabled": False}},
            }
        }
        logging.info(f"Applying patch to disable auto-mode: {patch_disable_auto_mode}")
        k8s.patch_custom_resource(ref, patch_disable_auto_mode)
        time.sleep(MODIFY_WAIT_AFTER_SECONDS)

        get_and_assert_status(ref, "UPDATING", False)

        # Wait for the cluster to become active after the update
        wait_for_cluster_active(eks_client, cluster_name)
        time.sleep(CHECK_STATUS_WAIT_SECONDS)

        get_and_assert_status(ref, "ACTIVE", True)

        # Verify auto-mode is disabled
        aws_res = eks_client.describe_cluster(name=cluster_name)

        # Compute config should be absent or disabled
        compute_config = aws_res["cluster"].get("computeConfig")
        if compute_config is not None:
            assert compute_config.get("enabled") is False, (
                f"computeConfig.enabled should be False or absent, got: {compute_config.get('enabled')}"
            )

        # Storage config should be absent or disabled
        storage_config = aws_res["cluster"].get("storageConfig")
        if storage_config is not None:
            block_storage = storage_config.get("blockStorage", {})
            if block_storage:
                assert block_storage.get("enabled") is False, (
                    f"storageConfig.blockStorage.enabled should be False or absent, got: {block_storage.get('enabled')}"
                )

        # Elastic load balancing config should be absent or disabled
        k8s_network_config = aws_res["cluster"].get("kubernetesNetworkConfig", {})
        elb_config = k8s_network_config.get("elasticLoadBalancing")
        if elb_config is not None:
            assert elb_config.get("enabled") is False, (
                f"kubernetesNetworkConfig.elasticLoadBalancing.enabled should be False or absent, got: {elb_config.get('enabled')}"
            )