Skip to content

Commit f4ed435

Browse files
authored
Do not count missing records as a failure of the cluster readiness check (#3062)
* Do not consider missing records as a cluster readiness check failure
* Update CHANGELOG
* Add note that missing records don't cause failure
1 parent 6b38de4 commit f4ed435

File tree

3 files changed

+17
-9
lines changed

3 files changed

+17
-9
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2525
- Rdma-core: rdma-core-59.0-1
2626
- Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11
2727

28+
**BUG FIXES**
29+
- Prevent cluster readiness check failures due to instances launched while the check is in progress.
30+
2831
3.14.0
2932
------
3033

cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks/check_cluster_ready.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,21 @@ def check_deployed_config_version(cluster_name: str, table_name: str, expected_c
112112

113113
missing, incomplete, wrong = _check_cluster_config_items(instance_ids, items, expected_config_version)
114114

115-
if missing or incomplete or wrong:
115+
if incomplete or wrong:
116116
raise CheckFailedError(
117-
f"Check failed due to the following erroneous records:\n"
117+
f"Check failed due to the following erroneous records "
118+
f"(missing records are not counted for the failure):\n"
118119
f" * missing records ({len(missing)}): {missing}\n"
119120
f" * incomplete records ({len(incomplete)}): {incomplete}\n"
120121
f" * wrong records ({len(wrong)}): {wrong}"
121122
)
123+
if missing:
124+
logger.warning(
125+
"Ignoring the following missing records due them being recently bootstrapped:\n"
126+
" * missing records (%s): %s",
127+
len(missing),
128+
missing,
129+
)
122130
logger.info("Verified cluster configuration for cluster node(s) %s", instance_ids)
123131

124132

test/unit/head_node_checks/test_check_cluster_ready.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,7 @@ def _mocked_request_batch_get_items(table_name: str, compute_nodes: [str], ddb_r
8383
["i-cmp123456789"],
8484
["i-lgn123456789"],
8585
{},
86-
"Check failed due to the following erroneous records:\n"
87-
" * missing records (2): ['i-cmp123456789', 'i-lgn123456789']\n"
88-
" * incomplete records (0): []\n"
89-
" * wrong records (0): []",
86+
None,
9087
id="Check with missing DDB records",
9188
),
9289
pytest.param(
@@ -96,7 +93,7 @@ def _mocked_request_batch_get_items(table_name: str, compute_nodes: [str], ddb_r
9693
"i-cmp123456789": {"UNEXPECTED_KEY_A": {"S": "UNEXPECTED_KEY_VALUE_A"}},
9794
"i-lgn123456789": {"UNEXPECTED_KEY_B": {"S": "UNEXPECTED_KEY_VALUE_B"}},
9895
},
99-
"Check failed due to the following erroneous records:\n"
96+
"Check failed due to the following erroneous records (missing records are not counted for the failure):\n"
10097
" * missing records (0): []\n"
10198
" * incomplete records (2): ['i-cmp123456789', 'i-lgn123456789']\n"
10299
" * wrong records (0): []",
@@ -109,7 +106,7 @@ def _mocked_request_batch_get_items(table_name: str, compute_nodes: [str], ddb_r
109106
"i-cmp123456789": {"cluster_config_version": {"S": "WRONG_CLUSTER_CONFIG_VERSION_A"}},
110107
"i-lgn123456789": {"cluster_config_version": {"S": "WRONG_CLUSTER_CONFIG_VERSION_B"}},
111108
},
112-
"Check failed due to the following erroneous records:\n"
109+
"Check failed due to the following erroneous records (missing records are not counted for the failure):\n"
113110
" * missing records (0): []\n"
114111
" * incomplete records (0): []\n"
115112
" * wrong records (2): [('i-cmp123456789', 'WRONG_CLUSTER_CONFIG_VERSION_A'), "
@@ -127,7 +124,7 @@ def _mocked_request_batch_get_items(table_name: str, compute_nodes: [str], ddb_r
127124
"i-cmp1234567893": {"cluster_config_version": {"S": "WRONG_CLUSTER_CONFIG_VERSION_A"}},
128125
"i-lgn1234567893": {"cluster_config_version": {"S": "WRONG_CLUSTER_CONFIG_VERSION_B"}},
129126
},
130-
"Check failed due to the following erroneous records:\n"
127+
"Check failed due to the following erroneous records (missing records are not counted for the failure):\n"
131128
" * missing records (2): ['i-cmp1234567894', 'i-lgn1234567894']\n"
132129
" * incomplete records (2): ['i-cmp1234567892', 'i-lgn1234567892']\n"
133130
" * wrong records (2): [('i-cmp1234567893', 'WRONG_CLUSTER_CONFIG_VERSION_A'), "

0 commit comments

Comments
 (0)