Skip to content

Commit fe11d55

Browse files
committed
cherry-pick: Set unhealthy static nodes to down with reset node address
Set unhealthy static nodes to down with their node address reset, in order to fix static nodes that hit an ICE issue during replacement being treated as bootstrap failure nodes. Here's the bootstrap failure check that a static node in replacement with the ICE issue will trip: def is_bootstrap_failure(self): """Check if a slurm node has boostrap failure.""" if self.is_static_nodes_in_replacement and not self.is_backing_instance_valid(log_warn_if_unhealthy=False): # Node is currently in replacement and no backing instance logger.warning( "Node bootstrap error: Node %s is currently in replacement and no backing instance, node state %s:", self, self.state_string, ) Behavior before the change: When an unhealthy static node is detected, the node is set to down. In the same iteration, a run_instance call is performed to launch a new instance for the node, and the node address is changed to the new one if the run_instance call is successful. If the run_instance call fails, the node address remains unchanged and the node is treated as a bootstrap failure node. Behavior after this change: When an unhealthy static node is detected, the node is set to down with its node address reset. If the run_instance call is successful, the node is set to the new address. If the run_instance call fails, the node address will be the node_name, and the node will not be treated as a bootstrap failure. Signed-off-by: chenwany <[email protected]>
1 parent e1c0cc4 commit fe11d55

File tree

2 files changed

+8
-7
lines changed

2 files changed

+8
-7
lines changed

src/slurm_plugin/clustermgtd.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
get_partition_info,
2828
reset_nodes,
2929
resume_powering_down_nodes,
30-
set_nodes_down,
3130
set_nodes_drain,
3231
set_nodes_power_down,
3332
update_all_partitions,
@@ -719,7 +718,7 @@ def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes):
719718
# Set nodes into down state so jobs can be requeued immediately
720719
try:
721720
log.info("Setting unhealthy static nodes to DOWN")
722-
set_nodes_down(node_list, reason="Static node maintenance: unhealthy node is being replaced")
721+
reset_nodes(node_list, state="down", reason="Static node maintenance: unhealthy node is being replaced")
723722
except Exception as e:
724723
log.error("Encountered exception when setting unhealthy static nodes into down state: %s", e)
725724

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -816,11 +816,13 @@ def test_handle_unhealthy_static_nodes(
816816
# Mock add_instances_for_nodes but still try to execute original code
817817
original_add_instances = cluster_manager._instance_manager.add_instances_for_nodes
818818
cluster_manager._instance_manager.add_instances_for_nodes = mocker.MagicMock(side_effect=original_add_instances)
819-
update_mock = mocker.patch("slurm_plugin.clustermgtd.set_nodes_down", return_value=None, auto_spec=True)
819+
reset_mock = mocker.patch("slurm_plugin.clustermgtd.reset_nodes", return_value=None, auto_spec=True)
820820
# Run test
821821
cluster_manager._handle_unhealthy_static_nodes(unhealthy_static_nodes)
822822
# Assert calls
823-
update_mock.assert_called_with(add_node_list, reason="Static node maintenance: unhealthy node is being replaced")
823+
reset_mock.assert_called_with(
824+
add_node_list, reason="Static node maintenance: unhealthy node is being replaced", state="down"
825+
)
824826
if delete_instance_list:
825827
cluster_manager._instance_manager.delete_instances.assert_called_with(
826828
delete_instance_list, terminate_batch_size=1
@@ -2221,11 +2223,11 @@ def test_handle_failed_health_check_nodes_in_replacement(
22212223
expected_nodes_in_replacement,
22222224
mocker,
22232225
):
2224-
for node, is_static_nodes_in_replacement, is_failing_health_check in zip(
2226+
for node, is_node_in_replacement, is_node_failing_health_check in zip(
22252227
active_nodes, is_static_nodes_in_replacement, is_failing_health_check
22262228
):
2227-
node.is_static_nodes_in_replacement = is_static_nodes_in_replacement
2228-
node.is_failing_health_check = is_failing_health_check
2229+
node.is_static_nodes_in_replacement = is_node_in_replacement
2230+
node.is_failing_health_check = is_node_failing_health_check
22292231

22302232
cluster_manager = ClusterManager(mocker.MagicMock())
22312233
cluster_manager._static_nodes_in_replacement = current_nodes_in_replacement

0 commit comments

Comments
 (0)