aws
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 6 additions & 0 deletions
diff --git a/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb b/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb
Lines changed: 150 additions & 0 deletions
diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb
Lines changed: 6 additions & 0 deletions
@@ -9,6 +9,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 3.14.1
 ------
 
+**ENHANCEMENTS**
+- Ensure clustermgtd runs after cluster update. On success, restart unconditionally. On failure, restart if the queue reconfiguration succeeded.
+
 **CHANGES**
 - Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes
    and achieve better performance at scale. 
@@ -27,6 +30,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
   - Rdma-core: rdma-core-59.0-1
   - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11
 
+**BUG FIXES**
+- Fix race condition where compute nodes could deploy the wrong cluster config version after an update failure.
+
 3.14.0
 ------
 
 
@@ -0,0 +1,150 @@
+# frozen_string_literal: true
+
+#
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'chef/handler'
+
+module UpdateChefError
+  # Chef exception handler for cluster update failures.
+  #
+  # This handler is triggered when the update recipe fails. It performs recovery actions
+  # to restore the cluster to a consistent state:
+  # 1. Logs information about the update failure including which resources succeeded before failure
+  # 2. Cleans up DNA files shared with compute nodes
+  # 3. Starts clustermgtd if scontrol reconfigure succeeded
+  #
+  # Only runs on HeadNode - compute and login nodes skip this handler.
+  class UpdateFailureHandler < Chef::Handler
+    LOG_PREFIX = 'UpdateFailureHandler:'
+    # Must match SCONTROL_RECONFIGURE_RESOURCE_NAME in aws-parallelcluster-slurm/libraries/update.rb
+    SCONTROL_RECONFIGURE_RESOURCE = 'reload config for running nodes'
+
+    # Retry configuration for recovery commands
+    DEFAULT_RETRIES = 10
+    DEFAULT_RETRY_DELAY = 90
+    DEFAULT_TIMEOUT = 30
+
+    def report
+      extend Chef::Mixin::ShellOut
+
+      Chef::Log.info("#{LOG_PREFIX} Started")
+
+      unless node_type == 'HeadNode'
+        Chef::Log.info("#{LOG_PREFIX} Node type is #{node_type}, recovery from update failure only executes on the HeadNode")
+        return
+      end
+
+      begin
+        write_error_report
+        run_recovery_commands
+        Chef::Log.info("#{LOG_PREFIX} Completed successfully")
+      rescue => e
+        Chef::Log.error("#{LOG_PREFIX} Failed with error: #{e.message}")
+        Chef::Log.error("#{LOG_PREFIX} Backtrace: #{e.backtrace.join("\n")}")
+      end
+    end
+
+    def write_error_report
+      Chef::Log.info("#{LOG_PREFIX} Update failed on #{node_type} due to: #{run_status.exception}")
+      Chef::Log.info("#{LOG_PREFIX} Resources that have been successfully executed before the failure:")
+      run_status.updated_resources.each do |resource|
+        Chef::Log.info("#{LOG_PREFIX}   - #{resource}")
+      end
+      Chef::Log.info("#{LOG_PREFIX} Resource '#{SCONTROL_RECONFIGURE_RESOURCE}' has execution status: #{slurm_reconfigure_status}")
+    end
+
+    def run_recovery_commands
+      Chef::Log.info("#{LOG_PREFIX} Running recovery commands")
+
+      # Cleanup DNA files
+      cleanup_dna_files
+
+      # Start clustermgtd if scontrol reconfigure succeeded
+      if scontrol_reconfigure_succeeded?
+        Chef::Log.info("#{LOG_PREFIX} scontrol reconfigure succeeded, starting clustermgtd")
+        start_clustermgtd
+      else
+        Chef::Log.info("#{LOG_PREFIX} scontrol reconfigure did not succeed, skipping clustermgtd start")
+      end
+    end
+
+    def cleanup_dna_files
+      command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_compute_fleet_dna.py --region #{cluster_attributes['region']} --cleanup"
+      run_command_with_retries(command, description: "cleanup DNA files")
+    end
+
+    def start_clustermgtd
+      command = "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
+      run_command_with_retries(command, description: "start clustermgtd")
+    end
+
+    def cluster_attributes
+      run_status.node['cluster']
+    end
+
+    def node_type
+      cluster_attributes['node_type']
+    end
+
+    def cookbook_virtualenv_path
+      "#{cluster_attributes['system_pyenv_root']}/versions/#{cluster_attributes['python-version']}/envs/cookbook_virtualenv"
+    end
+
+    def scontrol_reconfigure_succeeded?
+      slurm_reconfigure_status == :updated
+    end
+
+    def slurm_reconfigure_status
+      reload_record = find_scontrol_reconfigure_record
+      if reload_record
+        reload_record.status
+      else
+        :not_executed
+      end
+    end
+
+    def find_scontrol_reconfigure_record
+      # Use action_collection directly (inherited from Chef::Handler)
+      action_records = action_collection.filtered_collection
+      action_records.find { |r| r.new_resource.resource_name == :execute && r.new_resource.name == SCONTROL_RECONFIGURE_RESOURCE }
+    end
+
+    def run_command_with_retries(command, description:, retries: DEFAULT_RETRIES, retry_delay: DEFAULT_RETRY_DELAY, timeout: DEFAULT_TIMEOUT)
+      Chef::Log.info("#{LOG_PREFIX} Executing: #{description}")
+      max_attempts = retries + 1
+
+      max_attempts.times do |attempt|
+        attempt_num = attempt + 1
+        Chef::Log.info("#{LOG_PREFIX} Running command (attempt #{attempt_num}/#{max_attempts}): #{command}")
+        result = shell_out(command, timeout: timeout)
+        Chef::Log.info("#{LOG_PREFIX} Command stdout: #{result.stdout}")
+        Chef::Log.info("#{LOG_PREFIX} Command stderr: #{result.stderr}")
+
+        if result.exitstatus == 0
+          Chef::Log.info("#{LOG_PREFIX} Successfully executed: #{description}")
+          return true
+        end
+
+        Chef::Log.warn("#{LOG_PREFIX} Failed to #{description} (attempt #{attempt_num}/#{max_attempts})")
+
+        if attempt_num < max_attempts
+          Chef::Log.info("#{LOG_PREFIX} Retrying in #{retry_delay} seconds...")
+          sleep(retry_delay)
+        end
+      end
+
+      Chef::Log.error("#{LOG_PREFIX} Failed to #{description} after #{max_attempts} attempts")
+      false
+    end
+  end
+end
@@ -11,6 +11,12 @@
 # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.
+
+chef_handler 'UpdateChefError::UpdateFailureHandler' do # enable the update-failure recovery handler before the rest of the recipe
+  type exception: true # register it as an exception handler
+  action :enable
+end
+
 include_recipe "aws-parallelcluster-shared::setup_envars"
 
 # Fetch and load cluster configs