aws · gmarciani · Dec 15, 2025 · Dec 11, 2025 · hanwen-cluster · Dec 12, 2025
@@ -9,6 +9,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 3.14.1
 ------
 
+**ENHANCEMENTS**
+- Ensure clustermgtd runs after cluster update. On success, start it unconditionally. On failure, start it if the queue reconfiguration succeeded.
+
 **CHANGES**
 - Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes
    and achieve better performance at scale. 
@@ -27,6 +30,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
   - Rdma-core: rdma-core-59.0-1
   - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11
 
+**BUG FIXES**
+- Fix race condition where compute nodes could deploy the wrong cluster config version after an update failure.
+
 3.14.0
 ------
 

@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+
+#
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+module ErrorHandlers
+  # Executes shell commands with retry logic and logging.
+  class CommandRunner
+    include Chef::Mixin::ShellOut
+
+    DEFAULT_RETRIES = 10
+    DEFAULT_RETRY_DELAY = 90
+    DEFAULT_TIMEOUT = 30
+
+    def initialize(log_prefix:)
+      @log_prefix = log_prefix
+    end
+
+    def run_with_retries(command, description:, retries: DEFAULT_RETRIES, retry_delay: DEFAULT_RETRY_DELAY, timeout: DEFAULT_TIMEOUT)
+      Chef::Log.info("#{@log_prefix} Executing: #{description}")
+      max_attempts = retries + 1
+
+      max_attempts.times do |attempt|
+        attempt_num = attempt + 1
+        Chef::Log.info("#{@log_prefix} Running command (attempt #{attempt_num}/#{max_attempts}): #{command}")
+        result = shell_out(command, timeout: timeout)
+        Chef::Log.info("#{@log_prefix} Command stdout: #{result.stdout}")
+        Chef::Log.info("#{@log_prefix} Command stderr: #{result.stderr}")
+
+        if result.exitstatus == 0
+          Chef::Log.info("#{@log_prefix} Successfully executed: #{description}")
+          return true
+        end
+
+        Chef::Log.warn("#{@log_prefix} Failed to #{description} (attempt #{attempt_num}/#{max_attempts})")
+
+        if attempt_num < max_attempts
+          Chef::Log.info("#{@log_prefix} Retrying in #{retry_delay} seconds...")
+          sleep(retry_delay)
+        end
+      end
+
+      Chef::Log.error("#{@log_prefix} Failed to #{description} after #{max_attempts} attempts")
+      false
+    end
+  end
+end
@@ -0,0 +1,114 @@
+# frozen_string_literal: true
+
+#
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'chef/handler'
+require_relative 'command_runner'
+
+module ErrorHandlers
+  # Chef exception handler for cluster update failures.
+  #
+  # This handler is triggered when the update recipe fails. It performs recovery actions
+  # to restore the cluster to a consistent state:
+  # 1. Logs information about the update failure including which resources succeeded before failure
+  # 2. Cleans up DNA files shared with compute nodes
+  # 3. Starts clustermgtd if scontrol reconfigure succeeded
+  #
+  # Only runs on HeadNode - compute and login nodes skip this handler.
+  class UpdateFailureHandler < Chef::Handler
+    def report
+      Chef::Log.info("#{log_prefix} Started")
+
+      unless node_type == 'HeadNode'
+        Chef::Log.info("#{log_prefix} Node type is #{node_type}, recovery from update failure only executes on the HeadNode")
+        return
+      end
+
+      begin
+        write_error_report
+        run_recovery
+        Chef::Log.info("#{log_prefix} Completed successfully")
+      rescue => e
+        Chef::Log.error("#{log_prefix} Failed with error: #{e.message}")
+        Chef::Log.error("#{log_prefix} Backtrace: #{e.backtrace.join("\n")}")
+      end
+    end
+
+    def write_error_report
+      Chef::Log.info("#{log_prefix} Update failed on #{node_type} due to: #{run_status.exception}")
+      Chef::Log.info("#{log_prefix} Resources that have been successfully executed before the failure:")
+      run_status.updated_resources.each do |resource|
+        Chef::Log.info("#{log_prefix}   - #{resource}")
+      end
+    end
+
+    def run_recovery
+      Chef::Log.info("#{log_prefix} Running recovery commands")
+
+      # Cleanup DNA files
+      cleanup_dna_files
+
+      # Start clustermgtd if scontrol reconfigure succeeded
+      # Must match SCONTROL_RECONFIGURE_RESOURCE_NAME in aws-parallelcluster-slurm/libraries/update.rb
+      scontrol_reconfigure_resource_name = 'reload config for running nodes'
+      Chef::Log.info("#{log_prefix} Resource '#{scontrol_reconfigure_resource_name}' has execution status: #{resource_status(scontrol_reconfigure_resource_name)}")
+      if resource_succeeded?(scontrol_reconfigure_resource_name)
+        Chef::Log.info("#{log_prefix} scontrol reconfigure succeeded, starting clustermgtd")
+        start_clustermgtd
+      else
+        Chef::Log.info("#{log_prefix} scontrol reconfigure did not succeed, skipping clustermgtd start")
+      end
+    end
+
+    def cleanup_dna_files
+      command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_compute_fleet_dna.py --region #{cluster_attributes['region']} --cleanup"
+      command_runner.run_with_retries(command, description: "cleanup DNA files")
+    end
+
+    def start_clustermgtd
+      command = "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
+      command_runner.run_with_retries(command, description: "start clustermgtd")
+    end
+
+    def cluster_attributes
+      run_status.node['cluster']
+    end
+
+    def node_type
+      cluster_attributes['node_type']
+    end
+
+    def cookbook_virtualenv_path
+      "#{cluster_attributes['system_pyenv_root']}/versions/#{cluster_attributes['python-version']}/envs/cookbook_virtualenv"
+    end
+
+    def resource_succeeded?(resource_name)
+      %i(updated up_to_date).include?(resource_status(resource_name))
+    end
+
+    def resource_status(resource_name)
+      # Use action_collection directly (inherited from Chef::Handler)
+      action_records = action_collection.filtered_collection
+      record = action_records.find { |r| r.new_resource.resource_name == :execute && r.new_resource.name == resource_name }
+      record ? record.status : :not_executed
+    end
+
+    def command_runner
+      @command_runner ||= CommandRunner.new(log_prefix: log_prefix)
+    end
+
+    def log_prefix
+      @log_prefix ||= "#{self.class.name}:"
+    end
+  end
+end
@@ -11,6 +11,11 @@
 # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.
+
+chef_handler 'ErrorHandlers::UpdateFailureHandler' do
+  type exception: true
+end
+
 include_recipe "aws-parallelcluster-shared::setup_envars"
 
 # Fetch and load cluster configs

@@ -0,0 +1,128 @@
+# frozen_string_literal: true
+
+# Copyright:: 2025 Amazon.com, Inc. and its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+require_relative '../../spec_helper'
+require_relative '../../../libraries/command_runner'
+
+describe ErrorHandlers::CommandRunner do
+  let(:log_prefix) { 'TestPrefix:' }
+  let(:runner) { described_class.new(log_prefix: log_prefix) }
+  let(:command) { 'test command' }
+  let(:description) { 'test operation' }
+  let(:shell_out_result) { double('shell_out_result', exitstatus: 0, stdout: 'success', stderr: '') }
+
+  before do
+    allow(runner).to receive(:shell_out).and_return(shell_out_result)
+    allow(runner).to receive(:sleep)
+  end
+
+  describe '#run_with_retries' do
+    context 'when command succeeds on first attempt' do
+      it 'returns true and does not retry' do
+        expect(runner).to receive(:shell_out).once.and_return(shell_out_result)
+        expect(runner).not_to receive(:sleep)
+        expect(runner.run_with_retries(command, description: description)).to be true
+      end
+
+      it 'logs stdout and stderr' do
+        allow(Chef::Log).to receive(:info)
+        expect(Chef::Log).to receive(:info).with(/Command stdout: success/)
+        expect(Chef::Log).to receive(:info).with(/Command stderr:/)
+        runner.run_with_retries(command, description: description)
+      end
+
+      it 'logs success message' do
+        allow(Chef::Log).to receive(:info)
+        expect(Chef::Log).to receive(:info).with(/Successfully executed: test operation/)
+        runner.run_with_retries(command, description: description)
+      end
+    end
+
+    context 'when command fails then succeeds' do
+      let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') }
+
+      it 'retries and returns true on success' do
+        expect(runner).to receive(:shell_out).and_return(failed_result, shell_out_result)
+        expect(runner).to receive(:sleep).with(90).once
+        expect(runner.run_with_retries(command, description: description, retries: 1)).to be true
+      end
+
+      it 'logs retry message' do
+        allow(runner).to receive(:shell_out).and_return(failed_result, shell_out_result)
+        allow(Chef::Log).to receive(:info)
+        allow(Chef::Log).to receive(:warn)
+        expect(Chef::Log).to receive(:info).with(/Retrying in 90 seconds/)
+        runner.run_with_retries(command, description: description, retries: 1)
+      end
+    end
+
+    context 'when command fails all attempts' do
+      let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') }
+
+      it 'returns false after exhausting retries' do
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        expect(runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)).to be false
+      end
+
+      it 'logs error after all attempts fail' do
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        expect(Chef::Log).to receive(:error).with(/Failed to test operation after 2 attempts/)
+        runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)
+      end
+
+      it 'logs warning for each failed attempt' do
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        allow(Chef::Log).to receive(:info)
+        allow(Chef::Log).to receive(:error)
+        expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 1/2\)})
+        expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 2/2\)})
+        runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)
+      end
+    end
+
+    context 'with custom retry parameters' do
+      it 'respects custom retries count' do
+        failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        expect(runner).to receive(:shell_out).exactly(3).times
+        runner.run_with_retries(command, description: description, retries: 2, retry_delay: 0)
+      end
+
+      it 'respects custom retry delay' do
+        failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
+        allow(runner).to receive(:shell_out).and_return(failed_result, shell_out_result)
+        expect(runner).to receive(:sleep).with(30).once
+        runner.run_with_retries(command, description: description, retries: 1, retry_delay: 30)
+      end
+
+      it 'respects custom timeout' do
+        expect(runner).to receive(:shell_out).with(command, timeout: 60).and_return(shell_out_result)
+        runner.run_with_retries(command, description: description, timeout: 60)
+      end
+    end
+
+    context 'with default parameters' do
+      it 'uses DEFAULT_RETRIES' do
+        failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        expect(runner).to receive(:shell_out).exactly(11).times # 10 retries + 1 initial = 11 attempts
+        runner.run_with_retries(command, description: description, retry_delay: 0)
+      end
+
+      it 'uses DEFAULT_TIMEOUT' do
+        expect(runner).to receive(:shell_out).with(command, timeout: 30).and_return(shell_out_result)
+        runner.run_with_retries(command, description: description)
+      end
+    end
+  end
+end