diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9029793616..bd7d9793c2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 3.14.1
 ------
 
+**ENHANCEMENTS**
+- Ensure clustermgtd runs after cluster update. On success, start it unconditionally. On failure, start it if the queue reconfiguration succeeded.
+
 **CHANGES**
 - Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes
    and achieve better performance at scale. 
@@ -27,6 +30,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
   - Rdma-core: rdma-core-59.0-1
   - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11
 
+**BUG FIXES**
+- Fix race condition where compute nodes could deploy the wrong cluster config version after an update failure.
+
 3.14.0
 ------
 
diff --git a/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb b/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb
new file mode 100644
index 0000000000..b5f7c88fb9
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb
@@ -0,0 +1,80 @@
+# frozen_string_literal: true
+
+#
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+module ErrorHandlers
+  # Executes shell commands with retry logic and logging.
+  #
+  # An attempt fails when the command exits with a non-zero status or when shell_out raises
+  # (for example because the command timed out). Failed attempts are retried.
+  class CommandRunner
+    include Chef::Mixin::ShellOut
+
+    DEFAULT_RETRIES = 10
+    DEFAULT_RETRY_DELAY = 90
+    DEFAULT_TIMEOUT = 30
+
+    # @param log_prefix [String] text prepended to every log line written by this runner
+    def initialize(log_prefix:)
+      @log_prefix = log_prefix
+    end
+
+    # Runs a command until it exits with status 0 or all attempts are used up.
+    #
+    # @param command [String] command line handed to shell_out
+    # @param description [String] short label of the operation, used in log messages
+    # @param retries [Integer] extra attempts after the first one (total attempts = retries + 1)
+    # @param retry_delay [Numeric] seconds to sleep between two attempts
+    # @param timeout [Numeric] timeout passed to shell_out for each attempt
+    # @return [Boolean] true as soon as one attempt exits with status 0, false otherwise
+    def run_with_retries(command, description:, retries: DEFAULT_RETRIES, retry_delay: DEFAULT_RETRY_DELAY, timeout: DEFAULT_TIMEOUT)
+      Chef::Log.info("#{@log_prefix} Executing: #{description}")
+      max_attempts = retries + 1
+
+      max_attempts.times do |attempt|
+        attempt_num = attempt + 1
+        Chef::Log.info("#{@log_prefix} Running command (attempt #{attempt_num}/#{max_attempts}): #{command}")
+
+        if attempt_succeeded?(command, timeout)
+          Chef::Log.info("#{@log_prefix} Successfully executed: #{description}")
+          return true
+        end
+
+        Chef::Log.warn("#{@log_prefix} Failed to #{description} (attempt #{attempt_num}/#{max_attempts})")
+
+        if attempt_num < max_attempts
+          Chef::Log.info("#{@log_prefix} Retrying in #{retry_delay} seconds...")
+          sleep(retry_delay)
+        end
+      end
+
+      Chef::Log.error("#{@log_prefix} Failed to #{description} after #{max_attempts} attempts")
+      false
+    end
+
+    private
+
+    # Runs one attempt, logs its output and reports whether it exited with status 0.
+    def attempt_succeeded?(command, timeout)
+      result = shell_out(command, timeout: timeout)
+      Chef::Log.info("#{@log_prefix} Command stdout: #{result.stdout}")
+      Chef::Log.info("#{@log_prefix} Command stderr: #{result.stderr}")
+      result.exitstatus == 0
+    rescue => e
+      # shell_out raises instead of returning when the command cannot be completed (e.g. on timeout).
+      # Count that as a failed attempt so the retry loop keeps going and the caller still gets a boolean.
+      Chef::Log.warn("#{@log_prefix} Command raised #{e.class}: #{e.message}")
+      false
+    end
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb b/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb
new file mode 100644
index 0000000000..18dcad4382
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb
@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+
+#
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'chef/handler'
+require_relative 'command_runner'
+
+module ErrorHandlers
+  # Chef exception handler for cluster update failures.
+  #
+  # This handler is triggered when the update recipe fails. It performs recovery actions
+  # to restore the cluster to a consistent state:
+  # 1. Logs information about the update failure including which resources succeeded before failure
+  # 2. Cleans up DNA files shared with compute nodes
+  # 3. Starts clustermgtd if scontrol reconfigure succeeded
+  #
+  # Only runs on HeadNode - compute and login nodes skip this handler.
+  class UpdateFailureHandler < Chef::Handler
+    # Handler entry point. Errors raised while reporting or recovering are logged, never re-raised.
+    def report
+      Chef::Log.info("#{log_prefix} Started")
+
+      unless node_type == 'HeadNode'
+        Chef::Log.info("#{log_prefix} Node type is #{node_type}, recovery from update failure only executes on the HeadNode")
+        return
+      end
+
+      begin
+        write_error_report
+        if run_recovery
+          Chef::Log.info("#{log_prefix} Completed successfully")
+        else
+          Chef::Log.warn("#{log_prefix} Completed, but at least one recovery command failed")
+        end
+      rescue => e
+        Chef::Log.error("#{log_prefix} Failed with error: #{e.message}")
+        Chef::Log.error("#{log_prefix} Backtrace: #{e.backtrace.join("\n")}")
+      end
+    end
+
+    # Logs the exception that failed the run and the resources updated before the failure.
+    def write_error_report
+      Chef::Log.info("#{log_prefix} Update failed on #{node_type} due to: #{run_status.exception}")
+      Chef::Log.info("#{log_prefix} Resources that have been successfully executed before the failure:")
+      run_status.updated_resources.each do |resource|
+        Chef::Log.info("#{log_prefix}   - #{resource}")
+      end
+    end
+
+    # Runs the recovery commands.
+    #
+    # @return [Boolean] true when no recovery command failed, false otherwise
+    def run_recovery
+      Chef::Log.info("#{log_prefix} Running recovery commands")
+
+      # Cleanup DNA files. An unexpected error in this step is logged and must not prevent
+      # clustermgtd from being started below.
+      dna_files_cleaned = begin
+        cleanup_dna_files
+      rescue => e
+        Chef::Log.error("#{log_prefix} Cleanup of DNA files raised #{e.class}: #{e.message}")
+        false
+      end
+
+      # Start clustermgtd if scontrol reconfigure succeeded
+      # Must match SCONTROL_RECONFIGURE_RESOURCE_NAME in aws-parallelcluster-slurm/libraries/update.rb
+      scontrol_reconfigure_resource_name = 'reload config for running nodes'
+      Chef::Log.info("#{log_prefix} Resource '#{scontrol_reconfigure_resource_name}' has execution status: #{resource_status(scontrol_reconfigure_resource_name)}")
+      clustermgtd_ok = true
+      if resource_succeeded?(scontrol_reconfigure_resource_name)
+        Chef::Log.info("#{log_prefix} scontrol reconfigure succeeded, starting clustermgtd")
+        clustermgtd_ok = start_clustermgtd
+      else
+        Chef::Log.info("#{log_prefix} scontrol reconfigure did not succeed, skipping clustermgtd start")
+      end
+
+      dna_files_cleaned && clustermgtd_ok ? true : false
+    end
+
+    # Runs share_compute_fleet_dna.py with --cleanup; returns the boolean result of the command runner.
+    def cleanup_dna_files
+      command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_compute_fleet_dna.py --region #{cluster_attributes['region']} --cleanup"
+      command_runner.run_with_retries(command, description: "cleanup DNA files")
+    end
+
+    # Starts clustermgtd through supervisorctl; returns the boolean result of the command runner.
+    def start_clustermgtd
+      command = "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
+      command_runner.run_with_retries(command, description: "start clustermgtd")
+    end
+
+    def cluster_attributes
+      run_status.node['cluster']
+    end
+
+    def node_type
+      cluster_attributes['node_type']
+    end
+
+    def cookbook_virtualenv_path
+      "#{cluster_attributes['system_pyenv_root']}/versions/#{cluster_attributes['python-version']}/envs/cookbook_virtualenv"
+    end
+
+    # True when the named execute resource ended with status :updated or :up_to_date.
+    def resource_succeeded?(resource_name)
+      %i(updated up_to_date).include?(resource_status(resource_name))
+    end
+
+    # Status of the first recorded execute resource with the given name, or :not_executed when none is recorded.
+    def resource_status(resource_name)
+      # Use action_collection directly (inherited from Chef::Handler)
+      # NOTE(review): presumably the collection can be missing when the run fails very early; treated as "not executed".
+      action_records = action_collection&.filtered_collection || []
+      record = action_records.find { |r| r.new_resource.resource_name == :execute && r.new_resource.name == resource_name }
+      record ? record.status : :not_executed
+    end
+
+    def command_runner
+      @command_runner ||= CommandRunner.new(log_prefix: log_prefix)
+    end
+
+    def log_prefix
+      @log_prefix ||= "#{self.class.name}:"
+    end
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb
index f69aa24530..a2bf3cc6b3 100644
--- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb
+++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb
@@ -11,6 +11,11 @@
 # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.
+
+chef_handler 'ErrorHandlers::UpdateFailureHandler' do # handler class lives in libraries/update_failure_handler.rb
+  type exception: true # register it as an exception handler only
+end
+
 include_recipe "aws-parallelcluster-shared::setup_envars"
 
 # Fetch and load cluster configs
diff --git a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/command_runner_spec.rb b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/command_runner_spec.rb
new file mode 100644
index 0000000000..f5d8937017
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/command_runner_spec.rb
@@ -0,0 +1,128 @@
+# frozen_string_literal: true
+
+# Copyright:: 2025 Amazon.com, Inc. and its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+require_relative '../../spec_helper'
+require_relative '../../../libraries/command_runner'
+
+describe ErrorHandlers::CommandRunner do
+  let(:log_prefix) { 'TestPrefix:' }
+  let(:runner) { described_class.new(log_prefix: log_prefix) }
+  let(:command) { 'test command' }
+  let(:description) { 'test operation' }
+  let(:shell_out_result) { double('shell_out_result', exitstatus: 0, stdout: 'success', stderr: '') }
+
+  before do
+    allow(runner).to receive(:shell_out).and_return(shell_out_result) # default: every attempt succeeds, no real command runs
+    allow(runner).to receive(:sleep) # retry delays never actually wait
+  end
+
+  describe '#run_with_retries' do
+    context 'when command succeeds on first attempt' do
+      it 'returns true and does not retry' do
+        expect(runner).to receive(:shell_out).once.and_return(shell_out_result)
+        expect(runner).not_to receive(:sleep)
+        expect(runner.run_with_retries(command, description: description)).to be true
+      end
+
+      it 'logs stdout and stderr' do
+        allow(Chef::Log).to receive(:info)
+        expect(Chef::Log).to receive(:info).with(/Command stdout: success/)
+        expect(Chef::Log).to receive(:info).with(/Command stderr:/)
+        runner.run_with_retries(command, description: description)
+      end
+
+      it 'logs success message' do
+        allow(Chef::Log).to receive(:info)
+        expect(Chef::Log).to receive(:info).with(/Successfully executed: test operation/)
+        runner.run_with_retries(command, description: description)
+      end
+    end
+
+    context 'when command fails then succeeds' do
+      let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') }
+
+      it 'retries and returns true on success' do
+        expect(runner).to receive(:shell_out).and_return(failed_result, shell_out_result) # first call fails, second succeeds
+        expect(runner).to receive(:sleep).with(90).once # 90 is DEFAULT_RETRY_DELAY
+        expect(runner.run_with_retries(command, description: description, retries: 1)).to be true
+      end
+
+      it 'logs retry message' do
+        allow(runner).to receive(:shell_out).and_return(failed_result, shell_out_result)
+        allow(Chef::Log).to receive(:info)
+        allow(Chef::Log).to receive(:warn)
+        expect(Chef::Log).to receive(:info).with(/Retrying in 90 seconds/)
+        runner.run_with_retries(command, description: description, retries: 1)
+      end
+    end
+
+    context 'when command fails all attempts' do
+      let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') }
+
+      it 'returns false after exhausting retries' do
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        expect(runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)).to be false
+      end
+
+      it 'logs error after all attempts fail' do
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        expect(Chef::Log).to receive(:error).with(/Failed to test operation after 2 attempts/)
+        runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)
+      end
+
+      it 'logs warning for each failed attempt' do
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        allow(Chef::Log).to receive(:info)
+        allow(Chef::Log).to receive(:error)
+        expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 1/2\)})
+        expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 2/2\)})
+        runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)
+      end
+    end
+
+    context 'with custom retry parameters' do
+      it 'respects custom retries count' do
+        failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        expect(runner).to receive(:shell_out).exactly(3).times # 2 retries + 1 initial = 3 attempts
+        runner.run_with_retries(command, description: description, retries: 2, retry_delay: 0)
+      end
+
+      it 'respects custom retry delay' do
+        failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
+        allow(runner).to receive(:shell_out).and_return(failed_result, shell_out_result)
+        expect(runner).to receive(:sleep).with(30).once
+        runner.run_with_retries(command, description: description, retries: 1, retry_delay: 30)
+      end
+
+      it 'respects custom timeout' do
+        expect(runner).to receive(:shell_out).with(command, timeout: 60).and_return(shell_out_result)
+        runner.run_with_retries(command, description: description, timeout: 60)
+      end
+    end
+
+    context 'with default parameters' do
+      it 'uses DEFAULT_RETRIES' do
+        failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
+        allow(runner).to receive(:shell_out).and_return(failed_result)
+        expect(runner).to receive(:shell_out).exactly(11).times # 10 retries + 1 initial = 11 attempts
+        runner.run_with_retries(command, description: description, retry_delay: 0)
+      end
+
+      it 'uses DEFAULT_TIMEOUT' do
+        expect(runner).to receive(:shell_out).with(command, timeout: 30).and_return(shell_out_result) # 30 is DEFAULT_TIMEOUT
+        runner.run_with_retries(command, description: description)
+      end
+    end
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/update_failure_handler_spec.rb b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/update_failure_handler_spec.rb
new file mode 100644
index 0000000000..ba8d7db92d
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/update_failure_handler_spec.rb
@@ -0,0 +1,222 @@
+# frozen_string_literal: true
+
+# Copyright:: 2025 Amazon.com, Inc. and its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+require_relative '../../spec_helper'
+require_relative '../../../libraries/update_failure_handler'
+
+describe ErrorHandlers::UpdateFailureHandler do
+  let(:handler) { described_class.new }
+  let(:exception) { StandardError.new('Test error') }
+  let(:resource1) { double('resource1', to_s: 'file[/tmp/test]') }
+  let(:updated_resources) { [resource1] }
+  let(:action_collection) { double('action_collection') }
+  let(:pyenv_root) { '/opt/parallelcluster/pyenv' }
+  let(:python_version) { '3.9.0' }
+  let(:scripts_dir) { '/opt/parallelcluster/scripts' }
+  let(:region) { 'us-east-1' }
+  let(:virtualenv_path) { "#{pyenv_root}/versions/#{python_version}/envs/cookbook_virtualenv" }
+  let(:node) do
+    {
+      'cluster' => {
+        'node_type' => node_type,
+        'system_pyenv_root' => pyenv_root,
+        'python-version' => python_version,
+        'scripts_dir' => scripts_dir,
+        'region' => region,
+      },
+    }
+  end
+  let(:node_type) { 'HeadNode' }
+  let(:run_status) { double('run_status', exception: exception, updated_resources: updated_resources, node: node) }
+  let(:scontrol_resource_name) { 'reload config for running nodes' } # same literal the handler uses in #run_recovery
+  let(:command_runner) { instance_double(ErrorHandlers::CommandRunner) }
+
+  before do
+    allow(handler).to receive(:run_status).and_return(run_status)
+    allow(handler).to receive(:action_collection).and_return(action_collection)
+    allow(action_collection).to receive(:filtered_collection).and_return([]) # default: no resource was recorded
+    allow(handler).to receive(:command_runner).and_return(command_runner) # no shell command is ever executed
+    allow(command_runner).to receive(:run_with_retries).and_return(true)
+  end
+
+  describe '#node_type' do
+    it 'returns the node type from cluster attributes' do
+      expect(handler.node_type).to eq('HeadNode')
+    end
+  end
+
+  describe '#cookbook_virtualenv_path' do
+    it 'constructs the correct virtualenv path' do
+      expect(handler.cookbook_virtualenv_path).to eq(virtualenv_path)
+    end
+  end
+
+  describe '#report' do
+    context 'when node type is HeadNode' do
+      it 'writes error report and runs recovery commands' do
+        expect(handler).to receive(:write_error_report)
+        expect(handler).to receive(:run_recovery)
+        handler.report
+      end
+
+      it 'catches and logs exceptions during recovery' do
+        allow(handler).to receive(:write_error_report).and_raise(StandardError.new('Recovery failed'))
+        expect(Chef::Log).to receive(:error).with(/Failed with error: Recovery failed/)
+        expect(Chef::Log).to receive(:error).with(/Backtrace:/)
+        handler.report
+      end
+    end
+
+    context 'when node type is not HeadNode' do
+      let(:node_type) { 'ComputeFleet' }
+
+      it 'skips recovery and returns early' do
+        expect(handler).not_to receive(:write_error_report)
+        expect(handler).not_to receive(:run_recovery)
+        allow(Chef::Log).to receive(:info)
+        expect(Chef::Log).to receive(:info).with(/Node type is ComputeFleet/)
+        handler.report
+      end
+    end
+  end
+
+  describe '#write_error_report' do
+    it 'logs the exception and updated resources' do
+      expect(Chef::Log).to receive(:info).with(/Update failed on HeadNode due to: Test error/)
+      expect(Chef::Log).to receive(:info).with(/Resources that have been successfully executed/)
+      expect(Chef::Log).to receive(:info).with(%r{file\[/tmp/test\]})
+      handler.write_error_report
+    end
+  end
+
+  describe '#run_recovery' do
+    context 'when scontrol reconfigure succeeded' do
+      let(:reload_resource) { double('reload_resource', resource_name: :execute, name: scontrol_resource_name) }
+      let(:action_record) { double('action_record', new_resource: reload_resource, status: :updated) }
+
+      before do
+        allow(action_collection).to receive(:filtered_collection).and_return([action_record])
+      end
+
+      it 'cleans up DNA files and starts clustermgtd' do
+        expect(handler).to receive(:cleanup_dna_files)
+        expect(handler).to receive(:start_clustermgtd)
+        handler.run_recovery
+      end
+    end
+
+    context 'when scontrol reconfigure did not succeed' do
+      it 'cleans up DNA files but does not start clustermgtd' do # relies on the empty filtered_collection from the top-level before block
+        expect(handler).to receive(:cleanup_dna_files)
+        expect(handler).not_to receive(:start_clustermgtd)
+        handler.run_recovery
+      end
+    end
+  end
+
+  describe '#cleanup_dna_files' do
+    it 'runs the cleanup command with correct arguments' do
+      expected_command = "#{virtualenv_path}/bin/python #{scripts_dir}/share_compute_fleet_dna.py --region #{region} --cleanup"
+      expect(command_runner).to receive(:run_with_retries).with(expected_command, description: "cleanup DNA files")
+      handler.cleanup_dna_files
+    end
+  end
+
+  describe '#start_clustermgtd' do
+    it 'runs the supervisorctl command' do
+      expected_command = "#{virtualenv_path}/bin/supervisorctl start clustermgtd"
+      expect(command_runner).to receive(:run_with_retries).with(expected_command, description: "start clustermgtd")
+      handler.start_clustermgtd
+    end
+  end
+
+  describe '#command_runner' do
+    before do
+      allow(handler).to receive(:command_runner).and_call_original # bypass the top-level stub to exercise the real accessor
+    end
+
+    it 'returns a CommandRunner instance' do
+      expect(handler.command_runner).to be_a(ErrorHandlers::CommandRunner)
+    end
+
+    it 'memoizes the command runner' do
+      expect(handler.command_runner).to be(handler.command_runner)
+    end
+  end
+
+  describe '#resource_succeeded?' do
+    let(:resource_name) { 'test resource' }
+    let(:test_resource) { double('test_resource', resource_name: :execute, name: resource_name) }
+
+    context 'when resource was updated' do
+      let(:action_record) { double('action_record', new_resource: test_resource, status: :updated) }
+
+      before { allow(action_collection).to receive(:filtered_collection).and_return([action_record]) }
+
+      it 'returns true' do
+        expect(handler.resource_succeeded?(resource_name)).to be true
+      end
+    end
+
+    context 'when resource was up_to_date' do
+      let(:action_record) { double('action_record', new_resource: test_resource, status: :up_to_date) }
+
+      before { allow(action_collection).to receive(:filtered_collection).and_return([action_record]) }
+
+      it 'returns true' do
+        expect(handler.resource_succeeded?(resource_name)).to be true
+      end
+    end
+
+    context 'when resource was not executed' do
+      before { allow(action_collection).to receive(:filtered_collection).and_return([]) }
+
+      it 'returns false' do
+        expect(handler.resource_succeeded?(resource_name)).to be false
+      end
+    end
+
+    context 'when resource failed' do
+      let(:action_record) { double('action_record', new_resource: test_resource, status: :failed) }
+
+      before { allow(action_collection).to receive(:filtered_collection).and_return([action_record]) }
+
+      it 'returns false' do
+        expect(handler.resource_succeeded?(resource_name)).to be false
+      end
+    end
+  end
+
+  describe '#resource_status' do
+    let(:resource_name) { 'test resource' }
+    let(:test_resource) { double('test_resource', resource_name: :execute, name: resource_name) }
+
+    context 'when resource was not executed' do
+      before { allow(action_collection).to receive(:filtered_collection).and_return([]) }
+
+      it 'returns :not_executed' do
+        expect(handler.resource_status(resource_name)).to eq(:not_executed)
+      end
+    end
+
+    context 'when resource was executed' do
+      let(:action_record) { double('action_record', new_resource: test_resource, status: :updated) }
+
+      before { allow(action_collection).to receive(:filtered_collection).and_return([action_record]) }
+
+      it 'returns the resource status' do
+        expect(handler.resource_status(resource_name)).to eq(:updated)
+      end
+    end
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb
index cb05d90530..d8cb9d4c91 100644
--- a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb
+++ b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb
@@ -63,6 +63,10 @@
                 chef_run
                 expect(@included_recipes).to eq(expected_recipes)
               end
+
+              it "enables the update failure handler" do # the handler is declared by the chef_handler resource in recipes/update.rb
+                expect(chef_run).to enable_chef_handler('ErrorHandlers::UpdateFailureHandler').with(type: { exception: true })
+              end
             end
           end
         end
diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
index d4747d502c..2f7fd1feef 100644
--- a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
+++ b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
@@ -18,6 +18,8 @@
 require 'net/http'
 require 'timeout'
 
+SCONTROL_RECONFIGURE_RESOURCE_NAME = 'reload config for running nodes' # name of the execute resource running "scontrol reconfigure"; UpdateFailureHandler repeats this literal
+
 # Verify if Scheduling section of cluster configuration and compute node bootstrap_timeout have been updated
 def are_queues_updated?
   require 'yaml'
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
index 76ba95362a..4aabeaa21c 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
@@ -262,7 +262,7 @@ def update_nodes_in_queue(strategy, queues)
   retry_delay 2
 end
 
-execute 'reload config for running nodes' do
+execute SCONTROL_RECONFIGURE_RESOURCE_NAME do
   command "#{node['cluster']['slurm']['install_dir']}/bin/scontrol reconfigure"
   retries 3
   retry_delay 5
@@ -276,7 +276,6 @@ def update_nodes_in_queue(strategy, queues)
 
 execute 'start clustermgtd' do
   command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
-  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
 end
 
 # The updated cfnconfig will be used by post update custom scripts
diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb
index f2f53d13d4..c609db63cd 100644
--- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb
+++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb
@@ -57,6 +57,12 @@
               retry_delay: 90
             )
           end
+
+          it 'starts clustermgtd unconditionally' do # the 'start clustermgtd' resource no longer carries a not_if guard
+            is_expected.to run_execute('start clustermgtd').with(
+              command: "#{cookbook_venv_path}/bin/supervisorctl start clustermgtd"
+            )
+          end
         end
       end