diff --git a/CHANGELOG.md b/CHANGELOG.md index 9029793616..bd7d9793c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste 3.14.1 ------ +**ENHANCEMENTS** +- Ensure clustermgtd runs after cluster update. On success, start it unconditionally. On failure, start it if the queue reconfiguration succeeded. + **CHANGES** - Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes and achieve better performance at scale. @@ -27,6 +30,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Rdma-core: rdma-core-59.0-1 - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11 +**BUG FIXES** +- Fix race condition where compute nodes could deploy the wrong cluster config version after an update failure. + 3.14.0 ------ diff --git a/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb b/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb new file mode 100644 index 0000000000..b5f7c88fb9 --- /dev/null +++ b/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +module ErrorHandlers + # Executes shell commands with retry logic and logging. 
+ class CommandRunner + include Chef::Mixin::ShellOut + + DEFAULT_RETRIES = 10 + DEFAULT_RETRY_DELAY = 90 + DEFAULT_TIMEOUT = 30 + + def initialize(log_prefix:) + @log_prefix = log_prefix + end + + def run_with_retries(command, description:, retries: DEFAULT_RETRIES, retry_delay: DEFAULT_RETRY_DELAY, timeout: DEFAULT_TIMEOUT) + Chef::Log.info("#{@log_prefix} Executing: #{description}") + max_attempts = retries + 1 + + max_attempts.times do |attempt| + attempt_num = attempt + 1 + Chef::Log.info("#{@log_prefix} Running command (attempt #{attempt_num}/#{max_attempts}): #{command}") + result = shell_out(command, timeout: timeout) + Chef::Log.info("#{@log_prefix} Command stdout: #{result.stdout}") + Chef::Log.info("#{@log_prefix} Command stderr: #{result.stderr}") + + if result.exitstatus == 0 + Chef::Log.info("#{@log_prefix} Successfully executed: #{description}") + return true + end + + Chef::Log.warn("#{@log_prefix} Failed to #{description} (attempt #{attempt_num}/#{max_attempts})") + + if attempt_num < max_attempts + Chef::Log.info("#{@log_prefix} Retrying in #{retry_delay} seconds...") + sleep(retry_delay) + end + end + + Chef::Log.error("#{@log_prefix} Failed to #{description} after #{max_attempts} attempts") + false + end + end +end diff --git a/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb b/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb new file mode 100644 index 0000000000..18dcad4382 --- /dev/null +++ b/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb @@ -0,0 +1,114 @@ +# frozen_string_literal: true + +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. 
require 'chef/handler'
require_relative 'command_runner'

module ErrorHandlers
  # Chef exception handler for cluster update failures.
  #
  # This handler is triggered when the update recipe fails. It performs recovery actions
  # to restore the cluster to a consistent state:
  #   1. Logs information about the update failure including which resources succeeded before failure
  #   2. Cleans up DNA files shared with compute nodes
  #   3. Starts clustermgtd if scontrol reconfigure succeeded
  #
  # Recovery is best effort: a step that raises is logged and does not prevent the following steps
  # from running, and no error is propagated out of +report+.
  #
  # Only runs on HeadNode - compute and login nodes skip this handler.
  class UpdateFailureHandler < Chef::Handler
    # Handler entry point: logs the failure and runs the recovery steps on the head node.
    # Any error raised while doing so is logged and swallowed.
    def report
      Chef::Log.info("#{log_prefix} Started")

      unless node_type == 'HeadNode'
        Chef::Log.info("#{log_prefix} Node type is #{node_type}, recovery from update failure only executes on the HeadNode")
        return
      end

      begin
        write_error_report
        run_recovery
        Chef::Log.info("#{log_prefix} Completed successfully")
      rescue => e
        Chef::Log.error("#{log_prefix} Failed with error: #{e.message}")
        # backtrace is nil for an exception object that was never raised
        Chef::Log.error("#{log_prefix} Backtrace: #{Array(e.backtrace).join("\n")}")
      end
    end

    # Logs the exception that failed the run and every resource reported as updated before the failure.
    def write_error_report
      Chef::Log.info("#{log_prefix} Update failed on #{node_type} due to: #{run_status.exception}")
      Chef::Log.info("#{log_prefix} Resources that have been successfully executed before the failure:")
      run_status.updated_resources.each do |resource|
        Chef::Log.info("#{log_prefix} - #{resource}")
      end
    end

    # Runs the recovery steps in order. Each step is isolated so that an unexpected error in the
    # DNA cleanup cannot prevent clustermgtd from being started.
    def run_recovery
      Chef::Log.info("#{log_prefix} Running recovery commands")

      # Cleanup DNA files
      attempt_recovery_step('cleanup DNA files') { cleanup_dna_files }

      # Start clustermgtd if scontrol reconfigure succeeded
      # Must match SCONTROL_RECONFIGURE_RESOURCE_NAME in aws-parallelcluster-slurm/libraries/update.rb
      scontrol_reconfigure_resource_name = 'reload config for running nodes'
      Chef::Log.info("#{log_prefix} Resource '#{scontrol_reconfigure_resource_name}' has execution status: #{resource_status(scontrol_reconfigure_resource_name)}")
      if resource_succeeded?(scontrol_reconfigure_resource_name)
        Chef::Log.info("#{log_prefix} scontrol reconfigure succeeded, starting clustermgtd")
        attempt_recovery_step('start clustermgtd') { start_clustermgtd }
      else
        Chef::Log.info("#{log_prefix} scontrol reconfigure did not succeed, skipping clustermgtd start")
      end
    end

    # Invokes share_compute_fleet_dna.py with --cleanup through the command runner.
    #
    # @return [Boolean] result of CommandRunner#run_with_retries
    def cleanup_dna_files
      command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_compute_fleet_dna.py --region #{cluster_attributes['region']} --cleanup"
      command_runner.run_with_retries(command, description: "cleanup DNA files")
    end

    # Starts clustermgtd with supervisorctl through the command runner.
    #
    # @return [Boolean] result of CommandRunner#run_with_retries
    def start_clustermgtd
      command = "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
      command_runner.run_with_retries(command, description: "start clustermgtd")
    end

    # @return the 'cluster' attribute tree of the node attached to the run status
    def cluster_attributes
      run_status.node['cluster']
    end

    # @return the 'node_type' cluster attribute
    def node_type
      cluster_attributes['node_type']
    end

    # @return [String] path of the cookbook virtualenv, built from the pyenv root and python version attributes
    def cookbook_virtualenv_path
      "#{cluster_attributes['system_pyenv_root']}/versions/#{cluster_attributes['python-version']}/envs/cookbook_virtualenv"
    end

    # @param resource_name [String] name of an execute resource
    # @return [Boolean] true when the resource status is :updated or :up_to_date
    def resource_succeeded?(resource_name)
      %i(updated up_to_date).include?(resource_status(resource_name))
    end

    # @param resource_name [String] name of an execute resource
    # @return [Symbol] status of the first matching action record, or :not_executed when there is none
    def resource_status(resource_name)
      # Use action_collection directly (inherited from Chef::Handler)
      action_records = action_collection.filtered_collection
      record = action_records.find { |r| r.new_resource.resource_name == :execute && r.new_resource.name == resource_name }
      record ? record.status : :not_executed
    end

    # @return [CommandRunner] memoized runner that logs with this handler's prefix
    def command_runner
      @command_runner ||= CommandRunner.new(log_prefix: log_prefix)
    end

    # @return [String] prefix for every log line, derived from the class name
    def log_prefix
      @log_prefix ||= "#{self.class.name}:"
    end

    private

    # Runs one recovery step, turning an unexpected error into a logged failure.
    #
    # @param description [String] step name used in the error log
    # @return the block's value, or false when the block raised
    def attempt_recovery_step(description)
      yield
    rescue StandardError => e
      Chef::Log.error("#{log_prefix} Recovery step '#{description}' raised #{e.class}: #{e.message}")
      false
    end
  end
end
require_relative '../../spec_helper'
require_relative '../../../libraries/command_runner'

# Unit tests for ErrorHandlers::CommandRunner: shell_out and sleep are stubbed on the runner,
# so no command is executed and no delay is incurred.
describe ErrorHandlers::CommandRunner do
  subject(:runner) { described_class.new(log_prefix: 'TestPrefix:') }

  let(:command) { 'test command' }
  let(:description) { 'test operation' }
  let(:ok_result) { double('shell_out_result', exitstatus: 0, stdout: 'success', stderr: '') }
  let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') }

  # Calls the method under test with the shared command/description plus any overrides.
  def invoke_runner(**overrides)
    runner.run_with_retries(command, description: description, **overrides)
  end

  before do
    allow(runner).to receive(:shell_out).and_return(ok_result)
    allow(runner).to receive(:sleep)
  end

  describe '#run_with_retries' do
    context 'when command succeeds on first attempt' do
      it 'returns true and does not retry' do
        expect(runner).to receive(:shell_out).once.and_return(ok_result)
        expect(runner).not_to receive(:sleep)

        expect(invoke_runner).to be true
      end

      it 'logs stdout and stderr' do
        allow(Chef::Log).to receive(:info)
        expect(Chef::Log).to receive(:info).with(/Command stdout: success/)
        expect(Chef::Log).to receive(:info).with(/Command stderr:/)

        invoke_runner
      end

      it 'logs success message' do
        allow(Chef::Log).to receive(:info)
        expect(Chef::Log).to receive(:info).with(/Successfully executed: test operation/)

        invoke_runner
      end
    end

    context 'when command fails then succeeds' do
      it 'retries and returns true on success' do
        expect(runner).to receive(:shell_out).and_return(failed_result, ok_result)
        expect(runner).to receive(:sleep).with(90).once

        expect(invoke_runner(retries: 1)).to be true
      end

      it 'logs retry message' do
        allow(runner).to receive(:shell_out).and_return(failed_result, ok_result)
        allow(Chef::Log).to receive(:info)
        allow(Chef::Log).to receive(:warn)
        expect(Chef::Log).to receive(:info).with(/Retrying in 90 seconds/)

        invoke_runner(retries: 1)
      end
    end

    context 'when command fails all attempts' do
      before { allow(runner).to receive(:shell_out).and_return(failed_result) }

      it 'returns false after exhausting retries' do
        expect(invoke_runner(retries: 1, retry_delay: 0)).to be false
      end

      it 'logs error after all attempts fail' do
        expect(Chef::Log).to receive(:error).with(/Failed to test operation after 2 attempts/)

        invoke_runner(retries: 1, retry_delay: 0)
      end

      it 'logs warning for each failed attempt' do
        allow(Chef::Log).to receive(:info)
        allow(Chef::Log).to receive(:error)
        expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 1/2\)})
        expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 2/2\)})

        invoke_runner(retries: 1, retry_delay: 0)
      end
    end

    context 'with custom retry parameters' do
      it 'respects custom retries count' do
        allow(runner).to receive(:shell_out).and_return(failed_result)
        # 2 retries + 1 initial attempt
        expect(runner).to receive(:shell_out).exactly(3).times

        invoke_runner(retries: 2, retry_delay: 0)
      end

      it 'respects custom retry delay' do
        allow(runner).to receive(:shell_out).and_return(failed_result, ok_result)
        expect(runner).to receive(:sleep).with(30).once

        invoke_runner(retries: 1, retry_delay: 30)
      end

      it 'respects custom timeout' do
        expect(runner).to receive(:shell_out).with(command, timeout: 60).and_return(ok_result)

        invoke_runner(timeout: 60)
      end
    end

    context 'with default parameters' do
      it 'uses DEFAULT_RETRIES' do
        allow(runner).to receive(:shell_out).and_return(failed_result)
        expect(runner).to receive(:shell_out).exactly(11).times # 10 retries + 1 initial = 11 attempts

        invoke_runner(retry_delay: 0)
      end

      it 'uses DEFAULT_TIMEOUT' do
        expect(runner).to receive(:shell_out).with(command, timeout: 30).and_return(ok_result)

        invoke_runner
      end
    end
  end
end
require_relative '../../spec_helper'
require_relative '../../../libraries/update_failure_handler'

# Unit tests for ErrorHandlers::UpdateFailureHandler: run_status, action_collection and the
# command runner are replaced with doubles, so no Chef run and no shell command is involved.
describe ErrorHandlers::UpdateFailureHandler do
  subject(:handler) { described_class.new }

  let(:node_type) { 'HeadNode' }
  let(:pyenv_root) { '/opt/parallelcluster/pyenv' }
  let(:python_version) { '3.9.0' }
  let(:scripts_dir) { '/opt/parallelcluster/scripts' }
  let(:region) { 'us-east-1' }
  let(:virtualenv_path) { File.join(pyenv_root, 'versions', python_version, 'envs', 'cookbook_virtualenv') }
  let(:scontrol_resource_name) { 'reload config for running nodes' }

  let(:cluster_settings) do
    {
      'node_type' => node_type,
      'system_pyenv_root' => pyenv_root,
      'python-version' => python_version,
      'scripts_dir' => scripts_dir,
      'region' => region,
    }
  end
  let(:run_status) do
    double(
      'run_status',
      exception: StandardError.new('Test error'),
      updated_resources: [double('resource1', to_s: 'file[/tmp/test]')],
      node: { 'cluster' => cluster_settings }
    )
  end
  let(:action_collection) { double('action_collection') }
  let(:command_runner) { instance_double(ErrorHandlers::CommandRunner) }

  # Makes the action collection report the given records.
  def stub_action_records(*records)
    allow(action_collection).to receive(:filtered_collection).and_return(records)
  end

  # Builds an action record for an execute resource with the given name and status.
  def execute_record(name, status)
    resource = double('resource', resource_name: :execute, name: name)
    double('action_record', new_resource: resource, status: status)
  end

  before do
    allow(handler).to receive_messages(
      run_status: run_status,
      action_collection: action_collection,
      command_runner: command_runner
    )
    stub_action_records
    allow(command_runner).to receive(:run_with_retries).and_return(true)
  end

  describe '#node_type' do
    it 'returns the node type from cluster attributes' do
      expect(handler.node_type).to eq('HeadNode')
    end
  end

  describe '#cookbook_virtualenv_path' do
    it 'constructs the correct virtualenv path' do
      expect(handler.cookbook_virtualenv_path).to eq(virtualenv_path)
    end
  end

  describe '#report' do
    context 'when node type is HeadNode' do
      it 'writes error report and runs recovery commands' do
        expect(handler).to receive(:write_error_report)
        expect(handler).to receive(:run_recovery)

        handler.report
      end

      it 'catches and logs exceptions during recovery' do
        allow(handler).to receive(:write_error_report).and_raise(StandardError.new('Recovery failed'))
        expect(Chef::Log).to receive(:error).with(/Failed with error: Recovery failed/)
        expect(Chef::Log).to receive(:error).with(/Backtrace:/)

        handler.report
      end
    end

    context 'when node type is not HeadNode' do
      let(:node_type) { 'ComputeFleet' }

      it 'skips recovery and returns early' do
        expect(handler).not_to receive(:write_error_report)
        expect(handler).not_to receive(:run_recovery)
        allow(Chef::Log).to receive(:info)
        expect(Chef::Log).to receive(:info).with(/Node type is ComputeFleet/)

        handler.report
      end
    end
  end

  describe '#write_error_report' do
    it 'logs the exception and updated resources' do
      expect(Chef::Log).to receive(:info).with(/Update failed on HeadNode due to: Test error/)
      expect(Chef::Log).to receive(:info).with(/Resources that have been successfully executed/)
      expect(Chef::Log).to receive(:info).with(%r{file\[/tmp/test\]})

      handler.write_error_report
    end
  end

  describe '#run_recovery' do
    context 'when scontrol reconfigure succeeded' do
      before { stub_action_records(execute_record(scontrol_resource_name, :updated)) }

      it 'cleans up DNA files and starts clustermgtd' do
        expect(handler).to receive(:cleanup_dna_files)
        expect(handler).to receive(:start_clustermgtd)

        handler.run_recovery
      end
    end

    context 'when scontrol reconfigure did not succeed' do
      it 'cleans up DNA files but does not start clustermgtd' do
        expect(handler).to receive(:cleanup_dna_files)
        expect(handler).not_to receive(:start_clustermgtd)

        handler.run_recovery
      end
    end
  end

  describe '#cleanup_dna_files' do
    it 'runs the cleanup command with correct arguments' do
      expected_command = "#{virtualenv_path}/bin/python #{scripts_dir}/share_compute_fleet_dna.py --region #{region} --cleanup"
      expect(command_runner).to receive(:run_with_retries).with(expected_command, description: "cleanup DNA files")

      handler.cleanup_dna_files
    end
  end

  describe '#start_clustermgtd' do
    it 'runs the supervisorctl command' do
      expected_command = "#{virtualenv_path}/bin/supervisorctl start clustermgtd"
      expect(command_runner).to receive(:run_with_retries).with(expected_command, description: "start clustermgtd")

      handler.start_clustermgtd
    end
  end

  describe '#command_runner' do
    before { allow(handler).to receive(:command_runner).and_call_original }

    it 'returns a CommandRunner instance' do
      expect(handler.command_runner).to be_a(ErrorHandlers::CommandRunner)
    end

    it 'memoizes the command runner' do
      expect(handler.command_runner).to be(handler.command_runner)
    end
  end

  describe '#resource_succeeded?' do
    let(:resource_name) { 'test resource' }

    context 'when resource was updated' do
      before { stub_action_records(execute_record(resource_name, :updated)) }

      it 'returns true' do
        expect(handler.resource_succeeded?(resource_name)).to be true
      end
    end

    context 'when resource was up_to_date' do
      before { stub_action_records(execute_record(resource_name, :up_to_date)) }

      it 'returns true' do
        expect(handler.resource_succeeded?(resource_name)).to be true
      end
    end

    context 'when resource was not executed' do
      before { stub_action_records }

      it 'returns false' do
        expect(handler.resource_succeeded?(resource_name)).to be false
      end
    end

    context 'when resource failed' do
      before { stub_action_records(execute_record(resource_name, :failed)) }

      it 'returns false' do
        expect(handler.resource_succeeded?(resource_name)).to be false
      end
    end
  end

  describe '#resource_status' do
    let(:resource_name) { 'test resource' }

    context 'when resource was not executed' do
      before { stub_action_records }

      it 'returns :not_executed' do
        expect(handler.resource_status(resource_name)).to eq(:not_executed)
      end
    end

    context 'when resource was executed' do
      before { stub_action_records(execute_record(resource_name, :updated)) }

      it 'returns the resource status' do
        expect(handler.resource_status(resource_name)).to eq(:updated)
      end
    end
  end
end
require 'yaml' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb index 76ba95362a..4aabeaa21c 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb @@ -262,7 +262,7 @@ def update_nodes_in_queue(strategy, queues) retry_delay 2 end -execute 'reload config for running nodes' do +execute SCONTROL_RECONFIGURE_RESOURCE_NAME do command "#{node['cluster']['slurm']['install_dir']}/bin/scontrol reconfigure" retries 3 retry_delay 5 @@ -276,7 +276,6 @@ def update_nodes_in_queue(strategy, queues) execute 'start clustermgtd' do command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd" - not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? } end # The updated cfnconfig will be used by post update custom scripts diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb index f2f53d13d4..c609db63cd 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb @@ -57,6 +57,12 @@ retry_delay: 90 ) end + + it 'starts clustermgtd unconditionally' do + is_expected.to run_execute('start clustermgtd').with( + command: "#{cookbook_venv_path}/bin/supervisorctl start clustermgtd" + ) + end end end