Skip to content

Commit 4a68a7f

Browse files
authored
Merge branch 'release-3.14' into wip/fix-log-timestamp-3141
2 parents f415f1d + da4e9d1 commit 4a68a7f

File tree

13 files changed

+560
-12
lines changed

13 files changed

+560
-12
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
66
3.14.1
77
------
88

9+
**ENHANCEMENTS**
10+
- Ensure clustermgtd runs after cluster update. On success, start it unconditionally. On failure, start it if the queue reconfiguration succeeded.
11+
912
**CHANGES**
1013
- Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes
1114
and mitigate performance impact at scale.
@@ -27,6 +30,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2730

2831
**BUG FIXES**
2932
- Fix incorrect timestamp parsing for chef-client.log in CloudWatch Agent configuration. The timestamp format now correctly matches Chef's output format `[YYYY-MM-DDTHH:MM:SS+TZ]`.
33+
- Prevent cluster readiness check failures due to instances launched while the check is in progress.
34+
- Fix race condition where compute nodes could deploy the wrong cluster config version after an update failure.
3035

3136
3.14.0
3237
------
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# frozen_string_literal: true
2+
3+
#
4+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
7+
# License. A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
12+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
module ErrorHandlers
16+
# Executes shell commands with retry logic and logging.
17+
class CommandRunner
18+
include Chef::Mixin::ShellOut
19+
20+
DEFAULT_RETRIES = 10
21+
DEFAULT_RETRY_DELAY = 90
22+
DEFAULT_TIMEOUT = 30
23+
24+
def initialize(log_prefix:)
25+
@log_prefix = log_prefix
26+
end
27+
28+
def run_with_retries(command, description:, retries: DEFAULT_RETRIES, retry_delay: DEFAULT_RETRY_DELAY, timeout: DEFAULT_TIMEOUT)
29+
Chef::Log.info("#{@log_prefix} Executing: #{description}")
30+
max_attempts = retries + 1
31+
32+
max_attempts.times do |attempt|
33+
attempt_num = attempt + 1
34+
Chef::Log.info("#{@log_prefix} Running command (attempt #{attempt_num}/#{max_attempts}): #{command}")
35+
result = shell_out(command, timeout: timeout)
36+
Chef::Log.info("#{@log_prefix} Command stdout: #{result.stdout}")
37+
Chef::Log.info("#{@log_prefix} Command stderr: #{result.stderr}")
38+
39+
if result.exitstatus == 0
40+
Chef::Log.info("#{@log_prefix} Successfully executed: #{description}")
41+
return true
42+
end
43+
44+
Chef::Log.warn("#{@log_prefix} Failed to #{description} (attempt #{attempt_num}/#{max_attempts})")
45+
46+
if attempt_num < max_attempts
47+
Chef::Log.info("#{@log_prefix} Retrying in #{retry_delay} seconds...")
48+
sleep(retry_delay)
49+
end
50+
end
51+
52+
Chef::Log.error("#{@log_prefix} Failed to #{description} after #{max_attempts} attempts")
53+
false
54+
end
55+
end
56+
end
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# frozen_string_literal: true
2+
3+
#
4+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
7+
# License. A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
12+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
require 'chef/handler'
16+
require_relative 'command_runner'
17+
18+
module ErrorHandlers
19+
# Chef exception handler for cluster update failures.
20+
#
21+
# This handler is triggered when the update recipe fails. It performs recovery actions
22+
# to restore the cluster to a consistent state:
23+
# 1. Logs information about the update failure including which resources succeeded before failure
24+
# 2. Cleans up DNA files shared with compute nodes
25+
# 3. Starts clustermgtd if scontrol reconfigure succeeded
26+
#
27+
# Only runs on HeadNode - compute and login nodes skip this handler.
28+
class UpdateFailureHandler < Chef::Handler
29+
def report
30+
Chef::Log.info("#{log_prefix} Started")
31+
32+
unless node_type == 'HeadNode'
33+
Chef::Log.info("#{log_prefix} Node type is #{node_type}, recovery from update failure only executes on the HeadNode")
34+
return
35+
end
36+
37+
begin
38+
write_error_report
39+
run_recovery
40+
Chef::Log.info("#{log_prefix} Completed successfully")
41+
rescue => e
42+
Chef::Log.error("#{log_prefix} Failed with error: #{e.message}")
43+
Chef::Log.error("#{log_prefix} Backtrace: #{e.backtrace.join("\n")}")
44+
end
45+
end
46+
47+
def write_error_report
48+
Chef::Log.info("#{log_prefix} Update failed on #{node_type} due to: #{run_status.exception}")
49+
Chef::Log.info("#{log_prefix} Resources that have been successfully executed before the failure:")
50+
run_status.updated_resources.each do |resource|
51+
Chef::Log.info("#{log_prefix} - #{resource}")
52+
end
53+
end
54+
55+
def run_recovery
56+
Chef::Log.info("#{log_prefix} Running recovery commands")
57+
58+
# Cleanup DNA files
59+
cleanup_dna_files
60+
61+
# Start clustermgtd if scontrol reconfigure succeeded
62+
# Must match SCONTROL_RECONFIGURE_RESOURCE_NAME in aws-parallelcluster-slurm/libraries/update.rb
63+
scontrol_reconfigure_resource_name = 'reload config for running nodes'
64+
Chef::Log.info("#{log_prefix} Resource '#{scontrol_reconfigure_resource_name}' has execution status: #{resource_status(scontrol_reconfigure_resource_name)}")
65+
if resource_succeeded?(scontrol_reconfigure_resource_name)
66+
Chef::Log.info("#{log_prefix} scontrol reconfigure succeeded, starting clustermgtd")
67+
start_clustermgtd
68+
else
69+
Chef::Log.info("#{log_prefix} scontrol reconfigure did not succeed, skipping clustermgtd start")
70+
end
71+
end
72+
73+
def cleanup_dna_files
74+
command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_compute_fleet_dna.py --region #{cluster_attributes['region']} --cleanup"
75+
command_runner.run_with_retries(command, description: "cleanup DNA files")
76+
end
77+
78+
def start_clustermgtd
79+
command = "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
80+
command_runner.run_with_retries(command, description: "start clustermgtd")
81+
end
82+
83+
def cluster_attributes
84+
run_status.node['cluster']
85+
end
86+
87+
def node_type
88+
cluster_attributes['node_type']
89+
end
90+
91+
def cookbook_virtualenv_path
92+
"#{cluster_attributes['system_pyenv_root']}/versions/#{cluster_attributes['python-version']}/envs/cookbook_virtualenv"
93+
end
94+
95+
def resource_succeeded?(resource_name)
96+
%i(updated up_to_date).include?(resource_status(resource_name))
97+
end
98+
99+
def resource_status(resource_name)
100+
# Use action_collection directly (inherited from Chef::Handler)
101+
action_records = action_collection.filtered_collection
102+
record = action_records.find { |r| r.new_resource.resource_name == :execute && r.new_resource.name == resource_name }
103+
record ? record.status : :not_executed
104+
end
105+
106+
def command_runner
107+
@command_runner ||= CommandRunner.new(log_prefix: log_prefix)
108+
end
109+
110+
def log_prefix
111+
@log_prefix ||= "#{self.class.name}:"
112+
end
113+
end
114+
end

cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
1212
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
15+
chef_handler 'ErrorHandlers::UpdateFailureHandler' do
16+
type exception: true
17+
end
18+
1419
include_recipe "aws-parallelcluster-shared::setup_envars"
1520

1621
# Fetch and load cluster configs
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. and its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
6+
# License. A copy of the License is located at
7+
#
8+
# http://aws.amazon.com/apache2.0/
9+
#
10+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
11+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
require_relative '../../spec_helper'
15+
require_relative '../../../libraries/command_runner'
16+
17+
describe ErrorHandlers::CommandRunner do
18+
let(:log_prefix) { 'TestPrefix:' }
19+
let(:runner) { described_class.new(log_prefix: log_prefix) }
20+
let(:command) { 'test command' }
21+
let(:description) { 'test operation' }
22+
let(:shell_out_result) { double('shell_out_result', exitstatus: 0, stdout: 'success', stderr: '') }
23+
24+
before do
25+
allow(runner).to receive(:shell_out).and_return(shell_out_result)
26+
allow(runner).to receive(:sleep)
27+
end
28+
29+
describe '#run_with_retries' do
30+
context 'when command succeeds on first attempt' do
31+
it 'returns true and does not retry' do
32+
expect(runner).to receive(:shell_out).once.and_return(shell_out_result)
33+
expect(runner).not_to receive(:sleep)
34+
expect(runner.run_with_retries(command, description: description)).to be true
35+
end
36+
37+
it 'logs stdout and stderr' do
38+
allow(Chef::Log).to receive(:info)
39+
expect(Chef::Log).to receive(:info).with(/Command stdout: success/)
40+
expect(Chef::Log).to receive(:info).with(/Command stderr:/)
41+
runner.run_with_retries(command, description: description)
42+
end
43+
44+
it 'logs success message' do
45+
allow(Chef::Log).to receive(:info)
46+
expect(Chef::Log).to receive(:info).with(/Successfully executed: test operation/)
47+
runner.run_with_retries(command, description: description)
48+
end
49+
end
50+
51+
context 'when command fails then succeeds' do
52+
let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') }
53+
54+
it 'retries and returns true on success' do
55+
expect(runner).to receive(:shell_out).and_return(failed_result, shell_out_result)
56+
expect(runner).to receive(:sleep).with(90).once
57+
expect(runner.run_with_retries(command, description: description, retries: 1)).to be true
58+
end
59+
60+
it 'logs retry message' do
61+
allow(runner).to receive(:shell_out).and_return(failed_result, shell_out_result)
62+
allow(Chef::Log).to receive(:info)
63+
allow(Chef::Log).to receive(:warn)
64+
expect(Chef::Log).to receive(:info).with(/Retrying in 90 seconds/)
65+
runner.run_with_retries(command, description: description, retries: 1)
66+
end
67+
end
68+
69+
context 'when command fails all attempts' do
70+
let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') }
71+
72+
it 'returns false after exhausting retries' do
73+
allow(runner).to receive(:shell_out).and_return(failed_result)
74+
expect(runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)).to be false
75+
end
76+
77+
it 'logs error after all attempts fail' do
78+
allow(runner).to receive(:shell_out).and_return(failed_result)
79+
expect(Chef::Log).to receive(:error).with(/Failed to test operation after 2 attempts/)
80+
runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)
81+
end
82+
83+
it 'logs warning for each failed attempt' do
84+
allow(runner).to receive(:shell_out).and_return(failed_result)
85+
allow(Chef::Log).to receive(:info)
86+
allow(Chef::Log).to receive(:error)
87+
expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 1/2\)})
88+
expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 2/2\)})
89+
runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)
90+
end
91+
end
92+
93+
context 'with custom retry parameters' do
94+
it 'respects custom retries count' do
95+
failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
96+
allow(runner).to receive(:shell_out).and_return(failed_result)
97+
expect(runner).to receive(:shell_out).exactly(3).times
98+
runner.run_with_retries(command, description: description, retries: 2, retry_delay: 0)
99+
end
100+
101+
it 'respects custom retry delay' do
102+
failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
103+
allow(runner).to receive(:shell_out).and_return(failed_result, shell_out_result)
104+
expect(runner).to receive(:sleep).with(30).once
105+
runner.run_with_retries(command, description: description, retries: 1, retry_delay: 30)
106+
end
107+
108+
it 'respects custom timeout' do
109+
expect(runner).to receive(:shell_out).with(command, timeout: 60).and_return(shell_out_result)
110+
runner.run_with_retries(command, description: description, timeout: 60)
111+
end
112+
end
113+
114+
context 'with default parameters' do
115+
it 'uses DEFAULT_RETRIES' do
116+
failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error')
117+
allow(runner).to receive(:shell_out).and_return(failed_result)
118+
expect(runner).to receive(:shell_out).exactly(11).times # 10 retries + 1 initial = 11 attempts
119+
runner.run_with_retries(command, description: description, retry_delay: 0)
120+
end
121+
122+
it 'uses DEFAULT_TIMEOUT' do
123+
expect(runner).to receive(:shell_out).with(command, timeout: 30).and_return(shell_out_result)
124+
runner.run_with_retries(command, description: description)
125+
end
126+
end
127+
end
128+
end

0 commit comments

Comments
 (0)