
Commit cb9a586

Add support for Slurm Accounting
Add support for Slurm Accounting at cookbook level:
- Add Slurm configuration parameters to set up accounting (in slurm.conf and slurmdbd.conf)
- Define new user to run the slurmdbd daemon and assign it the slurmdbd.conf files
- Create slurmdbd.service unit file
- Create script to update database password from Secrets Manager
- Enable slurmdbd and activate accounting in the slurm configuration recipe
- Add action to wait for Slurm database to become responsive
- Bootstrap slurm database with cluster, default account and users in an idempotent way

Allow update-cluster to enable and disable Slurm Accounting:
- Call the Slurm accounting configuration recipe if Slurm Accounting is enabled on an existing cluster
- Call a clean-up recipe to clear some of the Slurm Accounting configuration if it is disabled

Add unit tests for Slurm config file generation for Slurm Accounting:
- Adapt existing tests to account for additional input needed by Slurm Accounting

Update Changelog

Signed-off-by: Jacopo De Amicis <[email protected]>
1 parent 3987786 commit cb9a586
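For context, here is a minimal sketch of the cluster-configuration input this change consumes, written as the Python dictionary the config generator loads. The Uri, UserName and PasswordSecretArn keys appear in the templates and recipes below; the concrete values are hypothetical examples.

# Hypothetical excerpt of the parsed cluster config (example values only).
cluster_config = {
    "Scheduling": {
        "SlurmSettings": {
            "Database": {
                # host[:port] of the external accounting database; parsed by the uri_host/uri_port filters
                "Uri": "slurm-accounting-db.example.com:3306",
                # database user rendered into the slurmdbd configuration as StorageUser
                "UserName": "slurm_admin",
                # Secrets Manager secret read by the update_slurm_database_password.sh script
                "PasswordSecretArn": "arn:aws:secretsmanager:us-east-1:123456789012:secret:slurm-db-password",
            }
        }
    }
}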

28 files changed: +488 -6 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ This file is used to list changes made in each version of the AWS ParallelCluster

**ENHANCEMENTS**
- Add support for AWS Trainium instances.
+- Add support for Slurm Accounting.
- Upgrade third-party cookbook dependencies:
  - selinux-6.0.5 (from selinux-6.0.4)
  - nfs-5.0.0 (from nfs-2.6.4)

attributes/default.rb

Lines changed: 4 additions & 0 deletions
@@ -134,6 +134,10 @@
default['cluster']['slurm']['user_id'] = node['cluster']['reserved_base_uid'] + 1
default['cluster']['slurm']['group'] = node['cluster']['slurm']['user']
default['cluster']['slurm']['group_id'] = node['cluster']['slurm']['user_id']
+default['cluster']['slurm']['dbduser'] = 'slurmdbd'
+default['cluster']['slurm']['dbduser_id'] = node['cluster']['reserved_base_uid'] + 5
+default['cluster']['slurm']['dbdgroup'] = node['cluster']['slurm']['dbduser']
+default['cluster']['slurm']['dbdgroup_id'] = node['cluster']['slurm']['dbduser_id']
default['cluster']['slurm']['install_dir'] = "/opt/slurm"
default['cluster']['slurm']['fleet_config_path'] = "#{node['cluster']['slurm_plugin_dir']}/fleet-config.json"

cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_slurm_config_generator.py

Lines changed: 48 additions & 1 deletion
@@ -18,6 +18,7 @@
from os import makedirs, path
from socket import gethostname
from typing import Tuple
+from urllib.parse import ParseResult, urlparse

import requests
import yaml

@@ -43,6 +44,7 @@ def generate_slurm_config_files(
    no_gpu,
    compute_node_bootstrap_timeout,
    realmemory_to_ec2memory_ratio,
+    slurmdbd_user,
):
    """
    Generate Slurm configuration files.

@@ -65,6 +67,7 @@
    cluster_config = _load_cluster_config(input_file)
    head_node_config = _get_head_node_config()
    queues = cluster_config["Scheduling"]["SlurmQueues"]
+    cluster_name = next(tag["Value"] for tag in cluster_config["Tags"] if tag["Key"] == "parallelcluster:cluster-name")

    global instance_types_data
    with open(instance_types_data_path) as input_file:

@@ -86,16 +89,19 @@
        )
        is_default_queue = False

-    # Generate slurm_parallelcluster.conf, slurm_parallelcluster_gres.conf and slurm_parallelcluster_cgroup.conf
+    # Generate include files for slurm configuration files
    for template_name in [
        "slurm_parallelcluster.conf",
        "slurm_parallelcluster_gres.conf",
        "slurm_parallelcluster_cgroup.conf",
+        "slurm_parallelcluster_slurmdbd.conf",
    ]:
        _generate_slurm_parallelcluster_configs(
            queues,
            head_node_config,
            cluster_config["Scheduling"]["SlurmSettings"],
+            cluster_name,
+            slurmdbd_user,
            template_name,
            compute_node_bootstrap_timeout,
            env,

@@ -161,6 +167,8 @@ def _generate_slurm_parallelcluster_configs(
    queues,
    head_node_config,
    scaling_config,
+    cluster_name,
+    slurmdbd_user,
    template_name,
    compute_node_bootstrap_timeout,
    jinja_env,

@@ -172,6 +180,8 @@
        queues=queues,
        head_node_config=head_node_config,
        scaling_config=scaling_config,
+        cluster_name=cluster_name,
+        slurmdbd_user=slurmdbd_user,
        compute_node_bootstrap_timeout=compute_node_bootstrap_timeout,
        output_dir=output_dir,
    )

@@ -195,6 +205,8 @@ def _get_jinja_env(template_directory, realmemory_to_ec2memory_ratio):
        _realmemory,
        realmemory_to_ec2memory_ratio=realmemory_to_ec2memory_ratio,
    )
+    env.filters["uri_host"] = functools.partial(_parse_uri, attr="host")
+    env.filters["uri_port"] = functools.partial(_parse_uri, attr="port")

    return env

@@ -307,6 +319,39 @@ def _realmemory(compute_resource, realmemory_to_ec2memory_ratio) -> int:
    return realmemory


+def _parse_netloc(uri: str, uri_parse: ParseResult, attr: str) -> str:
+    try:
+        netloc = uri_parse.netloc
+    except ValueError as e:
+        error_msg = f"Failure to parse uri with error '{str(e)}'. Please review the provided URI ('{uri}')"
+        log.critical(error_msg)
+        raise CriticalError(error_msg)
+    if not netloc:
+        error_msg = f"Invalid URI specified. Please review the provided URI ('{uri}')"
+        log.critical(error_msg)
+        raise CriticalError(error_msg)
+    if attr == "host":
+        ret = uri_parse.hostname
+    elif attr == "port":
+        ret = uri_parse.port
+        # Provide default MySQL port if port is not explicitly set
+        if not ret:
+            ret = "3306"
+    return ret
+
+
+def _parse_uri(uri, attr) -> str:
+    """Get a host or port from a URI/URL using urlparse."""
+    uri_parse = urlparse(uri)
+    if not uri_parse.netloc:
+        # This happens if users provide a URI without explicit scheme followed by ://
+        # (for example 'test.example.com:3306' instead of 'mysql://test.example.com:3306').
+        uri_parse = urlparse("//" + uri)
+
+    # Parse netloc to get hostname or port
+    return _parse_netloc(uri, uri_parse, attr)
+
+
def _write_rendered_template_to_file(rendered_template, filename):
    log.info("Writing contents of %s", filename)
    with open(filename, "w") as output_file:

@@ -399,6 +444,7 @@ def memory_ratio_float(arg):
            help="Configure ratio between RealMemory and memory advertised by EC2",
            required=True,
        )
+        parser.add_argument("--slurmdbd-user", help="User for the slurmdbd service.", required=True)
        args = parser.parse_args()
        generate_slurm_config_files(
            args.output_directory,

@@ -409,6 +455,7 @@
            args.no_gpu,
            args.compute_node_bootstrap_timeout,
            args.realmemory_to_ec2memory_ratio,
+            args.slurmdbd_user,
        )
    except Exception as e:
        log.exception("Failed to generate slurm configurations, exception: %s", e)
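As a quick illustration of the behavior the new uri_host/uri_port filters rely on, here is a minimal standalone sketch of the same urlparse logic (not the committed helpers themselves), showing the scheme-less re-parse and the 3306 default; the hostnames are made up.

from urllib.parse import urlparse


def uri_host_and_port(uri):
    """Simplified stand-in for the uri_host/uri_port filters."""
    parsed = urlparse(uri)
    if not parsed.netloc:
        # Scheme-less URIs such as 'test.example.com:3306' leave netloc empty,
        # so re-parse with a leading '//' to move host and port into netloc.
        parsed = urlparse("//" + uri)
    # urlparse returns the port as an int (or None); default to MySQL's 3306.
    return parsed.hostname, parsed.port or 3306


print(uri_host_and_port("mysql://test.example.com:3306"))  # ('test.example.com', 3306)
print(uri_host_and_port("test.example.com"))               # ('test.example.com', 3306)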

cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf

Lines changed: 7 additions & 0 deletions
@@ -11,6 +11,13 @@ SelectTypeParameters=CR_CPU_Memory
{% else %}
SelectTypeParameters=CR_CPU
{% endif %}
+{% if scaling_config.Database.Uri is defined %}
+AccountingStorageType=accounting_storage/slurmdbd
+AccountingStorageHost={{ head_node_config.head_node_hostname }}
+AccountingStoragePort=6819
+AccountingStorageUser={{ slurmdbd_user }}
+JobAcctGatherType=jobacct_gather/cgroup
+{% endif %}

{% for queue in queues %}
include {{ output_dir }}/pcluster/slurm_parallelcluster_{{ queue.Name }}_partition.conf
cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster_slurmdbd.conf

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+# slurm_parallelcluster_slurmdbd.conf is managed by the pcluster processes.
+# Do not modify.
+# Please add user-specific slurmdbd configuration options in slurmdbd.conf
+{% if scaling_config.Database.Uri is defined %}
+DbdHost={{ head_node_config.head_node_hostname }}
+StorageHost={{ scaling_config.Database.Uri | uri_host }}
+StoragePort={{ scaling_config.Database.Uri | uri_port }}
+{# Dashes in StorageLoc cause issues with the database creation #}
+StorageLoc={{ cluster_name | replace("-", "_") }}
+StorageUser={{ scaling_config.Database.UserName }}
+StoragePass=dummy
+{% endif %}
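A small sketch of why the StorageLoc line uses the replace filter: slurmdbd derives the accounting database name from StorageLoc, and dashes in it break database creation, so the cluster name is normalized. The snippet below renders a reduced version of the template with made-up values; it is illustrative only, not the committed template.

from jinja2 import Template

snippet = Template(
    "{% if db.Uri is defined %}"
    "StorageLoc={{ cluster_name | replace('-', '_') }}\n"
    "StorageUser={{ db.UserName }}\n"
    "{% endif %}"
)
print(snippet.render(
    db={"Uri": "db.example.com:3306", "UserName": "slurm_admin"},
    cluster_name="my-test-cluster",
))
# StorageLoc=my_test_cluster
# StorageUser=slurm_admin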
cookbooks/aws-parallelcluster-slurm/recipes/clear_slurm_accounting.rb

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+#
+# Cookbook:: aws-parallelcluster-slurm
+# Recipe:: clear_slurm_accounting
+#
+# Copyright:: 2013-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+file "#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh" do
+  action :delete
+end
+
+service "slurmdbd" do
+  supports restart: false
+  action %i(disable stop)
+end

cookbooks/aws-parallelcluster-slurm/recipes/config_head_node.rb

Lines changed: 17 additions & 1 deletion
@@ -68,7 +68,8 @@
          " --output-directory #{node['cluster']['slurm']['install_dir']}/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
          " --input-file #{node['cluster']['cluster_config_path']} --instance-types-data #{node['cluster']['instance_types_data_path']}"\
          " --compute-node-bootstrap-timeout #{node['cluster']['compute_node_bootstrap_timeout']} #{no_gpu}"\
-          " --realmemory-to-ec2memory-ratio #{node['cluster']['realmemory_to_ec2memory_ratio']}"
+          " --realmemory-to-ec2memory-ratio #{node['cluster']['realmemory_to_ec2memory_ratio']}"\
+          " --slurmdbd-user #{node['cluster']['slurm']['dbduser']}"
end

# Generate pcluster fleet config

@@ -192,6 +193,21 @@
  action :create
end

+template '/etc/systemd/system/slurmdbd.service' do
+  source 'slurm/head_node/slurmdbd.service.erb'
+  owner 'root'
+  group 'root'
+  mode '0644'
+  action :create
+end
+
+ruby_block "Configure Slurm Accounting" do
+  block do
+    run_context.include_recipe "aws-parallelcluster-slurm::config_slurm_accounting"
+  end
+  not_if { node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil? }
+end
+
service "slurmctld" do
  supports restart: false
  action %i(enable start)
cookbooks/aws-parallelcluster-slurm/recipes/config_slurm_accounting.rb

Lines changed: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
+# frozen_string_literal: true
+
+#
+# Cookbook:: aws-parallelcluster-slurm
+# Recipe:: config_slurm_accounting
+#
+# Copyright:: 2013-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+template "#{node['cluster']['slurm']['install_dir']}/etc/slurmdbd.conf" do
+  source 'slurm/slurmdbd.conf.erb'
+  owner "#{node['cluster']['slurm']['dbduser']}"
+  group "#{node['cluster']['slurm']['dbdgroup']}"
+  mode '0600'
+  # Do not overwrite possible user customization if the database credentials are updated
+  not_if { ::File.exist?("#{node['cluster']['slurm']['install_dir']}/etc/slurmdbd.conf") }
+end
+
+file "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_slurmdbd.conf" do
+  owner "#{node['cluster']['slurm']['dbduser']}"
+  group "#{node['cluster']['slurm']['dbdgroup']}"
+  mode '0600'
+end
+
+template "#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh" do
+  source 'slurm/head_node/update_slurm_database_password.sh.erb'
+  owner 'root'
+  group 'root'
+  mode '0700'
+  variables(
+    secret_arn: lazy { node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database, :PasswordSecretArn) },
+    region: node['cluster']['region']
+  )
+  sensitive true
+end
+
+execute "update Slurm database password" do
+  user 'root'
+  group 'root'
+  command "#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh"
+end
+
+service "slurmdbd" do
+  supports restart: false
+  action %i(enable start)
+end
+
+# After starting slurmdbd the database may not be fully responsive yet and
+# its bootstrapping may fail. We need to wait for sacctmgr to successfully
+# query the database before proceeding.
+execute "wait for slurm database" do
+  command "#{node['cluster']['slurm']['install_dir']}/bin/sacctmgr show clusters -Pn"
+  # Very large value to simulate infinite loop (we will hit some other timeout
+  # before this).
+  retries 100000
+  retry_delay 10
+end
+
+bash "bootstrap slurm database" do
+  user 'root'
+  group 'root'
+  code <<-BOOTSTRAP
+    SACCTMGR_CMD=#{node['cluster']['slurm']['install_dir']}/bin/sacctmgr
+    CLUSTER_NAME=#{node['cluster']['stack_name']}
+    DEF_ACCOUNT=pcdefault
+    SLURM_USER=#{node['cluster']['slurm']['user']}
+    DEF_USER=#{node['cluster']['cluster_user']}
+
+    # Add cluster to database if it is not present yet
+    [[ $($SACCTMGR_CMD show clusters -Pn cluster=$CLUSTER_NAME | grep $CLUSTER_NAME) ]] || \
+      $SACCTMGR_CMD -iQ add cluster $CLUSTER_NAME
+
+    # Add account-cluster association to database if it is not present yet
+    [[ $($SACCTMGR_CMD list associations -Pn cluster=$CLUSTER_NAME account=$DEF_ACCOUNT format=account | grep $DEF_ACCOUNT) ]] || \
+      $SACCTMGR_CMD -iQ add account $DEF_ACCOUNT Cluster=$CLUSTER_NAME \
+        Description="ParallelCluster default account" Organization="none"
+
+    # Add user-account associations to database if they are not present yet
+    [[ $($SACCTMGR_CMD list associations -Pn cluster=$CLUSTER_NAME account=$DEF_ACCOUNT user=$SLURM_USER format=user | grep $SLURM_USER) ]] || \
+      $SACCTMGR_CMD -iQ add user $SLURM_USER Account=$DEF_ACCOUNT AdminLevel=Admin
+    [[ $($SACCTMGR_CMD list associations -Pn cluster=$CLUSTER_NAME account=$DEF_ACCOUNT user=$DEF_USER format=user | grep $DEF_USER) ]] || \
+      $SACCTMGR_CMD -iQ add user $DEF_USER Account=$DEF_ACCOUNT AdminLevel=Admin
+
+    # sacctmgr might throw errors if the DEF_ACCOUNT is not associated to a cluster already defined on the database.
+    # This is not important for the scope of this script, so we return 0.
+    exit 0
+  BOOTSTRAP
+end
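The update_slurm_database_password.sh.erb template itself is not part of the hunks shown here. As a rough sketch of what such a script has to do, the Python below fetches the secret and swaps it into the StoragePass=dummy placeholder rendered by the slurmdbd template; the path, region, secret ARN, and the assumption that the secret's SecretString is the plain database password are all hypothetical.

import re

import boto3

CONF_PATH = "/opt/slurm/etc/slurm_parallelcluster_slurmdbd.conf"  # install_dir from attributes/default.rb
SECRET_ARN = "arn:aws:secretsmanager:us-east-1:123456789012:secret:slurm-db-password"
REGION = "us-east-1"

# Read the database password from AWS Secrets Manager.
secrets = boto3.client("secretsmanager", region_name=REGION)
password = secrets.get_secret_value(SecretId=SECRET_ARN)["SecretString"]

# Replace the StoragePass placeholder with the real password.
with open(CONF_PATH) as conf_file:
    conf = conf_file.read()
conf = re.sub(r"^StoragePass=.*$", f"StoragePass={password}", conf, flags=re.MULTILINE)
with open(CONF_PATH, "w") as conf_file:
    conf_file.write(conf)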

cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb

Lines changed: 17 additions & 0 deletions
@@ -50,6 +50,23 @@
  shell '/bin/bash'
end

+# Setup slurmdbd group
+group node['cluster']['slurm']['dbdgroup'] do
+  comment 'slurm group'
+  gid node['cluster']['slurm']['dbdgroup_id']
+  system true
+end
+
+# Setup slurmdbd user
+user node['cluster']['slurm']['dbduser'] do
+  comment 'slurmdbd user'
+  uid node['cluster']['slurm']['dbduser_id']
+  gid node['cluster']['slurm']['dbdgroup_id']
+  manage_home false
+  system true
+  shell '/bin/bash'
+end
+
include_recipe 'aws-parallelcluster-slurm::install_jwt'

slurm_tarball = "#{node['cluster']['sources_dir']}/slurm-#{node['cluster']['slurm']['version']}.tar.gz"

cookbooks/aws-parallelcluster-slurm/recipes/update_head_node.rb

Lines changed: 13 additions & 1 deletion
@@ -154,7 +154,8 @@ def update_nodes_in_queue(strategy, queues)
          " --instance-types-data #{node['cluster']['instance_types_data_path']}" \
          " --compute-node-bootstrap-timeout #{node['cluster']['compute_node_bootstrap_timeout']}" \
          " #{nvidia_installed? ? '' : '--no-gpu'}"\
-          " --realmemory-to-ec2memory-ratio #{node['cluster']['realmemory_to_ec2memory_ratio']}"
+          " --realmemory-to-ec2memory-ratio #{node['cluster']['realmemory_to_ec2memory_ratio']}"\
+          " --slurmdbd-user #{node['cluster']['slurm']['dbduser']}"
  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? }
end

@@ -171,6 +172,17 @@ def update_nodes_in_queue(strategy, queues)
  replace_only true
end

+ruby_block "Update Slurm Accounting" do
+  block do
+    if node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil?
+      run_context.include_recipe "aws-parallelcluster-slurm::clear_slurm_accounting"
+    else
+      run_context.include_recipe "aws-parallelcluster-slurm::config_slurm_accounting"
+    end
+  end
+  only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
+end
+
service 'slurmctld' do
  action :restart
  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? }
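The is_slurm_database_updated? guard is defined elsewhere in the cookbook and is not shown in this diff; presumably it compares the Database section of the previous and current cluster configs. A hypothetical Python rendition of that comparison, just to make the update flow concrete:

import yaml


def database_section(config_path):
    """Return Scheduling/SlurmSettings/Database from a cluster config file, or None."""
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file) or {}
    return config.get("Scheduling", {}).get("SlurmSettings", {}).get("Database")


def slurm_database_updated(previous_path, current_path):
    # A change in either direction (added, removed, or edited) triggers the
    # "Update Slurm Accounting" ruby_block above.
    return database_section(previous_path) != database_section(current_path)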
