diff --git a/README.md b/README.md
index 34c6908..d77663a 100644
--- a/README.md
+++ b/README.md
@@ -50,36 +50,55 @@ each list element:
 
 ### slurm.conf
 
-`openhpc_slurm_partitions`: Optional. List of one or more slurm partitions, default `[]`. Each partition may contain the following values:
-* `groups`: If there are multiple node groups that make up the partition, a list of group objects can be defined here.
-  Otherwise, `groups` can be omitted and the following attributes can be defined in the partition object:
-  * `name`: The name of the nodes within this group.
-  * `cluster_name`: Optional. An override for the top-level definition `openhpc_cluster_name`.
-  * `extra_nodes`: Optional. A list of additional node definitions, e.g. for nodes in this group/partition not controlled by this role. Each item should be a dict, with keys/values as per the ["NODE CONFIGURATION"](https://slurm.schedmd.com/slurm.conf.html#lbAE) docs for slurm.conf. Note the key `NodeName` must be first.
-  * `ram_mb`: Optional. The physical RAM available in each node of this group ([slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `RealMemory`) in MiB. This is set using ansible facts if not defined, equivalent to `free --mebi` total * `openhpc_ram_multiplier`.
-  * `ram_multiplier`: Optional. An override for the top-level definition `openhpc_ram_multiplier`. Has no effect if `ram_mb` is set.
+`openhpc_nodegroups`: Optional, default `[]`. List of mappings, each defining a
+unique set of homogeneous nodes:
+  * `name`: Required. Name of node group.
+  * `ram_mb`: Optional. The physical RAM available in each node of this group
+    ([slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `RealMemory`)
+    in MiB. This is set using ansible facts if not defined, equivalent to
+    `free --mebi` total * `openhpc_ram_multiplier`.
+  * `ram_multiplier`: Optional. An override for the top-level definition
+    `openhpc_ram_multiplier`. Has no effect if `ram_mb` is set.
   * `gres`: Optional. List of dicts defining [generic resources](https://slurm.schedmd.com/gres.html). Each dict must define:
     - `conf`: A string with the [resource specification](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1) but requiring the format `<name>:<type>:<number>`, e.g. `gpu:A100:2`. Note the `type` is an arbitrary string.
     - `file`: A string with the [File](https://slurm.schedmd.com/gres.conf.html#OPT_File) (path to device(s)) for this resource, e.g. `/dev/nvidia[0-1]` for the above example.
 
   Note [GresTypes](https://slurm.schedmd.com/slurm.conf.html#OPT_GresTypes) must be set in `openhpc_config` if this is used.
-
-* `default`: Optional. A boolean flag for whether this partion is the default. Valid settings are `YES` and `NO`.
-* `maxtime`: Optional. A partition-specific time limit following the format of [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `MaxTime`. The default value is
+  * `params`: Optional. Mapping of additional parameters and values for
+    [node configuration](https://slurm.schedmd.com/slurm.conf.html#lbAE).
+
+  Each nodegroup will contain hosts from an Ansible inventory group named
+  `{{ openhpc_cluster_name }}_{{ group_name }}`. Note that:
+  - Each host may only appear in one nodegroup.
+  - Hosts in a nodegroup are assumed to be homogeneous in terms of processor and memory.
+  - Hosts may have arbitrary hostnames, but these should be lowercase to avoid a
+    mismatch between inventory and actual hostname.
+  - An inventory group may be missing or empty, in which case the node group
+    contains no hosts.
+  - If the inventory group is not empty the play must contain at least one host.
+    This is used to set `Sockets`, `CoresPerSocket`, `ThreadsPerCore` and
+    optionally `RealMemory` for the nodegroup.
+
+`openhpc_partitions`: Optional, default `[]`. List of mappings, each defining a
+partition. Each partition mapping may contain:
+  * `name`: Required. Name of partition.
+  * `groups`: Optional. List of nodegroup names. If omitted, the partition name
+    is assumed to match a nodegroup name.
+  * `default`: Optional. A boolean flag for whether this partition is the default. Valid settings are `YES` and `NO`.
+  * `maxtime`: Optional. A partition-specific time limit following the format of [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `MaxTime`. The default value is
   given by `openhpc_job_maxtime`. The value should be quoted to avoid Ansible conversions.
-* `partition_params`: Optional. Mapping of additional parameters and values for [partition configuration](https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION).
-
-For each group (if used) or partition any nodes in an ansible inventory group `<cluster_name>_<group_name>` will be added to the group/partition. Note that:
-- Nodes may have arbitrary hostnames but these should be lowercase to avoid a mismatch between inventory and actual hostname.
-- Nodes in a group are assumed to be homogenous in terms of processor and memory.
-- An inventory group may be empty or missing, but if it is not then the play must contain at least one node from it (used to set processor information).
-
+  * `params`: Optional. Mapping of additional parameters and values for
+    [partition configuration](https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION).
 
 `openhpc_job_maxtime`: Maximum job time limit, default `'60-0'` (60 days). See [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `MaxTime` for format. The default is 60 days. The value should be quoted to avoid Ansible conversions.
 
 `openhpc_cluster_name`: name of the cluster.
 
-`openhpc_config`: Optional. Mapping of additional parameters and values for `slurm.conf`. Note these will override any included in `templates/slurm.conf.j2`.
+`openhpc_config`: Optional. Mapping of additional parameters and values for
+[slurm.conf](https://slurm.schedmd.com/slurm.conf.html). Keys are parameter
+names and values are lists or strings as appropriate. This can be used to
+supplement or override the template defaults, or to remove a template parameter
+by setting the value to `'omit'` - note this is the literal string, not the
+Ansible special variable.
 
-`openhpc_ram_multiplier`: Optional, default `0.95`. Multiplier used in the calculation: `total_memory * openhpc_ram_multiplier` when setting `RealMemory` for the partition in slurm.conf. Can be overriden on a per partition basis using `openhpc_slurm_partitions.ram_multiplier`. Has no effect if `openhpc_slurm_partitions.ram_mb` is set.
+`openhpc_ram_multiplier`: Optional, default `0.95`. Multiplier used in the calculation: `total_memory * openhpc_ram_multiplier` when setting `RealMemory` for the nodegroup in slurm.conf. Can be overridden on a per-nodegroup basis using `openhpc_nodegroups.ram_multiplier`. Has no effect if `openhpc_nodegroups.ram_mb` is set.
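+
+For example, a hypothetical cluster `mycluster` with a plain `general` nodegroup
+and a GPU nodegroup might be configured as below (the group names, GRES type and
+device file are purely illustrative, not defaults of this role):
+
+```yaml
+openhpc_cluster_name: mycluster
+openhpc_nodegroups:
+  - name: general               # hosts from inventory group mycluster_general
+  - name: gpu                   # hosts from inventory group mycluster_gpu
+    gres:
+      - conf: gpu:A100:2
+        file: /dev/nvidia[0-1]
+openhpc_partitions:
+  - name: general               # no `groups`, so uses the `general` nodegroup
+  - name: gpu
+    default: 'NO'
+    maxtime: '2-0'              # quoted, as for `openhpc_job_maxtime`
+openhpc_config:
+  GresTypes:
+    - gpu                       # required because a nodegroup defines `gres`
+  PriorityWeightPartition: 'omit' # remove this template default from slurm.conf
+```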
diff --git a/defaults/main.yml b/defaults/main.yml
index c806809..56b15c4 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -4,14 +4,45 @@ openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}"
 openhpc_slurm_service:
 openhpc_slurm_control_host: "{{ inventory_hostname }}"
 #openhpc_slurm_control_host_address:
-openhpc_slurm_partitions: []
+openhpc_partitions: []
+openhpc_nodegroups: []
 openhpc_cluster_name:
 openhpc_packages:
   - slurm-libpmi-ohpc
 openhpc_resume_timeout: 300
 openhpc_retry_delay: 10
 openhpc_job_maxtime: '60-0' # quote this to avoid ansible converting some formats to seconds, which is interpreted as minutes by Slurm
-openhpc_config: "{{ openhpc_extra_config | default({}) }}"
+openhpc_default_config:
+  # This only defines values which are not Slurm defaults
+  SlurmctldHost: "{{ openhpc_slurm_control_host }}{% if openhpc_slurm_control_host_address is defined %}({{ openhpc_slurm_control_host_address }}){% endif %}"
+  ProctrackType: proctrack/linuxproc # TODO: really want cgroup but needs cgroup.conf and workaround for CI
+  SlurmdSpoolDir: /var/spool/slurm # NB: not OpenHPC default!
+  SlurmUser: slurm
+  StateSaveLocation: "{{ openhpc_state_save_location }}"
+  SlurmctldTimeout: 300
+  SchedulerType: sched/backfill
+  SelectType: select/cons_tres
+  SelectTypeParameters: CR_Core
+  PriorityWeightPartition: 1000
+  PreemptType: preempt/partition_prio
+  PreemptMode: SUSPEND,GANG
+  AccountingStoragePass: "{{ openhpc_slurm_accounting_storage_pass | default('omit') }}"
+  AccountingStorageHost: "{{ openhpc_slurm_accounting_storage_host }}"
+  AccountingStoragePort: "{{ openhpc_slurm_accounting_storage_port }}"
+  AccountingStorageType: "{{ openhpc_slurm_accounting_storage_type }}"
+  AccountingStorageUser: "{{ openhpc_slurm_accounting_storage_user }}"
+  JobCompLoc: "{{ openhpc_slurm_job_comp_loc }}"
+  JobCompType: "{{ openhpc_slurm_job_comp_type }}"
+  JobAcctGatherFrequency: "{{ openhpc_slurm_job_acct_gather_frequency }}"
+  JobAcctGatherType: "{{ openhpc_slurm_job_acct_gather_type }}"
+  SlurmctldSyslogDebug: info
+  SlurmdSyslogDebug: info
+  PropagateResourceLimitsExcept: MEMLOCK
+  Epilog: /etc/slurm/slurm.epilog.clean
+  ReturnToService: 2
+  SlurmctldParameters: "{{ 'enable_configless' if openhpc_slurm_configless else 'omit' }}"
+
+openhpc_config: {}
 openhpc_gres_template: gres.conf.j2
 openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}"
diff --git a/tasks/runtime.yml b/tasks/runtime.yml
index 18d75f7..358bf95 100644
--- a/tasks/runtime.yml
+++ b/tasks/runtime.yml
@@ -80,43 +80,9 @@
     notify: Restart slurmdbd service
   when: openhpc_enable.database | default(false) | bool
 
-- name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other
-  ansible.builtin.tempfile:
-  register: _slurm_conf_tmpfile
-  delegate_to: localhost
-  when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool
-  changed_when: false # so molecule doesn't fail
-  become: no
-
-- name: Template basic slurm.conf
+- name: Template slurm.conf
   template:
     src: slurm.conf.j2
-    dest: "{{ _slurm_conf_tmpfile.path }}"
-    lstrip_blocks: true
-    mode: 0644
-  delegate_to: localhost
-  when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool
-  changed_when: false # so molecule doesn't fail
-  become: no
-
-- name: Customise slurm.conf
-  community.general.ini_file:
-    path: "{{ _slurm_conf_tmpfile.path }}"
-    option: "{{ item.key }}"
-    section: ''
-    value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}"
-    no_extra_spaces: true
-    create: no
-    mode: 0644
-  loop: "{{ openhpc_config | dict2items }}"
-  delegate_to: localhost
-  when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool
-  changed_when: false # so molecule doesn't fail
-  become: no
-
-- name: Create slurm.conf
-  copy:
-    src: "{{ _slurm_conf_tmpfile.path }}"
+    dest: /etc/slurm/slurm.conf
     owner: root
     group: root
diff --git a/templates/gres.conf.j2 b/templates/gres.conf.j2
index a6fa27b..bc23ed5 100644
--- a/templates/gres.conf.j2
+++ b/templates/gres.conf.j2
@@ -1,16 +1,11 @@
 AutoDetect=off
-{% for part in openhpc_slurm_partitions %}
-{% set nodelist = [] %}
-{% for group in part.get('groups', [part]) %}
-{% if 'gres' in group %}
-{% for gres in group.gres %}
-{% set gres_name, gres_type, _ = gres.conf.split(':') %}
-{% set group_name = group.cluster_name|default(openhpc_cluster_name) ~ '_' ~ group.name %}
-{% set inventory_group_hosts = groups.get(group_name, []) %}
-{% for hostlist in (inventory_group_hosts | hostlist_expression) %}
+{% for nodegroup in openhpc_nodegroups %}
+{% for gres in nodegroup.gres | default([]) %}
+{% set gres_name, gres_type, _ = gres.conf.split(':') %}
+{% set inventory_group_name = openhpc_cluster_name ~ '_' ~ nodegroup.name %}
+{% set inventory_group_hosts = groups.get(inventory_group_name, []) %}
+{% for hostlist in (inventory_group_hosts | hostlist_expression) %}
 NodeName={{ hostlist }} Name={{ gres_name }} Type={{ gres_type }} File={{ gres.file }}
-{% endfor %}
-{% endfor %}
-{% endif %}
-{% endfor %}
-{% endfor %}
+{% endfor %}{# hostlists #}
+{% endfor %}{# gres #}
+{% endfor %}{# nodegroup #}
diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2
index 94f0465..3fc7075 100644
--- a/templates/slurm.conf.j2
+++ b/templates/slurm.conf.j2
@@ -1,143 +1,12 @@
-#
-# Example slurm.conf file. Please run configurator.html
-# (in doc/html) to build a configuration file customized
-# for your environment.
-#
-#
-# slurm.conf file generated by configurator.html.
-#
-# See the slurm.conf man page for more information.
-#
 ClusterName={{ openhpc_cluster_name }}
-SlurmctldHost={{ openhpc_slurm_control_host }}{% if openhpc_slurm_control_host_address is defined %}({{ openhpc_slurm_control_host_address }}){% endif %}
-#DisableRootJobs=NO
-#EnforcePartLimits=NO
-#EpilogSlurmctld=
-#FirstJobId=1
-#MaxJobId=67043328
-#GresTypes=
-#GroupUpdateForce=0
-#GroupUpdateTime=600
-#JobFileAppend=0
-#JobRequeue=1
-#JobSubmitPlugins=lua
-#KillOnBadExit=0
-#LaunchType=launch/slurm
-#Licenses=foo*4,bar
-#MailProg=/bin/mail
-#MaxJobCount=10000
-#MaxStepCount=40000
-#MaxTasksPerNode=512
-MpiDefault=none
-#MpiParams=ports=#-#
-#PluginDir=
-#PlugStackConfig=
-#PrivateData=jobs
-ProctrackType=proctrack/linuxproc # TODO: really want cgroup but needs cgroup.conf and workaround for CI
-#Prolog=
-#PrologFlags=
-#PrologSlurmctld=
-#PropagatePrioProcess=0
-#PropagateResourceLimits=
-#PropagateResourceLimitsExcept=
-#RebootProgram=
-SlurmctldPidFile=/var/run/slurmctld.pid
-SlurmctldPort=6817
-SlurmdPidFile=/var/run/slurmd.pid
-SlurmdPort=6818
-SlurmdSpoolDir=/var/spool/slurm # NB: not OpenHPC default!
-SlurmUser=slurm
-#SlurmdUser=root
-#SrunEpilog=
-#SrunProlog=
-StateSaveLocation={{ openhpc_state_save_location }}
-SwitchType=switch/none
-#TaskEpilog=
-#TaskPlugin=task/affinity
-#TaskProlog=
-#TopologyPlugin=topology/tree
-#TmpFS=/tmp
-#TrackWCKey=no
-#TreeWidth=
-#UnkillableStepProgram=
-#UsePAM=0
-#
-#
-# TIMERS
-#BatchStartTimeout=10
-#CompleteWait=0
-#EpilogMsgTime=2000
-#GetEnvTimeout=2
-#HealthCheckInterval=0
-#HealthCheckProgram=
-InactiveLimit=0
-KillWait=30
-#MessageTimeout=10
-#ResvOverRun=0
-MinJobAge=300
-#OverTimeLimit=0
-SlurmctldTimeout=300
-SlurmdTimeout=300
-#UnkillableStepTimeout=60
-#VSizeFactor=0
-Waittime=0
-#
-#
-# SCHEDULING
-#DefMemPerCPU=0
-#MaxMemPerCPU=0
-#SchedulerTimeSlice=30
-SchedulerType=sched/backfill
-SelectType=select/cons_tres
-SelectTypeParameters=CR_Core
-#
-#
-# JOB PRIORITY
-#PriorityFlags=
-PriorityType=priority/multifactor
-#PriorityDecayHalfLife=
-#PriorityCalcPeriod=
-#PriorityFavorSmall=
-#PriorityMaxAge=
-#PriorityUsageResetPeriod=
-#PriorityWeightAge=
-#PriorityWeightFairshare=
-#PriorityWeightJobSize=
-PriorityWeightPartition=1000
-#PriorityWeightQOS=
-PreemptType=preempt/partition_prio
-PreemptMode=SUSPEND,GANG
-#
-# LOGGING AND ACCOUNTING
-#AccountingStorageEnforce=0
-AccountingStorageHost={{ openhpc_slurm_accounting_storage_host }}
-{% if openhpc_slurm_accounting_storage_pass | default(false, true) %}
-AccountingStoragePass={{ openhpc_slurm_accounting_storage_pass }}
-{% endif %}
-AccountingStoragePort={{ openhpc_slurm_accounting_storage_port }}
-AccountingStorageType={{ openhpc_slurm_accounting_storage_type }}
-AccountingStorageUser={{ openhpc_slurm_accounting_storage_user }}
-#AccountingStoreFlags=
-#JobCompHost=
-JobCompLoc={{ openhpc_slurm_job_comp_loc }}
-#JobCompPass=
-#JobCompPort=
-JobCompType={{ openhpc_slurm_job_comp_type }}
-#JobCompUser=
-#JobContainerType=job_container/none
-JobAcctGatherFrequency={{ openhpc_slurm_job_acct_gather_frequency }}
-JobAcctGatherType={{ openhpc_slurm_job_acct_gather_type }}
+# PARAMETERS
+{% for k, v in openhpc_default_config | combine(openhpc_config) | items %}
+{% if v != "omit" %}{# allow removing a parameter by setting its value to the literal string 'omit' #}
+{{ k }}={{ v | join(',') if (v is sequence and v is not string) else v }}
+{% endif %}
+{% endfor %}
 
-# By default, SLURM will log to syslog, which is what we want
-SlurmctldSyslogDebug=info
-SlurmdSyslogDebug=info
-#SlurmSchedLogFile=
-#SlurmSchedLogLevel=
-#DebugFlags=
-#
-#
-# POWER SAVE SUPPORT FOR IDLE NODES - NOT SUPPORTED IN THIS APPLIANCE VERSION
 
 # LOGIN-ONLY NODES
 # Define slurmd nodes not in partitions for login-only nodes in "configless" mode:
@@ -145,45 +14,43 @@ SlurmdSyslogDebug=info
 NodeName={{ node }}
 {% endfor %}{% endif %}
+
 # COMPUTE NODES
-# OpenHPC default configuration
-PropagateResourceLimitsExcept=MEMLOCK
-Epilog=/etc/slurm/slurm.epilog.clean
-{% set donehosts = [] %}
-{% for part in openhpc_slurm_partitions %}
-  {% set nodelist = [] %}
-  {% for group in part.get('groups', [part]) %}
-    {% set group_name = group.cluster_name|default(openhpc_cluster_name) ~ '_' ~ group.name %}
-# openhpc_slurm_partitions group: {{ group_name }}
-    {% set inventory_group_hosts = groups.get(group_name, []) %}
-    {% if inventory_group_hosts | length > 0 %}
-      {% set play_group_hosts = inventory_group_hosts | intersect (play_hosts) %}
-      {% set first_host = play_group_hosts | first | mandatory('Group "' ~ group_name ~ '" contains no hosts in this play - was --limit used?') %}
-      {% set first_host_hv = hostvars[first_host] %}
-      {% set ram_mb = (first_host_hv['ansible_memory_mb']['real']['total'] * (group.ram_multiplier | default(openhpc_ram_multiplier))) | int %}
-      {% for hostlist in (inventory_group_hosts | hostlist_expression) %}
-        {% set gres = ' Gres=%s' % (','.join(group.gres | map(attribute='conf') )) if 'gres' in group else '' %}
-        {% if hostlist not in donehosts %}
-NodeName={{ hostlist }} State=UNKNOWN RealMemory={{ group.get('ram_mb', ram_mb) }} Sockets={{first_host_hv['ansible_processor_count']}} CoresPerSocket={{ first_host_hv['ansible_processor_cores'] }} ThreadsPerCore={{ first_host_hv['ansible_processor_threads_per_core'] }}{{ gres }}
-        {% endif %}
-        {% set _ = nodelist.append(hostlist) %}
-        {% set _ = donehosts.append(hostlist) %}
-      {% endfor %}{# nodes #}
-    {% endif %}{# inventory_group_hosts #}
-    {% for extra_node_defn in group.get('extra_nodes', []) %}
-{{ extra_node_defn.items() | map('join', '=') | join(' ') }}
-      {% set _ = nodelist.append(extra_node_defn['NodeName']) %}
-    {% endfor %}
-  {% endfor %}{# group #}
-{% if not nodelist %}{# empty partition #}
-{% set nodelist = ['""'] %}
-{% endif %}
-PartitionName={{part.name}} Default={{ part.get('default', 'YES') }} MaxTime={{ part.get('maxtime', openhpc_job_maxtime) }} State=UP Nodes={{ nodelist | join(',') }} {{ part.partition_params | default({}) | dict2parameters }}
-{% endfor %}{# partitions #}
+{% for nodegroup in openhpc_nodegroups %}
+{% set inventory_group_name = openhpc_cluster_name ~ '_' ~ nodegroup.name %}
+{% set inventory_group_hosts = groups.get(inventory_group_name, []) %}
+{% if inventory_group_hosts | length > 0 %}
+{% set play_group_hosts = inventory_group_hosts | intersect(play_hosts) %}
+{% set first_host = play_group_hosts | first | mandatory('Inventory group "' ~ inventory_group_name ~ '" contains no hosts in this play - was --limit used?') %}
+{% set first_host_hv = hostvars[first_host] %}
+{% set ram_mb = (first_host_hv['ansible_memory_mb']['real']['total'] * (nodegroup.ram_multiplier | default(openhpc_ram_multiplier))) | int %}
+{% set hostlists = (inventory_group_hosts | hostlist_expression) %}{# hosts in inventory group aren't necessarily a single hostlist expression #}
+{% for hostlist in hostlists %}
+NodeName={{ hostlist }} {{ '' -}}
+    State=UNKNOWN {{ '' -}}
+    RealMemory={{ nodegroup.ram_mb | default(ram_mb) }} {{ '' -}}
+    Sockets={{ first_host_hv['ansible_processor_count'] }} {{ '' -}}
+    CoresPerSocket={{ first_host_hv['ansible_processor_cores'] }} {{ '' -}}
+    ThreadsPerCore={{ first_host_hv['ansible_processor_threads_per_core'] }} {{ '' -}}
+    {{ nodegroup.params | default({}) | dict2parameters }} {{ '' -}}
+    {% if 'gres' in nodegroup %}Gres={{ ','.join(nodegroup.gres | map(attribute='conf')) }}{% endif %}
+{% endfor %}{# hostlists #}
+{% endif %}{# 1 or more hosts in inventory #}
+
+NodeSet={{ nodegroup.name }} Nodes={{ ','.join(hostlists | default(['""'])) }}{# no support for creating nodesets by Feature #}
+
+{% endfor %}
 
 # Define a non-existent node, in no partition, so that slurmctld starts even with all partitions empty
 NodeName=nonesuch
 
-{% if openhpc_slurm_configless | bool %}SlurmctldParameters=enable_configless{% endif %}
+# PARTITIONS
+{% for partition in openhpc_partitions %}
+PartitionName={{ partition.name }} {{ '' -}}
+    Default={{ partition.get('default', 'YES') }} {{ '' -}}
+    MaxTime={{ partition.get('maxtime', openhpc_job_maxtime) }} {{ '' -}}
+    State=UP Nodes={{ partition.get('groups', [partition.name]) | join(',') }} {{ '' -}}
+    {{ partition.params | default({}) | dict2parameters }}
+{% endfor %}{# openhpc_partitions #}
 
-ReturnToService=2
+{% if openhpc_slurm_configless | bool %}SlurmctldParameters=enable_configless{% endif %}