diff --git a/README.md b/README.md index 769a7df..022c9ab 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,12 @@ partition configuration for each. `openhpc_cluster_name`: name of the cluster. -`openhpc_config`: Optional. Mapping of additional parameters and values for `slurm.conf`. Note these will override any included in `templates/slurm.conf.j2`. +`openhpc_config`: Optional. Mapping of additional parameters and values for +[slurm.conf](https://slurm.schedmd.com/slurm.conf.html). Keys are slurm.conf +parameter names and values are lists or strings as appropriate. This can be +used to supplement or override the template defaults. Templated parameters can +also be removed by setting the value to the literal string `'omit'` - note +that this is *not the same* as the Ansible `omit` [special variable](https://docs.ansible.com/ansible/latest/reference_appendices/special_variables.html#term-omit). `openhpc_ram_multiplier`: Optional, default `0.95`. Multiplier used in the calculation: `total_memory * openhpc_ram_multiplier` when setting `RealMemory` for the partition in slurm.conf. Can be overriden on a per partition basis using `openhpc_slurm_partitions.ram_multiplier`. Has no effect if `openhpc_slurm_partitions.ram_mb` is set. 
diff --git a/defaults/main.yml b/defaults/main.yml index ea91c75..94ba868 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -12,7 +12,36 @@ openhpc_packages: openhpc_resume_timeout: 300 openhpc_retry_delay: 10 openhpc_job_maxtime: '60-0' # quote this to avoid ansible converting some formats to seconds, which is interpreted as minutes by Slurm -openhpc_config: "{{ openhpc_extra_config | default({}) }}" +openhpc_default_config: + # This only defines values which are not Slurm defaults + SlurmctldHost: "{{ openhpc_slurm_control_host }}{% if openhpc_slurm_control_host_address is defined %}({{ openhpc_slurm_control_host_address }}){% endif %}" + ProctrackType: proctrack/linuxproc # TODO: really want cgroup but needs cgroup.conf and workaround for CI + SlurmdSpoolDir: /var/spool/slurm # NB: not OpenHPC default! + SlurmUser: slurm + StateSaveLocation: "{{ openhpc_state_save_location }}" + SlurmctldTimeout: 300 + SchedulerType: sched/backfill + SelectType: select/cons_tres + SelectTypeParameters: CR_Core + PriorityWeightPartition: 1000 + PreemptType: preempt/partition_prio + PreemptMode: SUSPEND,GANG + AccountingStoragePass: "{{ openhpc_slurm_accounting_storage_pass | default('omit') }}" + AccountingStorageHost: "{{ openhpc_slurm_accounting_storage_host }}" + AccountingStoragePort: "{{ openhpc_slurm_accounting_storage_port }}" + AccountingStorageType: "{{ openhpc_slurm_accounting_storage_type }}" + AccountingStorageUser: "{{ openhpc_slurm_accounting_storage_user }}" + JobCompLoc: "{{ openhpc_slurm_job_comp_loc }}" + JobCompType: "{{ openhpc_slurm_job_comp_type }}" + JobAcctGatherFrequency: "{{ openhpc_slurm_job_acct_gather_frequency }}" + JobAcctGatherType: "{{ openhpc_slurm_job_acct_gather_type }}" + SlurmctldSyslogDebug: info + SlurmdSyslogDebug: info + PropagateResourceLimitsExcept: MEMLOCK + Epilog: /etc/slurm/slurm.epilog.clean + ReturnToService: 2 + +openhpc_config: {} openhpc_gres_template: gres.conf.j2 openhpc_slurm_configless: "{{ 'enable_configless' in 
openhpc_config.get('SlurmctldParameters', []) }}" diff --git a/filter_plugins/slurm_conf.py b/filter_plugins/slurm_conf.py index 3f0ba56..9c38638 100644 --- a/filter_plugins/slurm_conf.py +++ b/filter_plugins/slurm_conf.py @@ -22,16 +22,6 @@ # Pattern to match a hostname with numerical ending pattern = re.compile("^(.*\D(?=\d))(\d+)$") -def _get_hostvar(context, var_name, inventory_hostname=None): - if inventory_hostname is None: - namespace = context - else: - if inventory_hostname not in context['hostvars']: - raise errors.AnsibleFilterError( - "Inventory hostname '%s' not in hostvars" % inventory_hostname) - namespace = context["hostvars"][inventory_hostname] - return namespace.get(var_name) - def hostlist_expression(hosts): """ Group hostnames using Slurm's hostlist expression format. diff --git a/tasks/runtime.yml b/tasks/runtime.yml index a19aa89..b08a451 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -70,43 +70,9 @@ notify: Restart slurmdbd service when: openhpc_enable.database | default(false) | bool -- name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other - ansible.builtin.tempfile: - register: _slurm_conf_tmpfile - delegate_to: localhost - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool - changed_when: false # so molecule doesn't fail - become: no - -- name: Template basic slurm.conf +- name: Template slurm.conf template: src: slurm.conf.j2 - dest: "{{ _slurm_conf_tmpfile.path }}" - lstrip_blocks: true - mode: 0644 - delegate_to: localhost - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool - changed_when: false # so molecule doesn't fail - become: no - -- name: Customise slurm.conf - community.general.ini_file: - path: "{{ _slurm_conf_tmpfile.path }}" - option: "{{ item.key }}" - section: '' - value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}" - no_extra_spaces: 
true - create: no - mode: 0644 - loop: "{{ openhpc_config | dict2items }}" - delegate_to: localhost - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool - changed_when: false # so molecule doesn't fail - become: no - -- name: Create slurm.conf - copy: - src: "{{ _slurm_conf_tmpfile.path }}" dest: /etc/slurm/slurm.conf owner: root group: root diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2 index 3d29994..3ffeff5 100644 --- a/templates/slurm.conf.j2 +++ b/templates/slurm.conf.j2 @@ -1,140 +1,18 @@ -# -# Example slurm.conf file. Please run configurator.html -# (in doc/html) to build a configuration file customized -# for your environment. -# -# -# slurm.conf file generated by configurator.html. -# -# See the slurm.conf man page for more information. -# ClusterName={{ openhpc_cluster_name }} -SlurmctldHost={{ openhpc_slurm_control_host }}{% if openhpc_slurm_control_host_address is defined %}({{ openhpc_slurm_control_host_address }}){% endif %} -#DisableRootJobs=NO -#EnforcePartLimits=NO -#EpilogSlurmctld= -#FirstJobId=1 -#MaxJobId=67043328 -#GresTypes= -#GroupUpdateForce=0 -#GroupUpdateTime=600 -#JobFileAppend=0 -#JobRequeue=1 -#JobSubmitPlugins=lua -#KillOnBadExit=0 -#LaunchType=launch/slurm -#Licenses=foo*4,bar -#MailProg=/bin/mail -#MaxJobCount=10000 -#MaxStepCount=40000 -#MaxTasksPerNode=512 -MpiDefault=none -#MpiParams=ports=#-# -#PluginDir= -#PlugStackConfig= -#PrivateData=jobs -ProctrackType=proctrack/linuxproc # TODO: really want cgroup but needs cgroup.conf and workaround for CI -#Prolog= -#PrologFlags= -#PrologSlurmctld= -#PropagatePrioProcess=0 -#PropagateResourceLimits= -#PropagateResourceLimitsExcept= -#RebootProgram= -SlurmctldPidFile=/var/run/slurmctld.pid -SlurmctldPort=6817 -SlurmdPidFile=/var/run/slurmd.pid -SlurmdPort=6818 -SlurmdSpoolDir=/var/spool/slurm # NB: not OpenHPC default! 
-SlurmUser=slurm -#SlurmdUser=root -#SrunEpilog= -#SrunProlog= -StateSaveLocation={{ openhpc_state_save_location }} -SwitchType=switch/none -#TaskEpilog= -#TaskPlugin=task/affinity -#TaskProlog= -#TopologyPlugin=topology/tree -#TmpFS=/tmp -#TrackWCKey=no -#TreeWidth= -#UnkillableStepProgram= -#UsePAM=0 -# -# -# TIMERS -#BatchStartTimeout=10 -#CompleteWait=0 -#EpilogMsgTime=2000 -#GetEnvTimeout=2 -#HealthCheckInterval=0 -#HealthCheckProgram= -InactiveLimit=0 -KillWait=30 -#MessageTimeout=10 -#ResvOverRun=0 -MinJobAge=300 -#OverTimeLimit=0 -SlurmctldTimeout=300 -SlurmdTimeout=300 -#UnkillableStepTimeout=60 -#VSizeFactor=0 -Waittime=0 -# -# -# SCHEDULING -#DefMemPerCPU=0 -#MaxMemPerCPU=0 -#SchedulerTimeSlice=30 -SchedulerType=sched/backfill -SelectType=select/cons_tres -SelectTypeParameters=CR_Core -# -# -# JOB PRIORITY -#PriorityFlags= -PriorityType=priority/multifactor -#PriorityDecayHalfLife= -#PriorityCalcPeriod= -#PriorityFavorSmall= -#PriorityMaxAge= -#PriorityUsageResetPeriod= -#PriorityWeightAge= -#PriorityWeightFairshare= -#PriorityWeightJobSize= -PriorityWeightPartition=1000 -#PriorityWeightQOS= -PreemptType=preempt/partition_prio -PreemptMode=SUSPEND,GANG -# -# LOGGING AND ACCOUNTING -#AccountingStorageEnforce=0 -AccountingStorageHost={{ openhpc_slurm_accounting_storage_host }} -{% if openhpc_slurm_accounting_storage_pass | default(false, true) %} -AccountingStoragePass={{ openhpc_slurm_accounting_storage_pass }} -{% endif %} -AccountingStoragePort={{ openhpc_slurm_accounting_storage_port }} -AccountingStorageType={{ openhpc_slurm_accounting_storage_type }} -AccountingStorageUser={{ openhpc_slurm_accounting_storage_user }} -#AccountingStoreFlags= -#JobCompHost= -JobCompLoc={{ openhpc_slurm_job_comp_loc }} -#JobCompPass= -#JobCompPort= -JobCompType={{ openhpc_slurm_job_comp_type }} -#JobCompUser= -#JobContainerType=job_container/none -JobAcctGatherFrequency={{ openhpc_slurm_job_acct_gather_frequency }} -JobAcctGatherType={{ openhpc_slurm_job_acct_gather_type 
}} +# PARAMETERS +{% for k, v in openhpc_default_config | combine(openhpc_config) | items %} +{% if v != "omit" %}{# allow removing items by setting key: 'omit' (the literal string, not the Ansible omit variable) #} +{% if k != 'SlurmctldParameters' %}{# handled separately due to openhpc_slurm_configless #} +{{ k }}={{ v | join(',') if (v is sequence and v is not string) else v }} +{% endif %} +{% endif %} +{% endfor %} -# By default, SLURM will log to syslog, which is what we want -SlurmctldSyslogDebug=info -SlurmdSyslogDebug=info -#SlurmSchedLogFile= -#SlurmSchedLogLevel= -#DebugFlags= +{% set slurmctldparameters = ((openhpc_config.get('SlurmctldParameters', []) + (['enable_configless'] if openhpc_slurm_configless | bool else [])) | unique) %} +{% if slurmctldparameters | length > 0 %} +SlurmctldParameters={{ slurmctldparameters | join(',') }} +{% endif %} # LOGIN-ONLY NODES # Define slurmd nodes not in partitions for login-only nodes in "configless" mode: @@ -142,8 +20,6 @@ SlurmdSyslogDebug=info NodeName={{ node }} {% endfor %}{% endif %} -PropagateResourceLimitsExcept=MEMLOCK -Epilog=/etc/slurm/slurm.epilog.clean # COMPUTE NODES {% for nodegroup in openhpc_nodegroups %} @@ -183,8 +59,3 @@ PartitionName={{partition.name}} {{ '' -}} Nodes={{ partition.get('nodegroups', [partition.name]) | map('regex_replace', '^', 'nodegroup_') | join(',') }} {{ '' -}} {{ partition.partition_params | default({}) | dict2parameters }} {% endfor %}{# openhpc_partitions #} - -{% if openhpc_slurm_configless | bool %}SlurmctldParameters=enable_configless{% endif %} - - -ReturnToService=2