diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index cf23fa526..4073b9657 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -58,9 +58,9 @@ name: mysql tasks_from: install.yml - name: OpenHPC - import_role: + include_role: name: stackhpc.openhpc - tasks_from: install.yml + tasks_from: "install-{{ openhpc_install_type }}.yml" - name: Include distribution variables for osc.ood include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky/8.yml" diff --git a/ansible/roles/filebeat/tasks/install.yml b/ansible/roles/filebeat/tasks/install.yml index 8e64722ec..6514e3028 100644 --- a/ansible/roles/filebeat/tasks/install.yml +++ b/ansible/roles/filebeat/tasks/install.yml @@ -15,3 +15,4 @@ - name: Reload filebeat unit file command: systemctl daemon-reload when: _filebeat_unit.changed + become: true diff --git a/ansible/roles/hpctests/README.md b/ansible/roles/hpctests/README.md index ee37791ec..c6a28d26b 100644 --- a/ansible/roles/hpctests/README.md +++ b/ansible/roles/hpctests/README.md @@ -38,6 +38,8 @@ The following variables should not generally be changed: - `hpctests_pingpong_plot`: Whether to plot pingpong results. Default `yes`. - `hpctests_hpl_modules`: As above but for hpl tests. - `hpctests_hpl_version`: Version of HPL +- `hpctests_extra_paths`: List of additional paths to add to $PATH in `pingpong` and `pingmatrix` sbatch scripts. +- `hpctests_pingpong_command`: Command to use to run IMB-MPI1 pingpong. 
Dependencies ------------ diff --git a/ansible/roles/hpctests/defaults/main.yml b/ansible/roles/hpctests/defaults/main.yml index 280fd454e..08831b7c2 100644 --- a/ansible/roles/hpctests/defaults/main.yml +++ b/ansible/roles/hpctests/defaults/main.yml @@ -1,7 +1,9 @@ --- hpctests_rootdir: +hpctests_extra_paths: [] hpctests_pingmatrix_modules: [gnu12 openmpi4] hpctests_pingpong_modules: [gnu12 openmpi4 imb] +hpctests_pingpong_command: 'mpirun IMB-MPI1 pingpong' # NB 'srun --mpi=pmi2 IMB-MPI1 pingpong' doesn't work in ohpc v2.1 hpctests_pingpong_plot: yes hpctests_hpl_modules: [gnu12 openmpi4 openblas] hpctests_outdir: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hpctests" diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 index 17fb3fd6a..26b4de6c2 100644 --- a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 +++ b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 @@ -12,7 +12,8 @@ export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }} echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST echo SLURM_JOB_ID: $SLURM_JOB_ID echo UCX_NET_DEVICES: $UCX_NET_DEVICES -module load {{ hpctests_pingmatrix_modules | join(' ' ) }} +{% if hpctests_pingmatrix_modules %}module load {{ hpctests_pingmatrix_modules | join(' ' ) }}{% endif %} +{% if hpctests_extra_paths %}export PATH={{ hpctests_extra_paths | join(':') }}:$PATH{% endif %} mpicc -o nxnlatbw mpi_nxnlatbw.c mpirun nxnlatbw diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 index e74e52539..ae7baa45a 100644 --- a/ansible/roles/hpctests/templates/pingpong.sh.j2 +++ b/ansible/roles/hpctests/templates/pingpong.sh.j2 @@ -12,7 +12,7 @@ export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }} echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST echo SLURM_JOB_ID: $SLURM_JOB_ID echo UCX_NET_DEVICES: $UCX_NET_DEVICES -module load {{ hpctests_pingpong_modules | join(' ' ) }} +{% if hpctests_pingpong_modules %}module load {{ 
hpctests_pingpong_modules | join(' ' ) }}{% endif %} +{% if hpctests_extra_paths %}export PATH={{ hpctests_extra_paths | join(':') }}:$PATH{% endif %} -#srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1 -mpirun IMB-MPI1 pingpong +{{ hpctests_pingpong_command }} diff --git a/ansible/roles/mysql/tasks/install.yml b/ansible/roles/mysql/tasks/install.yml index 4427b7d18..4ed5d30ba 100644 --- a/ansible/roles/mysql/tasks/install.yml +++ b/ansible/roles/mysql/tasks/install.yml @@ -1,6 +1,12 @@ +- name: Install pip + dnf: + name: python3-pip + - name: Install python mysql client pip: - name: pymysql + name: + - pymysql + - cryptography state: present - name: Create systemd mysql container unit file @@ -11,6 +17,6 @@ - name: Pull container image containers.podman.podman_image: - name: "mysql" + name: docker.io/library/mysql tag: "{{ mysql_tag }}" become_user: "{{ mysql_podman_user }}" diff --git a/ansible/roles/mysql/templates/mysql.service.j2 b/ansible/roles/mysql/templates/mysql.service.j2 index 3b531cd3f..794035a8e 100644 --- a/ansible/roles/mysql/templates/mysql.service.j2 +++ b/ansible/roles/mysql/templates/mysql.service.j2 @@ -26,7 +26,7 @@ ExecStart=/usr/bin/podman run \ --volume {{ mysql_datadir }}:/var/lib/mysql:U \ --publish 3306:3306 \ --env MYSQL_ROOT_PASSWORD=${MYSQL_INITIAL_ROOT_PASSWORD} \ - mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \ + docker.io/library/mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \ --{{ opt }}{% endfor %} ExecStop=/usr/bin/podman stop --ignore mysql -t 10 diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index 34e1ac223..86184f13c 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -10,7 +10,7 @@ - include_role: name: osc.ood tasks_from: install-package.yml - vars_from: Rocky/8.yml + vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" public: yes # Expose the vars from this role to 
the rest of the play # can't set vars: from a dict hence the workaround above diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index bde13c383..5f403bf86 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -17,6 +17,7 @@ - turbovnc-3.0.1 - nmap-ncat - python3.9 + - dbus-x11 - name: Install Xfce desktop tags: install diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml index 81547e5a0..9a0ffd361 100644 --- a/ansible/roles/opensearch/tasks/install.yml +++ b/ansible/roles/opensearch/tasks/install.yml @@ -16,7 +16,7 @@ - name: Pull container image containers.podman.podman_image: - name: "opensearchproject/opensearch" + name: docker.io/opensearchproject/opensearch tag: "{{ opensearch_version }}" become_user: "{{ opensearch_podman_user }}" diff --git a/ansible/roles/opensearch/templates/opensearch.service.j2 b/ansible/roles/opensearch/templates/opensearch.service.j2 index 6951bafc0..2d98305eb 100644 --- a/ansible/roles/opensearch/templates/opensearch.service.j2 +++ b/ansible/roles/opensearch/templates/opensearch.service.j2 @@ -29,7 +29,7 @@ ExecStart=/usr/bin/podman run \ --env bootstrap.memory_lock=true \ --env "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" \ --env DISABLE_INSTALL_DEMO_CONFIG=true \ - opensearchproject/opensearch:{{ opensearch_version }} + docker.io/opensearchproject/opensearch:{{ opensearch_version }} ExecStop=/usr/bin/podman stop --ignore opensearch -t 10 # note for some reason this returns status=143 which makes systemd show the unit as failed, not stopped ExecStopPost=/usr/bin/podman rm --ignore -f opensearch diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 080c74dcb..4144db5f4 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -25,8 +25,14 @@ tags: - openhpc tasks: - - import_role: + - include_role: + name: stackhpc.openhpc + tasks_from: "install-{{ 
openhpc_install_type }}.yml" + tags: install + - include_role: name: stackhpc.openhpc + tasks_from: runtime.yml + tags: runtime - name: Set locked memory limits on user-facing nodes hosts: diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index 2b1bbfb39..73d6d8c99 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -4,7 +4,7 @@ volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" +fatimage_source_image_name = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] diff --git a/environments/.stackhpc/SMS.pkrvars.hcl b/environments/.stackhpc/SMS.pkrvars.hcl index cd9fe589a..f3678b23f 100644 --- a/environments/.stackhpc/SMS.pkrvars.hcl +++ b/environments/.stackhpc/SMS.pkrvars.hcl @@ -1,7 +1,7 @@ flavor = "general.v1.tiny" networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # stackhpc-ipv4-geneve source_image_name = "openhpc-230503-0944-bf8c3f63" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" +fatimage_source_image_name = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] diff --git a/environments/.stackhpc/hooks/post-bootstrap.yml b/environments/.stackhpc/hooks/post-bootstrap.yml index fe783e469..df3902698 100644 --- a/environments/.stackhpc/hooks/post-bootstrap.yml +++ b/environments/.stackhpc/hooks/post-bootstrap.yml @@ 
-3,14 +3,15 @@ gather_facts: false tags: podman tasks: - - name: Configure container image registry for unqualified searches to avoid docker.io ratelimits + - name: Configure container image registry to avoid docker.io ratelimits copy: - dest: /etc/containers/registries.conf.d/003-arcus-unqualfied-overrides.conf + dest: /etc/containers/registries.conf.d/003-arcus-mirror.conf content: | - unqualified-search-registries = ['{{ podman_registry_address | split('/') | first }}', 'registry.access.redhat.com', 'registry.redhat.io', 'docker.io'] - [[registry]] - prefix = "{{ podman_registry_address }}" + location="docker.io/library/" + prefix="docker.io/library/" + + [[registry.mirror]] location = "{{ podman_registry_address }}" insecure = true when: "ci_cloud == 'ARCUS'" diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 0ab3be5ee..e7560119a 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -13,8 +13,8 @@ variable "cluster_name" { variable "cluster_image" { description = "single image for all cluster nodes - a convenience for CI" type = string - default = "openhpc-240116-1156-aa8dba7d" # https://github.com/stackhpc/ansible-slurm-appliance/pull/351 - # default = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" + # default = "openhpc-240116-1156-aa8dba7d" # https://github.com/stackhpc/ansible-slurm-appliance/pull/351 + default = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" # TODO: create packer build } variable "cluster_net" {} @@ -62,8 +62,6 @@ module "cluster" { compute_nodes = { compute-0: "small" compute-1: "small" - compute-2: "extra" - compute-3: "extra" } volume_backed_instances = var.volume_backed_instances diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 23448c80d..91db4dc3a 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ 
b/environments/common/inventory/group_vars/all/defaults.yml @@ -58,6 +58,7 @@ appliances_local_users_default: uid: 981 home: "{{ prometheus_db_dir }}" shell: /usr/sbin/nologin + system: true enable: "{{ 'prometheus' in group_names }}" - group: @@ -69,6 +70,7 @@ appliances_local_users_default: uid: 984 home: /usr/share/grafana shell: /sbin/nologin + system: true enable: "{{ 'grafana' in group_names }}" # Overide this to add extra users whilst keeping the defaults. diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 1cb963657..d3f4964af 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -2,7 +2,7 @@ # See: https://github.com/stackhpc/ansible-role-openhpc # for variable definitions - +openhpc_install_type: ohpc # use "ohpc" for an OpenHPC-based system or "generic" if providing binaries openhpc_enable: control: "{{ inventory_hostname in groups['control'] }}" batch: "{{ inventory_hostname in groups['compute'] }}" diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index b7bdfdabc..18e741ce7 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -49,6 +49,8 @@ openondemand_clusters: module purge export PATH=/opt/TurboVNC/bin:$PATH + # avoid "Failed to create secure directory (/run/user/*/pulse)" + export XDG_RUNTIME_DIR="$TMPDIR/xdg_runtime" # Workaround to avoid "Unable to contact settings server" when # lauching xfce4-session diff --git a/requirements.yml b/requirements.yml index 3587966aa..59e717c67 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.23.0 # 
https://github.com/stackhpc/ansible-role-openhpc/pull/165 + version: feat/no-ohpc # https://github.com/stackhpc/ansible-role-openhpc/pull/162 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc