diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index c8bb9b06f..c60507aae 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -31,8 +31,10 @@ jobs: env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_id }} + CI_CLUSTER_NAME: slurmci-${{ matrix.os_version }}-${{ github.run_id }} CI_CLOUD: ${{ vars.CI_CLOUD }} + ANSIBLE_INVENTORY: environments/common/inventory,environments/.caas/inventory + DISTRO_VERSION: ${{ matrix.os_version }} steps: - uses: actions/checkout@v2 @@ -40,115 +42,55 @@ jobs: run: | echo CI_CLOUD: ${{ vars.CI_CLOUD }} - - name: Setup ssh - run: | - set -x - mkdir ~/.ssh - echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa - chmod 0600 ~/.ssh/id_rsa - shell: bash - - - name: Add bastion's ssh key to known_hosts - run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts - shell: bash - + - name: Setup ssh for azimuth user + run: ssh-keygen -t rsa -q -f "$HOME/.ssh/id_rsa" -N "" + - name: Install ansible etc run: dev/setup-env.sh - - name: Install OpenTofu - uses: opentofu/setup-opentofu@v1 - with: - tofu_version: 1.6.2 - - - name: Initialise terraform - run: terraform init - working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform - - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml shell: bash - - name: Setup environment-specific inventory/terraform inputs - run: | - . venv/bin/activate - . 
environments/.stackhpc/activate - ansible-playbook ansible/adhoc/generate-passwords.yml - echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml - env: - TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - - name: Provision nodes using fat image - id: provision_servers + - name: Setup CI-specific extravars (including per-cloud vars) run: | . venv/bin/activate - . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" - env: - TF_VAR_os_version: ${{ matrix.os_version }} + cp environments/.caas/CI-extravars.yml environments/.caas/inventory/group_vars/all/CI-extravars.yml - - name: Delete infrastructure if provisioning failed + - name: Provision and configure cluster, run mpi-based tests and check slurm partitions run: | . venv/bin/activate - . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" - if: failure() && steps.provision_servers.outcome == 'failure' - env: - TF_VAR_os_version: ${{ matrix.os_version }} - - - name: Configure cluster - run: | - . venv/bin/activate - . environments/.stackhpc/activate - ansible all -m wait_for_connection ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Run MPI-based tests - run: | - . venv/bin/activate - . environments/.stackhpc/activate - ansible-playbook -vv ansible/adhoc/hpctests.yml - # - name: Run EESSI tests # run: | # . venv/bin/activate - # . environments/.stackhpc/activate # ansible-playbook -vv ansible/ci/check_eessi.yml - - name: Confirm Open Ondemand is up (via SOCKS proxy) + - name: Confirm Open Ondemand is up (via FIP) run: | . venv/bin/activate - . 
environments/.stackhpc/activate # load ansible variables into shell: ansible-playbook ansible/ci/output_vars.yml \ - -e output_vars_hosts=openondemand \ + -e output_vars_hosts=control \ -e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \ - -e output_vars_items=bastion_ip,bastion_user,openondemand_servername + -e output_vars_items=cluster_gateway_ip,vault_azimuth_user_password source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt - # setup ssh proxying: - sudo apt-get --yes install proxychains - echo proxychains installed - ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip} - echo port 9050 forwarded - # check OOD server returns 200: - statuscode=$(proxychains wget \ + statuscode=$(wget \ --quiet \ --spider \ --server-response \ --no-check-certificate \ - --http-user=testuser \ - --http-password=${TESTUSER_PASSWORD} https://${openondemand_servername} \ + --http-user=azimuth \ + --http-password=${vault_azimuth_user_password} https://${cluster_gateway_ip} \ 2>&1) (echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) - env: - TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} # - name: Build environment-specific compute image # id: packer_build @@ -171,33 +113,28 @@ jobs: - name: Test reimage of login and control nodes (via rebuild adhoc) run: | . venv/bin/activate - . environments/.stackhpc/activate ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down + rm environments/.caas/inventory/cluster_hosts.yml ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage run: | . venv/bin/activate - . environments/.stackhpc/activate ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml - name: Check MPI-based tests are shown in Grafana run: | . venv/bin/activate - . environments/.stackhpc/activate ansible-playbook -vv ansible/ci/check_grafana.yml - name: Delete infrastructure run: | . 
venv/bin/activate - . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" - if: ${{ success() || cancelled() }} - env: - TF_VAR_os_version: ${{ matrix.os_version }} + rm -f environments/.caas/inventory/cluster_hosts.yml + ansible-playbook -v ansible/site.yml -e cluster_state=absent + if: ${{ always() }} # success, failure or cancelled # - name: Delete images # run: | diff --git a/ansible/ci/check_grafana.yml b/ansible/ci/check_grafana.yml index 216cb1ed9..3f4469e02 100644 --- a/ansible/ci/check_grafana.yml +++ b/ansible/ci/check_grafana.yml @@ -2,8 +2,8 @@ # Can't actually check the dashboard programatically so this queries the datasource used by the dashboard instead. - hosts: control # so proxying etc is irrelevant - gather_facts: no - become: no + gather_facts: yes + become: yes tasks: - name: Wait for slurm-stats file to exist (run by cron) ansible.builtin.wait_for: diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index d95c5bb5c..4da8da747 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -21,4 +21,4 @@ vars: expected_sinfo: - - "{{ openhpc_cluster_name }}-compute-[0-1] standard* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name }}-compute-standard-[0-1] standard* up 60-00:00:00 {{ compute_count }} idle" diff --git a/ansible/ci/output_vars.yml b/ansible/ci/output_vars.yml index 0e2bc4c61..b456349ad 100644 --- a/ansible/ci/output_vars.yml +++ b/ansible/ci/output_vars.yml @@ -1,12 +1,13 @@ # Output specific hostvars to a file in a form which can be sourced by bash # NB: obviously the keys and values for the hostvars need to be suitable bash variables - hosts: "{{ output_vars_hosts }}" - gather_facts: no + gather_facts: yes + become: yes tasks: - copy: dest: "{{ output_vars_path }}" content: | {% for item in output_vars_items.split(',') %} - export {{output_vars_prefix | default('') }}{{ item }}={{ lookup('vars', item) 
}} + export {{output_vars_prefix | default('') }}{{ item }}="{{ lookup('vars', item) }}" {% endfor %} delegate_to: localhost diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml new file mode 100644 index 000000000..81830ddb7 --- /dev/null +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -0,0 +1 @@ +cluster_infra_inventory_file: "{{ appliances_environment_root }}/inventory/cluster_hosts.yml" diff --git a/ansible/roles/cluster_infra/tasks/main.yml b/ansible/roles/cluster_infra/tasks/main.yml index 5976b133c..971bb8a87 100644 --- a/ansible/roles/cluster_infra/tasks/main.yml +++ b/ansible/roles/cluster_infra/tasks/main.yml @@ -1,3 +1,18 @@ +- name: Remove inventory hosts file + file: + path: "{{ cluster_infra_inventory_file }}" + state: absent + register: _inventory_file + +- name: Error if inventory hosts file exists + # When creating infra this file must not exist to avoid inventory errors from mismatches + # between this hosts file and in-memory inventory. Note hosts cannot be deleted from + # inventory so ensuring the file does not exist before starting the infra role is the + # only safe approach. 
+ assert: + that: not _inventory_file.changed + fail_msg: "Inventory file has been deleted - rerun the ansible command now" + - debug: msg: | terraform_backend_type: {{ terraform_backend_type }} @@ -60,3 +75,9 @@ - name: Provision infrastructure include_role: name: stackhpc.terraform.infra + +- name: Create inventory hosts file to permit ad-hoc uses + template: + src: cluster_hosts.yml.j2 + dest: "{{ cluster_infra_inventory_file }}" + when: terraform_state != 'absent' diff --git a/ansible/roles/cluster_infra/templates/cluster_hosts.yml.j2 b/ansible/roles/cluster_infra/templates/cluster_hosts.yml.j2 new file mode 100644 index 000000000..8e7eba528 --- /dev/null +++ b/ansible/roles/cluster_infra/templates/cluster_hosts.yml.j2 @@ -0,0 +1,33 @@ +all: + vars: + # TODO: cluster_domain_suffix: + cluster_gateway_ip: "{{ cluster_gateway_ip }}" + ansible_ssh_common_args: "{{ cluster_ssh_common_args }}" + ansible_ssh_private_key_file: "{{ cluster_ssh_private_key_file }}" +control: + hosts: +{% for control in groups['control'] %} + {{ control }}: + ansible_host: {{ hostvars[control].ansible_host }} + # TODO: instance_id: +{% endfor %} + +login: + hosts: +{% for login in groups['login'] %} + {{ login }}: + ansible_host: {{ hostvars[login].ansible_host }} + # TODO: instance_id: +{% endfor %} + +{{ cluster_name }}_standard: + hosts: +{% for compute in groups[cluster_name + '_standard'] %} + {{ compute }}: + ansible_host: {{ hostvars[compute].ansible_host }} + # TODO: instance_id: +{% endfor %} + +compute: + children: + {{ cluster_name }}_standard: diff --git a/environments/.caas/.gitignore b/environments/.caas/.gitignore new file mode 100644 index 000000000..53c95c9b9 --- /dev/null +++ b/environments/.caas/.gitignore @@ -0,0 +1 @@ +inventory/group_vars/all/CI-extravars.yml diff --git a/environments/.caas/CI-extravars.yml b/environments/.caas/CI-extravars.yml new file mode 100644 index 000000000..c31ced405 --- /dev/null +++ b/environments/.caas/CI-extravars.yml @@ -0,0 +1,47 @@ +# 
Variables in this file are normally injected by Azimuth/CaaS. This file is moved into +# the inventory by the CI workflow or the activate script + +# -- Environment lookups -- +# Set either by the workflow for CI or by the activate script for dev/debugging +cluster_name: "{{ lookup('env', 'CI_CLUSTER_NAME') | default('dev') }}" # TODO: potentially want to override this for dev +distro_version: "{{ lookup('env', 'DISTRO_VERSION') | default('RL9') }}" +ci_cloud: "{{ lookup('env', 'CI_CLOUD') }}" # set by GH actions +# -- + +# Use tiny volumes for CI +state_volume_size: 10 +home_volume_size: 10 + +# Likely to have multiple (identical) images with same name in dev due to image build + community image deploys +cluster_image_name: + RL8: openhpc-RL8-240423-1002-4b09ba85 + RL9: openhpc-ofed-RL9-240423-1059-4b09ba85 +cluster_image: "{{ (lookup('pipe', 'openstack image list -f yaml') | from_yaml | selectattr('Name', 'eq', cluster_image_name[distro_version]) | first)['ID'] }}" + +compute_count: 2 +cluster_run_validation: true # enable hpctests +cluster_id: "{{ cluster_name }}" # TODO not sure what this should be +cluster_user_ssh_public_key: "{{ lookup('file', lookup('fileglob', '~/.ssh/*.pub', wantlist=True) | first) }}" + +ci_cloud_vars: # cloud-specific vars: + LEAFCLOUD: + control_flavor_name: ec1.medium # gets down to ~100Mi mem free on deployment + login_flavor_name: en1.xsmall + compute_flavor: en1.xsmall + cluster_external_network: external + use_home_volume_type_fast: true + home_volume_type_fast: unencrypted + # TODO: can't currently select state volume type as unencrypted + ARCUS: + control_flavor_name: vm.ska.cpu.general.eighth + login_flavor_name: vm.ska.cpu.general.small + compute_flavor: vm.ska.cpu.general.small + cluster_external_network: CUDN-Internet + use_home_volume_type_fast: false + +login_flavor_name: "{{ ci_cloud_vars[ci_cloud].login_flavor_name }}" +control_flavor_name: "{{ ci_cloud_vars[ci_cloud].control_flavor_name }}" +compute_flavor: "{{ 
ci_cloud_vars[ci_cloud].compute_flavor }}"
+cluster_external_network: "{{ ci_cloud_vars[ci_cloud].cluster_external_network }}"
+use_home_volume_type_fast: "{{ ci_cloud_vars[ci_cloud].use_home_volume_type_fast }}"
+home_volume_type_fast: "{{ ci_cloud_vars[ci_cloud].home_volume_type_fast }}"
diff --git a/environments/.caas/README.md b/environments/.caas/README.md
index 4a08433b0..de2844b6e 100644
--- a/environments/.caas/README.md
+++ b/environments/.caas/README.md
@@ -1,9 +1,10 @@
 # Caas cluster
 
-Environment for default Azimuth Slurm. This is not intended to be manually deployed.
+Environment for Azimuth CaaS Slurm. This is also used for CI and may be manually deployed
+for debugging and development. It should *not* be used for a non-CaaS Slurm cluster.
 
 Non-standard things for this environment:
-- There is no activate script.
+- The `activate` script is provided *only* for development/debugging.
 - `ansible.cgf` is provided in the repo root, as expected by the caas operator.
 - `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative
   to the runner project directory:
@@ -16,3 +17,12 @@ Non-standard things for this environment:
   Ansible then defines `ansible_inventory_sources` which contains absolute paths,
   and that is used to derive the `appliances_environment_root` and
   `appliances_repository_root`.
+
+It is also used for CI, and may be manually deployed for development/debugging as follows:
+
+    . venv/bin/activate
+    . environments/.caas/activate # NB: CI_CLOUD may need changing
+    ansible-playbook ansible/site.yml #-e cluster_state=absent
+
+Once deployed, or once at least the `cluster_infra` role has finished, individual or ad-hoc
+playbooks may be run as usual.
diff --git a/environments/.caas/activate b/environments/.caas/activate new file mode 100644 index 000000000..93425921e --- /dev/null +++ b/environments/.caas/activate @@ -0,0 +1,12 @@ +export ANSIBLE_INVENTORY=environments/common/inventory,environments/.caas/inventory +echo "Set ANSIBLE_INVENTORY=${ANSIBLE_INVENTORY}" +export OS_CLOUD=openstack +echo "Set OS_CLOUD=${OS_CLOUD} - override if using a non-default clouds.yaml" +export CI_CLOUD=LEAFCLOUD +echo "Set CI_CLOUD=${CI_CLOUD} - override if on another cloud" +export DISTRO_VERSION=RL9 +echo "Set DISTRO_VERSION=${DISTRO_VERSION} - or override for RL8" +cp environments/.caas/CI-extravars.yml environments/.caas/inventory/group_vars/all/CI-extravars.yml +echo "Copied CI extravars into inventory" +export CI_CLUSTER_NAME=dev +echo "Set CI_CLUSTER_NAME=$CI_CLUSTER_NAME - override for multiple dev deployments" diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 05b0255c8..fe8474928 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -3,6 +3,7 @@ # Provision the infrastructure using Terraform - name: Provision infrastructure hosts: openstack + tags: cluster_infra roles: - cluster_infra