Skip to content

Commit ea3155a

Browse files
authored
Merge pull request #325 from stackhpc/feat/caas
Merge caas slurm appliance into slurm appliance
2 parents 6f31af4 + 268a3bc commit ea3155a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+1595
-43
lines changed

ansible.cfg

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Only used for Azimuth running the caas environment
2+
[defaults]
3+
any_errors_fatal = True
4+
gathering = smart
5+
forks = 30
6+
host_key_checking = False
7+
remote_tmp = /tmp
8+
collections_path = ansible/collections
9+
roles_path = ansible/roles
10+
filter_plugins = ansible/filter_plugins
11+
callbacks_enabled = ansible.posix.profile_tasks
12+
13+
[ssh_connection]
14+
ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
15+
pipelining = True
16+
# This is important because we are using one of the hosts in the play as a jump host
17+
# This ensures that if the proxy connection is interrupted, rendering the other hosts
18+
# unreachable, the connection is retried instead of failing the entire play
19+
retries = 10

ansible/.gitignore

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@ roles/*
2828
!roles/firewalld/**
2929
!roles/etc_hosts/
3030
!roles/etc_hosts/**
31-
!roles/cloud_init/
32-
!roles/cloud_init/**
3331
!roles/mysql/
3432
!roles/mysql/**
3533
!roles/systemd/
@@ -44,3 +42,16 @@ roles/*
4442
!roles/resolv_conf/**
4543
!roles/cve-2023-41914
4644
!roles/cve-2023-41914/**
45+
!roles/cluster_infra/
46+
!roles/cluster_infra/**
47+
!roles/image_build_infra/
48+
!roles/image_build_infra/**
49+
!roles/persist_openhpc_secrets/
50+
!roles/persist_openhpc_secrets/**
51+
!roles/zenith_proxy/
52+
!roles/zenith_proxy/**
53+
!roles/image_build/
54+
!roles/image_build/**
55+
!roles/persist_hostkeys/
56+
!roles/persist_hostkeys/**
57+
!roles/requirements.yml

ansible/adhoc/template-cloud-init.yml

Lines changed: 0 additions & 9 deletions
This file was deleted.

ansible/bootstrap.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@
114114
name: fail2ban
115115

116116
- name: Setup podman
117+
gather_facts: false
117118
hosts: podman
118119
tags: podman
119120
tasks:

ansible/noop.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@
66

77
- hosts: localhost
88
gather_facts: false
9-
tasks: []
9+
tasks: []
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
cluster_deploy_ssh_keys_extra: []
2+
3+
# List of hw_scsi_models that result in block devices presenting as /dev/sdX
4+
# rather than /dev/vdX
5+
scsi_models:
6+
# Ceph [https://docs.ceph.com/en/quincy/rbd/rbd-openstack/#image-properties]
7+
- virtio-scsi
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
- debug:
2+
msg: |
3+
terraform_backend_type: {{ terraform_backend_type }}
4+
terraform_state: {{ terraform_state }}
5+
cluster_upgrade_system_packages: {{ cluster_upgrade_system_packages | default('undefined') }}
6+
7+
# We need to convert the floating IP id to an address for Terraform
8+
# if we have cluster_floating_ip, otherwise assume that we're
9+
# assigning the FIP in Terraform and that it will be available in
10+
# outputs.cluster_gateway_ip.
11+
- block:
12+
- name: Look up floating IP
13+
include_role:
14+
name: stackhpc.terraform.infra
15+
tasks_from: lookup_floating_ip
16+
vars:
17+
os_floating_ip_id: "{{ cluster_floating_ip }}"
18+
19+
- name: Set floating IP address fact
20+
set_fact:
21+
cluster_floating_ip_address: "{{ os_floating_ip_info.floating_ip_address }}"
22+
when: cluster_floating_ip is defined
23+
24+
- name: Install Terraform binary
25+
include_role:
26+
name: stackhpc.terraform.install
27+
28+
- name: Make Terraform project directory
29+
file:
30+
path: "{{ terraform_project_path }}"
31+
state: directory
32+
33+
- name: Write backend configuration
34+
copy:
35+
content: |
36+
terraform {
37+
backend "{{ terraform_backend_type }}" { }
38+
}
39+
dest: "{{ terraform_project_path }}/backend.tf"
40+
41+
# Patching in this appliance is implemented as a switch to a new base image
42+
# So unless explicitly patching, we want to use the same image as last time
43+
# To do this, we query the previous Terraform state before updating
44+
- block:
45+
- name: Get previous Terraform state
46+
stackhpc.terraform.terraform_output:
47+
binary_path: "{{ terraform_binary_path }}"
48+
project_path: "{{ terraform_project_path }}"
49+
backend_config: "{{ terraform_backend_config }}"
50+
register: cluster_infra_terraform_output
51+
52+
- name: Extract image from Terraform state
53+
set_fact:
54+
cluster_previous_image: "{{ cluster_infra_terraform_output.outputs.cluster_image.value }}"
55+
when: '"cluster_image" in cluster_infra_terraform_output.outputs'
56+
when:
57+
- terraform_state == "present"
58+
- cluster_upgrade_system_packages is not defined or not cluster_upgrade_system_packages
59+
60+
- name: Detect volume device prefix from image metadata
61+
block:
62+
- name: Get image metadata from OpenStack API
63+
openstack.cloud.image_info:
64+
image: "{{ cluster_previous_image | default(cluster_image) }}"
65+
register: cluster_image_info
66+
- name: Check only single image found
67+
assert:
68+
that: cluster_image_info.images | length == 1
69+
fail_msg: "Multiple images found for 'cluster_image' {{ cluster_image }}"
70+
- name: Set volume_device_prefix fact
71+
set_fact:
72+
block_device_prefix: >-
73+
{{
74+
'sd' if (cluster_image_info.images | first).hw_scsi_model is defined and
75+
(cluster_image_info.images | first).hw_scsi_model in scsi_models
76+
else 'vd'
77+
}}
78+
# Only run when block_device_prefix isn't set as an extravar
79+
when:
80+
- block_device_prefix is not defined
81+
- cluster_image is defined
82+
83+
- name: Template Terraform files into project directory
84+
template:
85+
src: >-
86+
{{
87+
"{}{}.j2".format(
88+
(
89+
cluster_terraform_template_dir ~ "/"
90+
if cluster_terraform_template_dir is defined
91+
else ""
92+
),
93+
item
94+
)
95+
}}
96+
dest: "{{ terraform_project_path }}/{{ item }}"
97+
loop:
98+
- outputs.tf
99+
- providers.tf
100+
- resources.tf
101+
102+
- name: Provision infrastructure
103+
include_role:
104+
name: stackhpc.terraform.infra
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
output "cluster_gateway_ip" {
2+
description = "The IP address of the gateway used to contact the cluster nodes"
3+
value = openstack_compute_floatingip_associate_v2.login_floatingip_assoc.floating_ip
4+
}
5+
6+
{% if cluster_ssh_private_key_file is not defined %}
7+
output "cluster_ssh_private_key" {
8+
description = "The private component of the keypair generated on cluster provision"
9+
value = openstack_compute_keypair_v2.cluster_keypair.private_key
10+
sensitive = true
11+
}
12+
{% endif %}
13+
14+
output "cluster_nodes" {
15+
description = "A list of the nodes in the cluster from which an Ansible inventory will be populated"
16+
value = concat(
17+
[
18+
{
19+
name = openstack_compute_instance_v2.login.name
20+
ip = openstack_compute_instance_v2.login.network[0].fixed_ip_v4
21+
groups = ["login", "{{ cluster_name }}_login"],
22+
facts = {
23+
openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
24+
}
25+
},
26+
{
27+
name = openstack_compute_instance_v2.control.name
28+
ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
29+
groups = ["control", "{{ cluster_name }}_control"],
30+
facts = {
31+
openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
32+
}
33+
}
34+
],
35+
{% for partition in openhpc_slurm_partitions %}
36+
[
37+
for compute in openstack_compute_instance_v2.{{ partition.name }}: {
38+
name = compute.name
39+
ip = compute.network[0].fixed_ip_v4
40+
groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"],
41+
facts = {
42+
openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
43+
}
44+
}
45+
]{{ ',' if not loop.last }}
46+
{% endfor %}
47+
)
48+
}
49+
50+
output "cluster_image" {
51+
description = "The id of the image used to build the cluster nodes"
52+
value = "{{ cluster_previous_image | default(cluster_image) }}"
53+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
terraform {
2+
required_version = ">= 0.14"
3+
4+
# We need the OpenStack provider
5+
required_providers {
6+
openstack = {
7+
source = "terraform-provider-openstack/openstack"
8+
}
9+
}
10+
}

0 commit comments

Comments
 (0)