# Exactly as for slurm-infra.yml, but split out to allow for separate manila and non-manila home appliances
name: "slurm-manila-home"
label: "Slurm (CephFS home)"
description: >-
  Batch cluster running the Slurm workload manager, the Open
  OnDemand web interface, and custom monitoring.

  This version uses CephFS for home directories, which are deleted
  when the platform is deleted.
logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png

parameters:
  - name: cluster_floating_ip
    label: External IP
    description: The external IP to use for the login node.
    kind: cloud.ip
    immutable: true

  - name: login_flavor
    label: Login node size
    description: The size to use for the login node.
    kind: cloud.size
    immutable: true
    options:
      min_ram: 2048
      min_disk: 20

  - name: control_flavor
    label: Control node size
    description: The size to use for the control node.
    kind: cloud.size
    immutable: true
    options:
      min_ram: 2048
      min_disk: 20

  - name: compute_count
    label: Compute node count
    description: The number of compute nodes in the cluster.
    kind: integer
    options:
      min: 1
    default: 3

  - name: compute_flavor
    label: Compute node size
    description: The size to use for the compute nodes.
    kind: cloud.size
    immutable: true
    options:
      count_parameter: compute_count
      min_ram: 2048
      min_disk: 20

  - name: home_volume_size
    label: Home share size (GB)
    description: The size of the share to use for home directories.
    kind: integer
    immutable: true
    options:
      min: 10
    default: 100

  - name: state_volume_size
    label: State volume size (GB)
    description: |
      The size of the state volume, used to hold and persist important files and data. Of
      this volume, 10 GB is set aside for cluster state and the remaining space is used
      to store cluster metrics.

      The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be
      discarded to ensure that the database does not grow larger than this volume.
    kind: cloud.volume_size
    immutable: true
    options:
      min: 20
    default: 20

  - name: cluster_run_validation
    label: Post-configuration validation
    description: >-
      If selected, post-configuration jobs will be executed to validate the core functionality
      of the cluster when it is re-configured.
    kind: boolean
    required: false
    default: true
    options:
      checkboxLabel: Run post-configuration validation?

usage_template: |-
  # Accessing the cluster using Open OnDemand

  [Open OnDemand](https://openondemand.org/) is a web portal for managing HPC jobs, including graphical
  environments such as [Jupyter Notebooks](https://jupyter.org/).

  {% if cluster.outputs.openondemand_url %}
  The Open OnDemand portal for this cluster is available at
  [{{ cluster.outputs.openondemand_url.slice(8) }}]({{ cluster.outputs.openondemand_url }}).

  Enter the username `azimuth` and password `{{ cluster.outputs.azimuth_user_password }}` when prompted.
  {% else %}
  The Open OnDemand portal for this cluster can be accessed from the services list.
  {% endif %}

  # Accessing the cluster using SSH

  The cluster can be accessed over SSH via the external IP. The SSH public key of the user that
  deployed the cluster is injected into the `azimuth` user:

  ```
  $ ssh azimuth@{{ cluster.outputs.cluster_access_ip | default('[cluster ip]') }}
  [azimuth@{{ cluster.name }}-login-0 ~]$ sinfo
  PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
  compute*     up 60-00:00:0  {{ "%3s" | format(cluster.parameter_values.compute_count) }}   idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}]
  ```
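
  Jobs are submitted with the standard Slurm commands. As a quick, illustrative check, `srun` can
  run a simple command once on each compute node (the output will vary with your chosen node count):

  ```
  # runs 'hostname' once on every compute node in the default partition
  [azimuth@{{ cluster.name }}-login-0 ~]$ srun --nodes={{ cluster.parameter_values.compute_count }} hostname
  ```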

  The `rocky` user can be accessed the same way and has passwordless `sudo` enabled.

  SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`.
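
  For example, assuming the new user's public key has been copied to the login node as
  `newuser.pub` (an illustrative filename), it can be appended from the `azimuth` account:

  ```
  # 'newuser.pub' is a placeholder for the new user's public key file
  [azimuth@{{ cluster.name }}-login-0 ~]$ cat newuser.pub >> ~/.ssh/authorized_keys
  ```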

services:
  - name: ood
    label: Open OnDemand
    icon_url: https://github.com/stackhpc/ansible-slurm-appliance/raw/main/environments/.caas/assets/ood-icon.png
  - name: monitoring
    label: Monitoring
    icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png