Skip to content

Commit

Permalink
Pyxis Installation (#22)
Browse files Browse the repository at this point in the history
* Updated packer AMI building recipe (#17)

* Update aws-ubuntu.pkr.hcl

* initial commit

* removed Docker related lines, moved installs to AMI creation

* changed enroot runtime directory; removed .sqsh download

* More updates

* added enroot.conf; changed comments in post_install.bash

* Moved enroot conf to packer
Updated tests and installing script

* Updated post_install.bash

Co-authored-by: vfdev <[email protected]>
  • Loading branch information
MrA2K2 and vfdev-5 committed Sep 19, 2021
1 parent 15c496c commit 14020be
Show file tree
Hide file tree
Showing 10 changed files with 146 additions and 41 deletions.
9 changes: 3 additions & 6 deletions .github/workflows/ami-install-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,10 @@ jobs:
steps:
- uses: actions/checkout@v2

- name: Install Docker
- name: Install Enroot
run: |
sudo docker run -d -t --name ubuntu_bash -v $PWD/packer:/packer -w /packer ubuntu:20.04
sudo docker exec -u root ubuntu_bash apt-get update
sudo docker exec -u root ubuntu_bash apt-get -y install sudo
sudo docker exec ubuntu_bash /bin/bash install_docker.sh
sudo docker exec ubuntu_bash docker --version
sudo bash ./packer/install_pyxis.bash
ENROOT_RUNTIME_DIR=$HOME/enroot-runtime enroot version
- name: Install Nvidia
run: |
Expand Down
4 changes: 0 additions & 4 deletions .github/workflows/users-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ jobs:
id testuser
grep testuser /shared/.userslist
sudo grep "abc" /shared/home/testuser/.ssh/authorized_keys
cat /etc/group | grep docker | grep testuser
- name: Remove user
run: |
Expand All @@ -39,8 +38,5 @@ jobs:
if grep testuser /shared/.userslist &>/dev/null ; then
exit 1
fi
if cat /etc/group | grep docker | grep testuser &>/dev/null ; then
exit 1
fi
sudo ls -1qA /shared/home/.backup/testuser/ | grep -q .
12 changes: 8 additions & 4 deletions packer/aws-ubuntu.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ packer {

variable "ami_prefix" {
type = string
default = "ubuntu-pcluster-cuda-docker"
default = "ubuntu-pcluster-cuda-enroot-pyxis"
}

variable "ami_version" {
type = string
default = "0.2.1"
default = "0.3.0"
}


Expand Down Expand Up @@ -46,10 +46,14 @@ build {
"source.amazon-ebs.ubuntu"
]
provisioner "shell" {
script = "install_docker.sh"
script = "install_nvidia.sh"
}
provisioner "file" {
source = "enroot.conf"
destination = "/tmp/enroot.conf"
}
provisioner "shell" {
script = "install_nvidia.sh"
script = "install_pyxis.bash"
}
}

71 changes: 71 additions & 0 deletions packer/enroot.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Enroot path settings. Entries that are commented out are not set by this file.
#
# The runtime, cache and data paths below use ${VAR:-default} expansion: each one
# takes its value from the matching ENROOT_*_DIR environment variable when that
# variable is set and non-empty, and otherwise falls back to a per-user
# directory under /tmp keyed by ${UID}.
#ENROOT_LIBRARY_PATH /usr/local/lib/enroot
#ENROOT_SYSCONF_PATH /usr/local/etc/enroot
# Overridable through ENROOT_RUNTIME_DIR.
ENROOT_RUNTIME_PATH ${ENROOT_RUNTIME_DIR:-/tmp/enroot_runtime/${UID}}
#ENROOT_CONFIG_PATH ${XDG_CONFIG_HOME}/enroot
# Overridable through ENROOT_CACHE_DIR.
ENROOT_CACHE_PATH ${ENROOT_CACHE_DIR:-/tmp/enroot_cache/${UID}}
# Overridable through ENROOT_DATA_DIR.
ENROOT_DATA_PATH ${ENROOT_DATA_DIR:-/tmp/enroot_data/${UID}}
#ENROOT_TEMP_PATH ${TMPDIR:-/tmp}

# Gzip program used to uncompress digest layers.
#ENROOT_GZIP_PROGRAM gzip

# Options passed to zstd to compress digest layers.
#ENROOT_ZSTD_OPTIONS -1

# Options passed to mksquashfs to produce container images.
#ENROOT_SQUASH_OPTIONS -comp lzo -noD

# Make the container root filesystem writable by default.
#ENROOT_ROOTFS_WRITABLE no

# Remap the current user to root inside containers by default.
#ENROOT_REMAP_ROOT no

# Maximum number of processors to use for parallel tasks (0 means unlimited).
#ENROOT_MAX_PROCESSORS $(nproc)

# Maximum number of concurrent connections (0 means unlimited).
#ENROOT_MAX_CONNECTIONS 10

# Maximum time in seconds to wait for connections establishment (0 means unlimited).
#ENROOT_CONNECT_TIMEOUT 30

# Maximum time in seconds to wait for network operations to complete (0 means unlimited).
#ENROOT_TRANSFER_TIMEOUT 0

# Number of times network operations should be retried.
#ENROOT_TRANSFER_RETRIES 0

# Use a login shell to run the container initialization.
#ENROOT_LOGIN_SHELL yes

# Allow root to retain his superuser privileges inside containers.
#ENROOT_ALLOW_SUPERUSER no

# Use HTTP for outgoing requests instead of HTTPS (UNSECURE!).
#ENROOT_ALLOW_HTTP no

# Include user-specific configuration inside bundles by default.
#ENROOT_BUNDLE_ALL no

# Generate an embedded checksum inside bundles by default.
#ENROOT_BUNDLE_CHECKSUM no

# Mount the current user's home directory by default.
#ENROOT_MOUNT_HOME no

# Restrict /dev inside the container to a minimal set of devices.
#ENROOT_RESTRICT_DEV no

# Always use --force on command invocations.
#ENROOT_FORCE_OVERRIDE no

# SSL certificates settings:
#SSL_CERT_DIR
#SSL_CERT_FILE

# Proxy settings:
#all_proxy
#no_proxy
#http_proxy
#https_proxy
10 changes: 0 additions & 10 deletions packer/install_docker.sh

This file was deleted.

43 changes: 43 additions & 0 deletions packer/install_pyxis.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#
# Installs enroot v3.3.1 from source and, when the Slurm headers are present
# under /opt/slurm, builds and installs the pyxis Slurm plugin.
#
# Requirements:
#   - enroot.conf must sit in the same directory as this script; it is copied
#     to /usr/local/etc/enroot/enroot.conf.
#   - sudo access and network connectivity (apt repositories, GitHub, NVIDIA).
#
# The sources are cloned into ./enroot and ./pyxis relative to the current
# working directory and removed again after installation.

# -e: stop on the first failing command; -u: reject unset variables;
# pipefail: a failing curl in "curl | sudo ..." aborts the script instead of
# silently feeding an empty/error payload to the right-hand side.
set -euo pipefail

script_folder="$(dirname "$0")"

enroot_dir="./enroot"
pyxis_dir="./pyxis"

# Refs:
# - https://github.com/NVIDIA/enroot/blob/v3.3.1/doc/installation.md
# - https://github.com/NVIDIA/pyxis/wiki/Installation

# Install enroot build and runtime dependencies
sudo apt-get update
sudo apt-get install -y git gcc make libcap2-bin libtool automake zstd
sudo apt-get install -y curl gawk jq squashfs-tools parallel

# Setup Nvidia container runtime package
# -f makes curl exit non-zero on HTTP errors so that an error page is never
# imported as a key or written as an apt source list.
curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | sudo apt-key add -
distribution="$(. /etc/os-release; echo "${ID}${VERSION_ID}")"
curl -fsSL "https://nvidia.github.io/nvidia-container-runtime/${distribution}/nvidia-container-runtime.list" \
  | sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list
sudo apt-get update

sudo apt-get install -y fuse-overlayfs libnvidia-container-tools pigz squashfuse
git clone --recurse-submodules https://github.com/NVIDIA/enroot.git -b v3.3.1 "$enroot_dir"
sudo make --directory="$enroot_dir" install
sudo make --directory="$enroot_dir" setcap
sudo rm -rf -- "$enroot_dir"
sudo apt-get clean

# Use our configuration:
sudo cp -- "$script_folder/enroot.conf" /usr/local/etc/enroot/enroot.conf

# Install pyxis (only possible when the Slurm headers are available)
if [[ -f "/opt/slurm/include/slurm/slurm.h" ]]; then
  git clone https://github.com/NVIDIA/pyxis.git "$pyxis_dir"
  # Expose the Slurm headers on the default include path. Skip when the path
  # already exists so that re-running the script does not abort under set -e;
  # -fn replaces a stale (dangling) symlink instead of failing on it.
  if [[ ! -e /usr/include/slurm ]]; then
    sudo ln -sfn /opt/slurm/include/slurm /usr/include/slurm
  fi
  sudo make --directory="$pyxis_dir" install
  sudo rm -rf -- "$pyxis_dir"
  sudo apt-get clean
fi

8 changes: 4 additions & 4 deletions packer/readme.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

### Install Packer
### Install Packer
https://learn.hashicorp.com/tutorials/packer/get-started-install-cli?in=packer/aws-get-started
```bash
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
Expand Down Expand Up @@ -38,10 +38,10 @@ The document contains multiple blocks, here is the brief description of each

2. __variable__ block contains the variable definition. In this case, the variables are used for AMI name. There are multiple ways to assign variables which include variable file(s) and command-line flag. It will be shown in the later part of the tutorial

3. __source__ block configures a specific builder plugin, which is then invoked by a build block. A source block has two important labels: a builder type and a name. These two labels together will allow us to uniquely reference sources later on when we define build runs. In the example template, the builder type is __amazon-ebs__ and the name is __ubuntu__. We declare the name of the created AMI using the previously declared variables, the region and the instance type which will be used during the building process. __launch_block_device_mappings__ is used to either modify or to add additional volume mounts to the instance. In this case we modify the root volume to have the size of 500 GB. __source_ami_filter__ is used to specify the base AMI used for creation of custom AMI (https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeImages.html).
3. __source__ block configures a specific builder plugin, which is then invoked by a build block. A source block has two important labels: a builder type and a name. These two labels together will allow us to uniquely reference sources later on when we define build runs. In the example template, the builder type is __amazon-ebs__ and the name is __ubuntu__. We declare the name of the created AMI using the previously declared variables, the region and the instance type which will be used during the building process. __launch_block_device_mappings__ is used to either modify or to add additional volume mounts to the instance. In this case we modify the root volume to have the size of 500 GB. __source_ami_filter__ is used to specify the base AMI used for creation of custom AMI (https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeImages.html).


4. __build__ block defines what Packer should do with the EC2 instance after it launches. In the example template, the build block references the AMI defined by the source block above __source.amazon-ebs.ubuntu__. It also contains the __provisioner__ section, which describes the instruction needed to be executed during the building of AMI. In our case, these are stored in the __install_nvidia.sh__ and __install_docker.sh__.
4. __build__ block defines what Packer should do with the EC2 instance after it launches. In the example template, the build block references the AMI defined by the source block above __source.amazon-ebs.ubuntu__. It also contains the __provisioner__ section, which describes the instructions to be executed while building the AMI. In our case, these are stored in __install_nvidia.sh__ and __install_pyxis.bash__.

Before you can build the AMI, you need to provide your AWS credentials to Packer as environment variables. These credentials must have permissions to create, modify and delete EC2 instances. Refer to the documentation to find the full list of IAM permissions required to run the amazon-ebs builder (https://www.packer.io/docs/builders/amazon#iam-task-or-instance-role). Add your AWS credentials as two environment variables, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY, replacing YOUR_ACCESS_KEY and YOUR_SECRET_KEY with their respective values.

Expand All @@ -62,7 +62,7 @@ Format your configuration and validate it
packer fmt .
packer validate .
```
Now you can finally build the AMI
Now you can finally build the AMI

```bash
packer build aws-ubuntu.pkr.hcl
Expand Down
12 changes: 4 additions & 8 deletions playground_cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Then you can use ssh to connect to the head node and check `/var/log/chef-client
which should confirm where the creation is stuck on or `/var/log/parallelcluster/clustermgtd`
that contains the reason why capacity cannot be provisioned"

- https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting.html#retrieving-and-preserve-logs

1.1 Check connection to the head node
```bash
Expand Down Expand Up @@ -170,17 +171,12 @@ sbatch script5.sbatch
squeue
```

#### Using Docker containers

1. Check docker
Connect to the cluster and execute:
```bash
docker images
```
#### Using Docker images

1. ...
TODO ...

2. Submit CPU job using docker container
2. Submit CPU job using docker images


TODO ...
Expand Down
13 changes: 13 additions & 0 deletions setup/playground/post_install.bash
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,17 @@ if [ "${cfn_node_type}" = "ComputeFleet" ]; then

fi


# Enroot / pyxis configuration
echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] Creating enroot directories" >&2

# Write the Slurm plugstack configuration that loads the pyxis SPANK plugin.
# tee creates or truncates the file as root in a single step, so no separate
# "touch" is needed; its stdout copy is discarded.
echo 'required /usr/local/lib/slurm/spank_pyxis.so runtime_path=/tmp/pyxis' \
  | sudo tee /opt/slurm/etc/plugstack.conf > /dev/null

# Point enroot at per-user directories under /shared. Each variable maps to
# the directory of the same kind (cache -> enroot_cache, data -> enroot_data).
export ENROOT_RUNTIME_DIR="/shared/enroot_runtime/${UID}"
export ENROOT_CACHE_DIR="/shared/enroot_cache/${UID}"
export ENROOT_DATA_DIR="/shared/enroot_data/${UID}"

echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] post_install.bash: enroot version: $(enroot version)" >&2

echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] post_install.bash: STOP" >&2
5 changes: 0 additions & 5 deletions setup/users/add_new_user.bash
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,3 @@ if conda --version &> /dev/null; then
sudo bash $script_folder/add_conda_init.bash $USERNAME
echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] Added conda initialization to .bashrc" >&2
fi

if docker --version &> /dev/null; then
sudo usermod -aG docker $USERNAME
echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] Added user to docker group: $(cat /etc/group | grep docker)" >&2
fi

0 comments on commit 14020be

Please sign in to comment.