
Commit 0598ea8
Added steps to install slinky on K8s and example training workload
1 parent 0640227 commit 0598ea8
6 files changed: +1157 -0 lines changed

slinky/Readme.md (+139 lines)

# Example Slinky Training Workload on Kubernetes

The following outlines the steps to get up and running with Slinky on Kubernetes and to run a simple image classification training workload that verifies the GPUs are accessible.

## Clone this repo and go into the slinky folder

```bash
git clone https://github.com/amd/ada.git
cd ada/slinky
```

## Installing Slinky Prerequisites

The following steps for installing the prerequisites and Slinky itself are taken from the SlinkyProject/slurm-operator [quick-start guide](https://github.com/SlinkyProject/slurm-operator/blob/main/docs/quickstart.md).

```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/
helm repo add bitnami https://charts.bitnami.com/bitnami
helm repo add jetstack https://charts.jetstack.io
helm repo update
helm install cert-manager jetstack/cert-manager \
  --namespace cert-manager --create-namespace --set crds.enabled=true
helm install prometheus prometheus-community/kube-prometheus-stack \
  --namespace prometheus --create-namespace --set installCRDs=true
```
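
Optionally, confirm the prerequisite charts came up cleanly before installing the operator (namespaces as created by the commands above):

```bash
kubectl --namespace=cert-manager get pods
kubectl --namespace=prometheus get pods
```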

## Installing Slinky Operator

```bash
helm install slurm-operator oci://ghcr.io/slinkyproject/charts/slurm-operator \
  --values=values-operator.yaml --version=0.1.0 --namespace=slinky --create-namespace
```

Make sure the operator deployed successfully with:

```sh
kubectl --namespace=slinky get pods
```

Output should be similar to:

```sh
NAME                                      READY   STATUS    RESTARTS   AGE
slurm-operator-7444c844d5-dpr5h           1/1     Running   0          5m00s
slurm-operator-webhook-6fd8d7857d-zcvqh   1/1     Running   0          5m00s
```
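
If the pods are still starting, you can block until the operator deployments report ready (namespace as created above; the timeout is an arbitrary choice):

```bash
kubectl wait --namespace=slinky --for=condition=Available deployment --all --timeout=300s
```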

## Installing Slurm Cluster

```bash
helm install slurm oci://ghcr.io/slinkyproject/charts/slurm \
  --values=values-slurm.yaml --version=0.1.0 --namespace=slurm --create-namespace
```

Make sure the Slurm cluster deployed successfully with:

```sh
kubectl --namespace=slurm get pods
```

Output should be similar to:

```sh
NAME                              READY   STATUS    RESTARTS   AGE
slurm-accounting-0                1/1     Running   0          5m00s
slurm-compute-gpu-node            1/1     Running   0          5m00s
slurm-controller-0                2/2     Running   0          5m00s
slurm-exporter-7b44b6d856-d86q5   1/1     Running   0          5m00s
slurm-mariadb-0                   1/1     Running   0          5m00s
slurm-restapi-5f75db85d9-67gpl    1/1     Running   0          5m00s
```
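
The Slurm pods can take several minutes to pull images and initialize. A wait command similar to the one above can be used to block until they are ready (timeout is again an arbitrary choice):

```bash
kubectl wait --namespace=slurm --for=condition=Ready pod --all --timeout=600s
```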

## Prepping Compute Node

1. Get the Slurm compute node pod name

```bash
SLURM_COMPUTE_POD=$(kubectl get pods -n slurm | grep ^slurm-compute-gpu-node | awk '{print $1}'); echo $SLURM_COMPUTE_POD
```

2. Add the slurm user to the video and render groups and create the slurm user's home directory on the Slurm compute node

```bash
kubectl exec -it -n slurm $SLURM_COMPUTE_POD -- bash -c "
usermod -aG video,render slurm
mkdir -p /home/slurm
chown slurm:slurm /home/slurm"
```
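
You can verify that the group membership and home directory changes took effect (same pod variable as step 1):

```bash
kubectl exec -n slurm $SLURM_COMPUTE_POD -- bash -c "id slurm && ls -ld /home/slurm"
```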

3. Copy the PyTorch test script to the Slurm compute node

```bash
kubectl cp test.py slurm/$SLURM_COMPUTE_POD:/tmp/test.py
```

4. Copy the Fashion MNIST image classification model training script to the Slurm compute node

```bash
kubectl cp train_fashion_mnist.py slurm/$SLURM_COMPUTE_POD:/tmp/train_fashion_mnist.py
```
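
To confirm both scripts landed where expected on the compute node:

```bash
kubectl exec -n slurm $SLURM_COMPUTE_POD -- ls -l /tmp/test.py /tmp/train_fashion_mnist.py
```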

5. Run the test.py script on the compute node to confirm the GPUs are accessible

```bash
kubectl exec -it slurm-controller-0 -n slurm -- srun python3 test.py
```
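
srun inherits its working directory from where it is invoked on the controller pod, which may not contain the copied script. If the command above cannot find test.py, point it at the path used in step 3:

```bash
kubectl exec -it slurm-controller-0 -n slurm -- srun python3 /tmp/test.py
```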

6. Run the single-GPU training script on the compute node

```bash
kubectl exec -it slurm-controller-0 -n slurm -- srun python3 train_fashion_mnist.py
```
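
An interactive srun holds your terminal for the whole run. For longer training jobs you may prefer batch submission; a minimal sbatch sketch, assuming the script was copied to /tmp as in step 4 (the job name and output path below are arbitrary choices):

```bash
kubectl exec -it slurm-controller-0 -n slurm -- bash -c '
cat > /tmp/train_job.sh <<"EOF"
#!/bin/bash
#SBATCH --job-name=fashion-mnist
#SBATCH --output=/tmp/fashion-mnist-%j.out
python3 /tmp/train_fashion_mnist.py
EOF
sbatch /tmp/train_job.sh'
```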

7. Run the multi-GPU training script on the compute node

```bash
kubectl exec -it slurm-controller-0 -n slurm -- srun apptainer exec --rocm --bind /tmp:/tmp torch_rocm.sif torchrun --standalone --nnodes=1 --nproc_per_node=8 --master-addr localhost train_mnist_distributed.py
```
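
Note that step 4 only copies train_fashion_mnist.py, and the command above assumes a torch_rocm.sif Apptainer image already exists on the compute node and that the node exposes 8 GPUs. If needed, copy the distributed script the same way as the other scripts and pull an image from a ROCm PyTorch container (the image tag below is an assumption, not pinned by this repo), adjusting --nproc_per_node to the GPU count reported by test.py:

```bash
kubectl cp train_mnist_distributed.py slurm/$SLURM_COMPUTE_POD:/tmp/train_mnist_distributed.py
kubectl exec -it -n slurm $SLURM_COMPUTE_POD -- bash -c "cd /tmp && apptainer pull torch_rocm.sif docker://rocm/pytorch:latest"
```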

## Other Useful Slurm Commands

### Check Slurm Node Info

```bash
kubectl exec -it slurm-controller-0 -n slurm -- sinfo
```

### Check Job Queue

```bash
kubectl exec -it slurm-controller-0 -n slurm -- squeue
```

### Check Node Resources

```bash
kubectl exec -it slurm-controller-0 -n slurm -- sinfo -N -o "%N %G"
```
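
When a node shows an unexpected state or GRES configuration, scontrol gives a more detailed per-node view; it can be queried through the controller pod in the same way as the commands above:

```bash
kubectl exec -it slurm-controller-0 -n slurm -- scontrol show nodes
```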

slinky/test.py (+12 lines)

```python
# run this command to check if the GPUs are available
# srun -N 2 --gpus=16 -t 00:02:00 python3 test.py
import torch

if torch.cuda.is_available():
    print(f"GPUs available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f" - GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f" - GPU {i} PyTorch and ROCm version: {torch.__version__}")
        print(f" - GPU {i} NCCL version: {torch.cuda.nccl.version()}")
else:
    print("No GPUs available.")
```

slinky/train_fashion_mnist.py (+120 lines)

```python
import os

# Set the Torch Distributed env variables so the training function can be run
# locally as a single process.
# See https://pytorch.org/docs/stable/elastic/run.html#environment-variables
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "1234"

def train_fashion_mnist():
    import torch
    import torch.distributed as dist
    import torch.nn.functional as F
    from torch import nn
    from torch.utils.data import DataLoader, DistributedSampler
    from torchvision import datasets, transforms

    # Define the PyTorch CNN model to be trained
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 20, 5, 1)
            self.conv2 = nn.Conv2d(20, 50, 5, 1)
            self.fc1 = nn.Linear(4 * 4 * 50, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.max_pool2d(x, 2, 2)
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2, 2)
            x = x.view(-1, 4 * 4 * 50)
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    # Use NCCL if a GPU is available, otherwise use Gloo as the communication backend.
    device, backend = ("cuda", "nccl") if torch.cuda.is_available() else ("cpu", "gloo")
    print(f"Using Device: {device}, Backend: {backend}")

    # Set up PyTorch distributed.
    local_rank = int(os.getenv("LOCAL_RANK", 0))
    dist.init_process_group(backend=backend)
    print(
        "Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format(
            dist.get_world_size(),
            dist.get_rank(),
            local_rank,
        )
    )

    # Create the model and load it onto the device.
    device = torch.device(f"{device}:{local_rank}")
    model = nn.parallel.DistributedDataParallel(Net().to(device))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    # Download the FashionMNIST dataset only on the local_rank=0 process.
    if local_rank == 0:
        dataset = datasets.FashionMNIST(
            "./data",
            train=True,
            download=True,
            transform=transforms.Compose([transforms.ToTensor()]),
        )
    dist.barrier()
    dataset = datasets.FashionMNIST(
        "./data",
        train=True,
        download=False,
        transform=transforms.Compose([transforms.ToTensor()]),
    )

    # Shard the dataset across workers.
    train_loader = DataLoader(
        dataset,
        batch_size=100,
        sampler=DistributedSampler(dataset),
    )

    # TODO(astefanutti): add parameters to the training function
    dist.barrier()
    for epoch in range(1, 10):
        model.train()

        # Iterate over mini-batches from the training set
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            # Copy the data to the GPU device if available
            inputs, labels = inputs.to(device), labels.to(device)
            # Forward pass
            outputs = model(inputs)
            loss = F.nll_loss(outputs, labels)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0 and dist.get_rank() == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(inputs),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    # Wait for the distributed training to complete
    dist.barrier()
    if dist.get_rank() == 0:
        print("Training is finished")

    # Finally clean up PyTorch distributed
    dist.destroy_process_group()

# Run the training function locally.
train_fashion_mnist()
```

slinky/train_mnist_distributed.py (+117 lines)

```python
import os

# Provide default Torch Distributed env variables so the training function can
# also be run locally as a single process. Using setdefault keeps the values
# injected by torchrun (RANK, WORLD_SIZE, LOCAL_RANK, ...) when the script is
# launched through it.
# See https://pytorch.org/docs/stable/elastic/run.html#environment-variables
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

def train_fashion_mnist():
    import torch
    import torch.distributed as dist
    import torch.nn.functional as F
    from torch import nn
    from torch.utils.data import DataLoader, DistributedSampler
    from torchvision import datasets, transforms

    # Define the PyTorch CNN model to be trained
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 20, 5, 1)
            self.conv2 = nn.Conv2d(20, 50, 5, 1)
            self.fc1 = nn.Linear(4 * 4 * 50, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.max_pool2d(x, 2, 2)
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2, 2)
            x = x.view(-1, 4 * 4 * 50)
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    # Use NCCL if a GPU is available, otherwise use Gloo as the communication backend.
    device, backend = ("cuda", "nccl") if torch.cuda.is_available() else ("cpu", "gloo")
    print(f"Using Device: {device}, Backend: {backend}")

    # Set up PyTorch distributed.
    local_rank = int(os.getenv("LOCAL_RANK", 0))
    dist.init_process_group(backend=backend)
    print(
        "Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format(
            dist.get_world_size(),
            dist.get_rank(),
            local_rank,
        )
    )

    # Create the model and load it onto the device.
    device = torch.device(f"{device}:{local_rank}")
    model = nn.parallel.DistributedDataParallel(Net().to(device))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    # Download the FashionMNIST dataset only on the local_rank=0 process.
    if local_rank == 0:
        dataset = datasets.FashionMNIST(
            "./data",
            train=True,
            download=True,
            transform=transforms.Compose([transforms.ToTensor()]),
        )
    dist.barrier()
    dataset = datasets.FashionMNIST(
        "./data",
        train=True,
        download=False,
        transform=transforms.Compose([transforms.ToTensor()]),
    )

    # Shard the dataset across workers.
    train_loader = DataLoader(
        dataset,
        batch_size=100,
        sampler=DistributedSampler(dataset),
    )

    # TODO(astefanutti): add parameters to the training function
    dist.barrier()
    for epoch in range(1, 10):
        model.train()

        # Iterate over mini-batches from the training set
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            # Copy the data to the GPU device if available
            inputs, labels = inputs.to(device), labels.to(device)
            # Forward pass
            outputs = model(inputs)
            loss = F.nll_loss(outputs, labels)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0 and dist.get_rank() == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(inputs),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    # Wait for the distributed training to complete
    dist.barrier()
    if dist.get_rank() == 0:
        print("Training is finished")

    # Finally clean up PyTorch distributed
    dist.destroy_process_group()

# Run the training function.
train_fashion_mnist()
```
