Add TF multi-gpu example

paulasanematsu · paulasanematsu · commit 2649d9a4aff5 · 2023-10-24T15:52:41.000-04:00
diff --git a/AI/TensorFlow/Example4/README.md b/AI/TensorFlow/Example4/README.md
@@ -0,0 +1,14 @@
+## Purpose
+
+Show how to use multiple GPUs on a single node with TensorFlow.
+
+## Contents
+
+- `tf_test_multi_gpu.py`: Modified version of [`tf_test.py`](../tf_test.py) that uses all available GPUs on a node
+- `run.sbatch`: Slurm batch-job submission script that pulls the Singularity image and runs `tf_test_multi_gpu.py`
+- `tf_test.out`: Output file written by the batch job
+
+## Important notes
+
+1. In this example, the Slurm batch script pulls a Singularity container with TensorFlow and runs the example inside that container. However, you can modify the `run.sbatch` script to run within a conda/mamba environment instead.
+
diff --git a/AI/TensorFlow/Example4/run.sbatch b/AI/TensorFlow/Example4/run.sbatch
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH -p gpu
+#SBATCH -c 8
+#SBATCH -t 00:30:00
+#SBATCH -J tf_test
+#SBATCH -o tf_test.out
+#SBATCH -e tf_test.err
+#SBATCH --gres=gpu:4
+#SBATCH --mem=8G
+
+# Pull the singularity image only when it is not already present: this is a one-time
+# setup, and without --force `singularity pull` will not overwrite an existing image.
+[ -f tensorflow_latest-gpu.sif ] || srun -c "$SLURM_CPUS_PER_TASK" singularity pull --disable-cache docker://tensorflow/tensorflow:latest-gpu
+
+# --- run code tf_test_multi_gpu.py ---
+singularity exec --nv tensorflow_latest-gpu.sif python tf_test_multi_gpu.py
diff --git a/AI/TensorFlow/Example4/tf_test.out b/AI/TensorFlow/Example4/tf_test.out
@@ -0,0 +1,29 @@
+2.14.0
+Number of devices: 4
+Epoch 1/10
+1875/1875 - 13s - loss: 0.5010 - accuracy: 0.8232 - 13s/epoch - 7ms/step
+Epoch 2/10
+1875/1875 - 7s - loss: 0.3747 - accuracy: 0.8646 - 7s/epoch - 4ms/step
+Epoch 3/10
+1875/1875 - 7s - loss: 0.3367 - accuracy: 0.8768 - 7s/epoch - 4ms/step
+Epoch 4/10
+1875/1875 - 7s - loss: 0.3129 - accuracy: 0.8856 - 7s/epoch - 4ms/step
+Epoch 5/10
+1875/1875 - 7s - loss: 0.2960 - accuracy: 0.8910 - 7s/epoch - 4ms/step
+Epoch 6/10
+1875/1875 - 7s - loss: 0.2799 - accuracy: 0.8973 - 7s/epoch - 4ms/step
+Epoch 7/10
+1875/1875 - 7s - loss: 0.2685 - accuracy: 0.9000 - 7s/epoch - 4ms/step
+Epoch 8/10
+1875/1875 - 7s - loss: 0.2580 - accuracy: 0.9036 - 7s/epoch - 4ms/step
+Epoch 9/10
+1875/1875 - 7s - loss: 0.2480 - accuracy: 0.9083 - 7s/epoch - 4ms/step
+Epoch 10/10
+1875/1875 - 7s - loss: 0.2383 - accuracy: 0.9110 - 7s/epoch - 4ms/step
+313/313 - 1s - loss: 0.3294 - accuracy: 0.8843 - 1s/epoch - 5ms/step
+
+Test accuracy: 0.8842999935150146
+313/313 - 1s - 1s/epoch - 4ms/step
+[4.0451411e-07 4.3211493e-12 1.8949876e-10 1.1165977e-12 3.1353355e-08
+ 1.5895354e-03 4.6215266e-08 5.1007383e-03 7.3685516e-07 9.9330854e-01]
+9
diff --git a/AI/TensorFlow/Example4/tf_test_multi_gpu.py b/AI/TensorFlow/Example4/tf_test_multi_gpu.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+from __future__ import absolute_import, division, print_function, unicode_literals
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import tensorflow as tf
+from tensorflow import keras
+import numpy as np
+
+print(tf.__version__)
+
+# Create a MirroredStrategy.
+strategy = tf.distribute.MirroredStrategy()
+print("Number of devices: {}".format(strategy.num_replicas_in_sync))
+
+# Open a strategy scope.
+with strategy.scope():
+    # Everything that creates variables should be under the strategy scope.
+    # In general this is only model construction & `compile()`.
+    fashion_mnist = keras.datasets.fashion_mnist
+    (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
+    
+    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
+    
+    train_images = train_images / 255.0
+    test_images = test_images / 255.0
+    model = keras.Sequential([
+    	keras.layers.Flatten(input_shape=(28, 28)),
+    	keras.layers.Dense(128, activation='relu'),
+    	keras.layers.Dense(10, activation='softmax')
+       ])
+    
+    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
+
+# you can change verbose=1 to see progress bars when running interactively
+model.fit(train_images, train_labels, epochs=10, verbose=2)
+
+# you can change verbose=1 to see progress bars when running interactively
+test_loss, test_acc = model.evaluate(test_images,  test_labels, verbose=2)
+print('\nTest accuracy:', test_acc)
+
+# you can change verbose=1 to see progress bars when running interactively
+predictions = model.predict(test_images, verbose=2)
+print(predictions[0])
+print(np.argmax(predictions[0]))
+
+