From 41befd0fcb637e14932b8a7a0a64ddf67d2eea4e Mon Sep 17 00:00:00 2001
From: Zoey Zhang <zozhang@nvidia.com>
Date: Wed, 9 Jul 2025 12:03:17 -0700
Subject: [PATCH 1/4] finetune on dgxcloud with nemo-run and deploy on bedrock
 example

Signed-off-by: Zoey Zhang <zozhang@nvidia.com>
---
 examples/dgxcloud/sft-finetune/convert.sh     |   7 +
 .../finetune-llama3.1-70b-sft.ipynb           | 280 ++++++++++++++++++
 .../sft-finetune/sft-finetune-llama3.1-70b.sh |  46 +++
 3 files changed, 333 insertions(+)
 create mode 100644 examples/dgxcloud/sft-finetune/convert.sh
 create mode 100644 examples/dgxcloud/sft-finetune/finetune-llama3.1-70b-sft.ipynb
 create mode 100644 examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh

diff --git a/examples/dgxcloud/sft-finetune/convert.sh b/examples/dgxcloud/sft-finetune/convert.sh
new file mode 100644
index 00000000..73d3605f
--- /dev/null
+++ b/examples/dgxcloud/sft-finetune/convert.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+torchrun /opt/NeMo/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py \
+--input_name_or_path /demo-workspace/llama3.1-70b-daring-anteater-sft/checkpoints/megatron_gpt_sft.nemo \
+--output_path /demo-workspace/llama-output-weights.bin \
+--hf_input_path /demo-workspace/Meta-Llama-3.1-70B \
+--hf_output_path /demo-workspace/sft-llama-3.1-hf
\ No newline at end of file
diff --git a/examples/dgxcloud/sft-finetune/finetune-llama3.1-70b-sft.ipynb b/examples/dgxcloud/sft-finetune/finetune-llama3.1-70b-sft.ipynb
new file mode 100644
index 00000000..b72b93fa
--- /dev/null
+++ b/examples/dgxcloud/sft-finetune/finetune-llama3.1-70b-sft.ipynb
@@ -0,0 +1,280 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Finetune HF Llama 3.1 70b and Deploy on AWS Bedrock\n",
+    "\n",
+    "This notebook has the following steps: \n",
+    "\n",
+    "1. imports and converts [Llama 3.1 70b](https://huggingface.co/meta-llama/Meta-Llama-3-8B) from Hugging Face transformer file format to .nemo file format\n",
+    "\n",
+    "    Note: you will need to create a HuggingFace account and request access to the model\n",
+    "\n",
+    "2. Supervised Fine Tuning (SFT) using the NeMo framework on the [NVIDIA Daring-Anteater dataset](https://huggingface.co/datasets/nvidia/Daring-Anteater), a comprehensive dataset for instruction tuning\n",
+    "\n",
+    "3. Move your finetuned model to AWS S3 for use with AWS Bedrock Custom Model Import"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Convert Hugging Face Model to NeMo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!pip install ipywidgets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import huggingface_hub\n",
+    "\n",
+    "# Set your Hugging Face access token\n",
+    "huggingface_hub.login(\"<HF_TOKEN>\")\n",
+    "os.makedirs(\"/demo-workspace/Meta-Llama-3.1-70B\" ,exist_ok=True)\n",
+    "huggingface_hub.snapshot_download(repo_id=\"meta-llama/Llama-3.1-70B\", repo_type=\"model\", local_dir=\"Meta-Llama-3.1-70B\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "# clear any previous temporary weights dir if any\n",
+    "rm -r model_weights\n",
+    "\n",
+    "#converter script from NeMo\n",
+    "python /opt/NeMo/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \\\n",
+    "  --precision bf16 \\\n",
+    "  --input_name_or_path=/demo-workspace/Meta-Llama-3.1-70B \\\n",
+    "  --output_path=/demo-workspace/Meta-Llama-3.1-70B.nemo \\\n",
+    "  --llama31 True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import and Configure Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "mkdir /demo-workspace/datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "import json\n",
+    "\n",
+    "dataset = load_dataset(\"nvidia/daring-anteater\")\n",
+    "\n",
+    "for split, shard in dataset.items():\n",
+    "    length = len(shard)\n",
+    "    train_limit = length * 0.85\n",
+    "    with open(\"/demo-workspace/datasets/daring-anteater-train.jsonl\", \"w\") as train:\n",
+    "        with open(\"/demo-workspace/datasets/daring-anteater-val.jsonl\", \"w\") as val:\n",
+    "            for count, line in enumerate(shard):\n",
+    "                desired_data = {\n",
+    "                    \"system\": line[\"system\"],\n",
+    "                    \"conversations\": line[\"conversations\"],\n",
+    "                    \"mask\": line[\"mask\"],\n",
+    "                    \"type\": \"TEXT_TO_VALUE\",\n",
+    "                }\n",
+    "                if count < train_limit:\n",
+    "                    json.dump(desired_data, train)\n",
+    "                    train.write('\\n')\n",
+    "                else:\n",
+    "                    json.dump(desired_data, val)\n",
+    "                    val.write('\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Finetuning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash \n",
+    "chmod +x /demo-workspace/sft-finetune-llama3.1-70b.sh\n",
+    "ls -l /demo-workspace/sft-finetune-llama3.1-70b.sh"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nemo_run as run\n",
+    "\n",
+    "def dgxc_executor(nodes: int = 1, devices: int = 1) -> run.DGXCloudExecutor:\n",
+    "    pvcs = [\n",
+    "        {\n",
+    "            \"name\": \"workspace\",  # Default name to identify the PVC\n",
+    "            \"path\": \"/demo-workspace\",  # Directory where PVC will be mounted in pods\n",
+    "            \"existingPvc\": True,  # The PVC already exists\n",
+    "            \"claimName\": \"llama-3-1-70b-pvc-project-ax4ia\",  # Replace with the name of the PVC to use\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "    return run.DGXCloudExecutor(\n",
+    "        base_url=\"https://tme-aws.nv.run.ai/api/v1\",  # Base URL to send API requests\n",
+    "        app_id=\"aws-app\",  # Name of the Application\n",
+    "        app_secret=\"<APP_SECRET>\",  # Application secret token\n",
+    "        project_name=\"aws-demo-project\",  # Name of the project within Run:ai\n",
+    "        nodes=nodes,  # Number of nodes to run on\n",
+    "        gpus_per_node=devices,  # Number of processes per node to use\n",
+    "        container_image=\"nvcr.io/nvidia/nemo:25.02\",  # Which container to deploy\n",
+    "        pvcs=pvcs,  # Attach the PVC(s) to the pod\n",
+    "        launcher=\"torchrun\",  # Use torchrun to launch the processes\n",
+    "        env_vars={\n",
+    "            \"PYTHONPATH\": \"/demo-workspace/nemo-run:$PYTHONPATH\",  # Add the NeMo-Run directory to the PYTHONPATH\n",
+    "            \"HF_TOKEN\": \"<HF_TOKEN>\",  # Add your Hugging Face API token here\n",
+    "            \"FI_EFA_USE_HUGE_PAGE\":\"0\",\n",
+    "            \"TORCH_HOME\":\"/demo-workspace/.cache\",\n",
+    "            \"NEMORUN_HOME\":\"/demo-workspace/nemo-run\",\n",
+    "            \"OMP_NUM_THREADS\":\"1\",\n",
+    "        },\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "executor = dgxc_executor(nodes=4, devices=8)\n",
+    "run.config.set_nemorun_home(\"/demo-workspace/nemo-run\")\n",
+    "\n",
+    "with run.Experiment(\"sft-finetuning\") as exp:\n",
+    "    exp.add(run.Script(\"/demo-workspace/sft-finetune-llama3.1-70b.sh\"),executor=executor)\n",
+    "\n",
+    "    # Launch the experiment on the cluster\n",
+    "    exp.run(sequential=True)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import Model to AWS S3\n",
+    "\n",
+    "To prepare the model for use with BedRock, we must first convert our finetuned model weights back to HF safetensors. The model and the original llama 3.0 tokens will then be sent to your S3 bucket. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "python /opt/NeMo/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py \\\n",
+    "--input_name_or_path /demo-workspace/llama3.1-70b-daring-anteater-sft/checkpoints/megatron_gpt_sft.nemo \\\n",
+    "--output_path /demo-workspace/llama-output-weights.bin \\\n",
+    "--hf_input_path /demo-workspace/Meta-Llama-3.1-70B \\\n",
+    "--hf_output_path /demo-workspace/sft-llama-3.1-hf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "export BUCKET_NAME=hf-llama3-1-70b\n",
+    "\n",
+    "export AWS_ACCESS_KEY_ID=<AWS_ACCESS_KEY_ID>\n",
+    "export AWS_SECRET_ACCESS_KEY=<AWS_SECRET_ACCESS_KEY>\n",
+    "./s5cmd cp /demo-workspace/sft-llama-3.1-hf s3://$BUCKET_NAME\n",
+    "\n",
+    "./s5cmd cp /demo-workspace/Meta-Llama-3.1-70B/tokenizer.json s3://$BUCKET_NAME/sft-llama-3.1-hf/\n",
+    "./s5cmd cp /demo-workspace/Meta-Llama-3.1-70B/tokenizer_config.json s3://$BUCKET_NAME/sft-llama-3.1-hf/\n",
+    "./s5cmd cp /demo-workspace/Meta-Llama-3.1-70B/original/tokenizer.model s3://$BUCKET_NAME/sft-llama-3.1-hf/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To run with BedRock, go to the Custom Model import feature and load your model from your S3 bucket. Once the model is ready, it can directly be used for your production inference. "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh b/examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh
new file mode 100644
index 00000000..e262978e
--- /dev/null
+++ b/examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Set paths to the model, train, validation and test sets.
+MODEL="/demo-workspace/Meta-Llama-3.1-70B.nemo"
+TRAIN_DS="/demo-workspace/datasets/daring-anteater-train.jsonl"
+VALID_DS="/demo-workspace/datasets/daring-anteater-val.jsonl"
+TEST_DS="/demo-workspace/datasets/daring-anteater-val.jsonl"
+TEST_NAMES="[daring-anteater]"
+
+SCHEME="none"  # SFT is none
+TP_SIZE=4
+PP_SIZE=4
+
+OUTPUT_DIR="/demo-workspace/llama3.1-70b-daring-anteater-sft"
+
+export HYDRA_FULL_ERROR=1
+
+torchrun /opt/NeMo-Aligner/examples/nlp/gpt/train_gpt_sft.py \
+   trainer.precision=bf16 \
+   trainer.num_nodes=4 \
+   trainer.devices=8 \
+   trainer.sft.max_steps=-1 \
+   trainer.sft.limit_val_batches=40 \
+   trainer.sft.val_check_interval=1000 \
+   model.megatron_amp_O2=True \
+   model.restore_from_path=${MODEL} \
+   model.optim.lr=5e-6 \
+   model.tensor_model_parallel_size=${TP_SIZE} \
+   model.pipeline_model_parallel_size=${PP_SIZE} \
+   model.context_parallel_size=2 \
+   model.data.chat=True \
+   model.data.num_workers=0 \
+   model.data.train_ds.micro_batch_size=1 \
+   model.data.train_ds.global_batch_size=32 \
+   model.data.train_ds.max_seq_length=8192 \
+   model.data.train_ds.file_path=${TRAIN_DS} \
+   model.data.validation_ds.micro_batch_size=1 \
+   model.data.validation_ds.global_batch_size=4 \
+   model.data.validation_ds.file_path=${VALID_DS} \
+   model.data.validation_ds.max_seq_length=8192 \
+   exp_manager.create_wandb_logger=False \
+   exp_manager.explicit_log_dir=${OUTPUT_DIR} \
+   exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
+   exp_manager.checkpoint_callback_params.save_top_k=1 \
+   exp_manager.checkpoint_callback_params.monitor=val_loss \
+

From 1085b19242bc805a9bb12856f8fe98c12e21599f Mon Sep 17 00:00:00 2001
From: Zoey Zhang <zozhang@nvidia.com>
Date: Wed, 9 Jul 2025 13:12:11 -0700
Subject: [PATCH 2/4] removing trailing slash

Signed-off-by: Zoey Zhang <zozhang@nvidia.com>
---
 examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh b/examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh
index e262978e..cdb2c4ca 100644
--- a/examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh
+++ b/examples/dgxcloud/sft-finetune/sft-finetune-llama3.1-70b.sh
@@ -42,5 +42,5 @@ torchrun /opt/NeMo-Aligner/examples/nlp/gpt/train_gpt_sft.py \
    exp_manager.explicit_log_dir=${OUTPUT_DIR} \
    exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
    exp_manager.checkpoint_callback_params.save_top_k=1 \
-   exp_manager.checkpoint_callback_params.monitor=val_loss \
+   exp_manager.checkpoint_callback_params.monitor=val_loss
 

From 1e9b6d14efbea02752041699ff8f0a28183d7a84 Mon Sep 17 00:00:00 2001
From: Zoey Zhang <zozhang@nvidia.com>
Date: Wed, 9 Jul 2025 13:29:22 -0700
Subject: [PATCH 3/4] reformatting notebook

Signed-off-by: Zoey Zhang <zozhang@nvidia.com>
---
 .../finetune-llama3.1-70b-sft.ipynb           | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/examples/dgxcloud/sft-finetune/finetune-llama3.1-70b-sft.ipynb b/examples/dgxcloud/sft-finetune/finetune-llama3.1-70b-sft.ipynb
index b72b93fa..dc619df8 100644
--- a/examples/dgxcloud/sft-finetune/finetune-llama3.1-70b-sft.ipynb
+++ b/examples/dgxcloud/sft-finetune/finetune-llama3.1-70b-sft.ipynb
@@ -52,8 +52,10 @@
     "\n",
     "# Set your Hugging Face access token\n",
     "huggingface_hub.login(\"<HF_TOKEN>\")\n",
-    "os.makedirs(\"/demo-workspace/Meta-Llama-3.1-70B\" ,exist_ok=True)\n",
-    "huggingface_hub.snapshot_download(repo_id=\"meta-llama/Llama-3.1-70B\", repo_type=\"model\", local_dir=\"Meta-Llama-3.1-70B\")"
+    "os.makedirs(\"/demo-workspace/Meta-Llama-3.1-70B\", exist_ok=True)\n",
+    "huggingface_hub.snapshot_download(\n",
+    "    repo_id=\"meta-llama/Llama-3.1-70B\", repo_type=\"model\", local_dir=\"Meta-Llama-3.1-70B\"\n",
+    ")"
    ]
   },
   {
@@ -119,10 +121,10 @@
     "                }\n",
     "                if count < train_limit:\n",
     "                    json.dump(desired_data, train)\n",
-    "                    train.write('\\n')\n",
+    "                    train.write(\"\\n\")\n",
     "                else:\n",
     "                    json.dump(desired_data, val)\n",
-    "                    val.write('\\n')"
+    "                    val.write(\"\\n\")"
    ]
   },
   {
@@ -138,7 +140,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%bash \n",
+    "%%bash\n",
+    "\n",
     "chmod +x /demo-workspace/sft-finetune-llama3.1-70b.sh\n",
     "ls -l /demo-workspace/sft-finetune-llama3.1-70b.sh"
    ]
@@ -151,6 +154,7 @@
    "source": [
     "import nemo_run as run\n",
     "\n",
+    "\n",
     "def dgxc_executor(nodes: int = 1, devices: int = 1) -> run.DGXCloudExecutor:\n",
     "    pvcs = [\n",
     "        {\n",
@@ -174,10 +178,10 @@
     "        env_vars={\n",
     "            \"PYTHONPATH\": \"/demo-workspace/nemo-run:$PYTHONPATH\",  # Add the NeMo-Run directory to the PYTHONPATH\n",
     "            \"HF_TOKEN\": \"<HF_TOKEN>\",  # Add your Hugging Face API token here\n",
-    "            \"FI_EFA_USE_HUGE_PAGE\":\"0\",\n",
-    "            \"TORCH_HOME\":\"/demo-workspace/.cache\",\n",
-    "            \"NEMORUN_HOME\":\"/demo-workspace/nemo-run\",\n",
-    "            \"OMP_NUM_THREADS\":\"1\",\n",
+    "            \"FI_EFA_USE_HUGE_PAGE\": \"0\",\n",
+    "            \"TORCH_HOME\": \"/demo-workspace/.cache\",\n",
+    "            \"NEMORUN_HOME\": \"/demo-workspace/nemo-run\",\n",
+    "            \"OMP_NUM_THREADS\": \"1\",\n",
     "        },\n",
     "    )"
    ]
@@ -194,10 +198,10 @@
     "run.config.set_nemorun_home(\"/demo-workspace/nemo-run\")\n",
     "\n",
     "with run.Experiment(\"sft-finetuning\") as exp:\n",
-    "    exp.add(run.Script(\"/demo-workspace/sft-finetune-llama3.1-70b.sh\"),executor=executor)\n",
+    "    exp.add(run.Script(\"/demo-workspace/sft-finetune-llama3.1-70b.sh\"), executor=executor)\n",
     "\n",
     "    # Launch the experiment on the cluster\n",
-    "    exp.run(sequential=True)\n"
+    "    exp.run(sequential=True)"
    ]
   },
   {
@@ -218,6 +222,7 @@
    "outputs": [],
    "source": [
     "%%bash\n",
+    "\n",
     "python /opt/NeMo/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py \\\n",
     "--input_name_or_path /demo-workspace/llama3.1-70b-daring-anteater-sft/checkpoints/megatron_gpt_sft.nemo \\\n",
     "--output_path /demo-workspace/llama-output-weights.bin \\\n",

From fbbeb352315dfb441c0f4cf9835f21315df4fe56 Mon Sep 17 00:00:00 2001
From: Zoey Zhang <zozhang@nvidia.com>
Date: Wed, 9 Jul 2025 13:55:04 -0700
Subject: [PATCH 4/4] adding EOF

Signed-off-by: Zoey Zhang <zozhang@nvidia.com>
---
 examples/dgxcloud/sft-finetune/convert.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/dgxcloud/sft-finetune/convert.sh b/examples/dgxcloud/sft-finetune/convert.sh
index 73d3605f..5ae284c4 100644
--- a/examples/dgxcloud/sft-finetune/convert.sh
+++ b/examples/dgxcloud/sft-finetune/convert.sh
@@ -4,4 +4,4 @@ torchrun /opt/NeMo/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py \
 --input_name_or_path /demo-workspace/llama3.1-70b-daring-anteater-sft/checkpoints/megatron_gpt_sft.nemo \
 --output_path /demo-workspace/llama-output-weights.bin \
 --hf_input_path /demo-workspace/Meta-Llama-3.1-70B \
---hf_output_path /demo-workspace/sft-llama-3.1-hf
\ No newline at end of file
+--hf_output_path /demo-workspace/sft-llama-3.1-hf