diff --git a/examples/pytorch/text-classification/Fine-Tune-Llama3-LLM.ipynb b/examples/pytorch/text-classification/Fine-Tune-Llama3-LLM.ipynb new file mode 100644 index 0000000000..259cb0cce2 --- /dev/null +++ b/examples/pytorch/text-classification/Fine-Tune-Llama3-LLM.ipynb @@ -0,0 +1,399 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "270ce448", + "metadata": {}, + "source": [ + "# Fine-Tune & Serve Llama3 with Kubeflow PyTorchJob in a Kubeflow Pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "73619a20", + "metadata": {}, + "source": [ + "This Notebook will do the following:\n", + "1. Fine-tune the meta-llama/Llama-3.1-8B-Instruct model on the KubeCon, India 2024 dataset using distributed training with [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/overview/).\n", + "2. Serve the fine-tuned model using KServe.\n", + " \n", + "We are using [Kubeflow Pipelines](https://www.kubeflow.org/docs/components/pipelines/) to run this end-to-end LLM pipeline.\n", + "\n", + "\n", + "Llama3 model: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct\n", + "\n", + "KubeCon, India 2024 dataset: https://huggingface.co/datasets/aishwaryayyy/events_data\n", + "\n", + "This Notebook requires:\n", + "1. 1 GPU on your Kubernetes cluster for fine-tuning and later serving the fine-tuned model.\n", + "2. 1 GPU on your Notebook node to load the fine-tuned model by merging PEFT weights.\n", + "\n", + "We need to install the Kubeflow Pipelines packages and import the dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e002b46-a18d-4805-ab57-9be5ed7a07eb", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install kfp kfp-kubernetes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af5e73b2", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "from kfp import client\n", + "from kfp import dsl\n", + "from kfp.dsl import Dataset\n", + "from kfp.dsl import Input\n", + "from kfp.dsl import Model\n", + "from kfp.dsl import Output" + ] + }, + { + "cell_type": "markdown", + "id": "1199e8b6", + "metadata": {}, + "source": [ + "## Fine-Tune the Llama3 Model with the KubeCon Dataset\n", + "\n", + "In this component, we use TrainingClient() to create a PyTorchJob that fine-tunes the Llama3 model on 1 worker with 1 GPU.\n", + "\n", + "Specify the required packages in the *dsl.component* decorator. This Kubeflow component needs the kubeflow-pytorchjob, kubeflow-training[huggingface], and numpy packages.\n", + "\n", + "Replace HUGGINGFACE_TOKEN with your own token. It must have access to the [Llama3 model](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a4d1541-0892-490b-b9b7-fc4057cce174", + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.component(packages_to_install=['kubeflow-pytorchjob', 'kubeflow-training[huggingface]', 'numpy<1.24'])\n", + "def finetune_model():\n", + "\n", + " from kubeflow.training.api.training_client import TrainingClient\n", + " from kubeflow.storage_initializer.s3 import S3DatasetParams\n", + " from kubeflow.storage_initializer.hugging_face import (\n", + " HuggingFaceModelParams,\n", + " HuggingFaceTrainerParams,\n", + " HuggingFaceDatasetParams,\n", + " )\n", + " from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n", + " from peft import LoraConfig\n", + " import transformers\n", + " from transformers import TrainingArguments\n", + " from kubeflow.training import constants\n", + " \n", + " # create a training client, pass the config_file parameter if you want to use a kubeconfig other than \"~/.kube/config\"\n", + " client = TrainingClient()\n", + " OUTPUT = INIT_CONTAINER_MOUNT_PATH + \"/output/llama-3.1-8B-kubecon\"\n", + " HUGGINGFACE_TOKEN = \"YOUR_HUGGINGFACE_TOKEN\"\n", + " \n", + " # specify the model, dataset and training parameters\n", + " client.train(\n", + " name=\"llama-3-1-8b-kubecon\",\n", + " num_workers=1,\n", + " num_procs_per_worker=1,\n", + " # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n", + " storage_config={\n", + " \"size\": \"100Gi\",\n", + " \"storage_class\": \"nfs-storage\",\n", + " },\n", + " model_provider_parameters=HuggingFaceModelParams(\n", + " model_uri=\"hf://meta-llama/Llama-3.1-8B-Instruct\",\n", + " transformer_type=transformers.AutoModelForCausalLM,\n", + " access_token=HUGGINGFACE_TOKEN,\n", + " ),\n", + " # it is assumed that for text-related tasks the dataset has a 'text' column.\n", + " # for more info on how the dataset is loaded, check the load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n", + " dataset_provider_parameters=HuggingFaceDatasetParams(repo_id=\"aishwaryayyy/events_data\"),\n", + " trainer_parameters=HuggingFaceTrainerParams(\n", + " lora_config=LoraConfig(\n", + " r=16,\n", + " lora_alpha=32,\n", + " lora_dropout=0.1,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"]\n", + " ),\n", + " training_parameters=TrainingArguments(\n", + " max_grad_norm=0.4,\n", + " num_train_epochs=3,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=8,\n", + " gradient_checkpointing=True,\n", + " gradient_checkpointing_kwargs={\n", + " \"use_reentrant\": False\n", + " }, # this is mandatory if checkpointing is enabled\n", + " warmup_steps=8,\n", + " learning_rate=2e-4,\n", + " lr_scheduler_type=\"cosine\",\n", + " bf16=True,\n", + " logging_steps=0.01,\n", + " output_dir=OUTPUT,\n", + " optim=\"paged_adamw_32bit\",\n", + " save_steps=0.01,\n", + " save_total_limit=3,\n", + " disable_tqdm=False,\n", + " resume_from_checkpoint=True,\n", + " remove_unused_columns=True,\n", + " # ddp_backend=\"gloo\", # change the backend to gloo if you want CPU-based training and remove the gpu key in resources_per_worker\n", + " ),\n", + " ),\n", + " resources_per_worker={\n", + " \"gpu\": 1,\n", + " \"cpu\": 28,\n", + " \"memory\": \"60Gi\",\n", + " }, # remove the gpu key if you don't want to attach GPUs to the pods\n", + " )\n", + " \n", + " # check the status of
the job\n", + " from kubeflow.pytorchjob import PyTorchJobClient\n", + " import time\n", + "\n", + " time.sleep(30)\n", + "\n", + " pytorchjob_client = PyTorchJobClient()\n", + "\n", + " while True:\n", + " status = pytorchjob_client.get_job_status('llama-3-1-8b-kubecon')\n", + " print(f\"job status {status}\")\n", + " if status == \"Succeeded\":\n", + " print(\"pytorch job has succeeded :)\")\n", + " break\n", + " elif status == \"Failed\" or status == \"Terminated\":\n", + " print(\"pytorch job has failed :(\")\n", + " break\n", + " print(\"waiting for pytorch job to finish\")\n", + " time.sleep(10)" + ] + }, + { + "cell_type": "markdown", + "id": "b3ef449f", + "metadata": {}, + "source": [ + "Merge the PEFT (Parameter-Efficient Fine-Tuning) model weights with the pretrained model to form the fine-tuned model.\n", + " \n", + "Store it on a Persistent Volume shared across Kubeflow Pipeline tasks.\n", + "Also, save the tokenizer along with the fine-tuned model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3915012b-87ed-4d4c-a8a8-8fc106fd3e6b", + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.component(base_image='quay.io/aishquaya/kfp-python:latest')\n", + "def store_model():\n", + " from transformers import AutoTokenizer, AutoModelForCausalLM\n", + " import torch\n", + " from peft import PeftModelForCausalLM\n", + "\n", + " HUGGINGFACE_TOKEN = \"YOUR_HUGGINGFACE_TOKEN\"\n", + " \n", + " model = AutoModelForCausalLM.from_pretrained(\n", + " \"meta-llama/Llama-3.1-8B-Instruct\",\n", + " torch_dtype=torch.float16,\n", + " low_cpu_mem_usage=True,\n", + " device_map=\"cuda:0\",\n", + " token=HUGGINGFACE_TOKEN,\n", + " )\n", + " \n", + " model = PeftModelForCausalLM.from_pretrained(\n", + " model, \"storage-initializer/output/llama-3.1-8B-kubecon/checkpoint-12\"\n", + " )\n", + "\n", + " finetuned_model = model.merge_and_unload()\n", + " finetuned_model.save_pretrained(\"storage-initializer/serve_model/llama-3.1-8B-kubecon\")\n", + "\n", + " pretrained_model = \"meta-llama/Llama-3.1-8B-Instruct\"\n", + " tokenizer = AutoTokenizer.from_pretrained(pretrained_model, token=HUGGINGFACE_TOKEN)\n", + " tokenizer.save_pretrained(\"storage-initializer/serve_model/llama-3.1-8B-kubecon\")" + ] + }, + { + "cell_type": "markdown", + "id": "7cdbc6d3", + "metadata": {}, + "source": [ + "## Serve the Fine-Tuned Model\n", + "\n", + "This component serves the fine-tuned model using KServe. Create an InferenceService with the HuggingFace runtime and a *[6 vCPUs, 24Gi memory and 1 GPU]* resource configuration.\n", + "\n", + "Specify the fine-tuned model's location in the storage_uri field."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17a2b56-0f05-4551-8186-a5ebb97a202f", + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.component(packages_to_install=['kserve', 'git+https://github.com/kubernetes-client/python.git'])\n", + "def serve_model():\n", + " from kubernetes import client\n", + " from kserve import KServeClient\n", + " from kserve import constants\n", + " from kserve import utils\n", + " from kserve import V1beta1InferenceService\n", + " from kserve import V1beta1InferenceServiceSpec\n", + " from kserve import V1beta1PredictorSpec\n", + " from kserve import V1beta1ModelSpec\n", + " from kserve import V1beta1ModelFormat\n", + " from kubernetes.client import V1ResourceRequirements\n", + "\n", + " namespace = utils.get_default_target_namespace()\n", + "\n", + " api_version = constants.KSERVE_GROUP + '/' + 'v1beta1'\n", + " \n", + " isvc = V1beta1InferenceService(\n", + " api_version=api_version,\n", + " kind=\"InferenceService\",\n", + " metadata=client.V1ObjectMeta(name='llama-3-1-8b-kubecon', namespace=namespace),\n", + " spec=V1beta1InferenceServiceSpec(\n", + " predictor=V1beta1PredictorSpec(\n", + " model=V1beta1ModelSpec(\n", + " model_format=V1beta1ModelFormat(name='huggingface'),\n", + " image='kserve/huggingfaceserver:latest',\n", + " storage_uri='pvc://storage-initializer/serve_model/llama-3.1-8B-kubecon',\n", + " resources=V1ResourceRequirements(\n", + " limits={'cpu': '6', 'memory': '24Gi', 'nvidia.com/gpu': '1'},\n", + " requests={'cpu': '6', 'memory': '24Gi', 'nvidia.com/gpu': '1'}\n", + " )\n", + " )\n", + " )))\n", + "\n", + " kserve_client = KServeClient()\n", + " kserve_client.create(isvc)" + ] + }, + { + "cell_type": "markdown", + "id": "24084f0b", + "metadata": {}, + "source": [ + "Initialize the pipeline and link all the tasks declared above, specifying their dependencies on each other.\n", + "\n", + "We have mounted a Persistent Volume Claim (PVC) to share storage space across Kubeflow components. The fine-tuned model will be stored here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da30183e-b310-4d79-b6b9-32ced98a8511", + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.pipeline(name='finetune-llama3-llm-pipeline')\n", + "def e2e_ml_pipeline():\n", + " from kfp import kubernetes\n", + " provision_model_storage = kubernetes.CreatePVC(\n", + " # can also use pvc_name instead of pvc_name_suffix to use a pre-existing PVC\n", + " pvc_name='storage-initializer',\n", + " access_modes=['ReadWriteOnce'],\n", + " size='100Gi',\n", + " storage_class_name='nai-nfs-storage',\n", + " )\n", + "\n", + " training_task = finetune_model()\n", + " merging_task = store_model()\n", + " serving_task = serve_model()\n", + " training_task.after(provision_model_storage)\n", + " merging_task.after(training_task)\n", + " serving_task.after(merging_task)\n", + "\n", + " serving_task.set_caching_options(False)\n", + " merging_task.set_caching_options(False)\n", + " # training_task.set_caching_options(False)\n", + " \n", + " merging_task.add_node_selector_constraint('nvidia.com/gpu')\n", + " merging_task.set_gpu_limit(1)\n", + " \n", + " kubernetes.mount_pvc(\n", + " merging_task,\n", + " pvc_name=provision_model_storage.outputs['name'],\n", + " mount_path='/storage-initializer',\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "4b26aa0d", + "metadata": {}, + "source": [ + "Create a run for the pipeline using the Kubeflow Pipelines client."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65f439d1-6589-4c22-be7d-f1b033b5b20f", + "metadata": {}, + "outputs": [], + "source": [ + "kfp_client = client.Client()\n", + "run = kfp_client.create_run_from_pipeline_func(\n", + " e2e_ml_pipeline,\n", + " arguments={},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b1e8c41a", + "metadata": {}, + "source": [ + "Once all the Kubeflow tasks in the pipeline have completed, the fine-tuned model should be ready to serve inference requests. You can port-forward the inference pod and send inference requests as shown below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "524474db-a122-4f6d-903c-d4e252a30cbe", + "metadata": {}, + "outputs": [], + "source": [ + "# ! curl --location 'http://localhost:8083/openai/v1/chat/completions' \\\n", + "# --header 'Content-Type: application/json' \\\n", + "# --data '{ \"model\": \"llama-3-1-8b-kubecon\", \"messages\": [{ \"role\": \"user\", \"content\": \"Can you tell me when is KubeCon + CloudNativeCon India 2024 scheduled?\"}], \"max_tokens\": 200, \"stream\": false}' | grep -o '\"content\":\"[^\"]*\"' \\\n", + "# | sed 's/\"content\":\"\\(.*\\)\"/\\1/'" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}