diff --git a/colabs/azure/azure_gpt_medical_notes.ipynb b/colabs/azure/azure_gpt_medical_notes.ipynb
new file mode 100644
index 00000000..ebdf038c
--- /dev/null
+++ b/colabs/azure/azure_gpt_medical_notes.ipynb
@@ -0,0 +1,376 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict, List, Literal, Optional, Tuple\n",
+    "\n",
+    "import instructor\n",
+    "import openai\n",
+    "import pandas as pd\n",
+    "import weave\n",
+    "from pydantic import BaseModel, Field\n",
+    "from set_env import set_env\n",
+    "import json\n",
+    "import asyncio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "set_env(\"OPENAI_API_KEY\")\n",
+    "set_env(\"WANDB_API_KEY\")\n",
+    "set_env(\"AZURE_OPENAI_ENDPOINT\")\n",
+    "set_env(\"AZURE_OPENAI_API_KEY\")\n",
+    "print(\"Env set\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utils.config import ENTITY, WEAVE_PROJECT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "weave.init(f\"{ENTITY}/{WEAVE_PROJECT}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "N_SAMPLES = 67"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = openai.OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_medical_data(url: str, num_samples: int = N_SAMPLES) -> Tuple[pd.DataFrame, pd.DataFrame]:\n",
+    "    \"\"\"\n",
+    "    Load medical data and split into train and test sets\n",
+    "\n",
+    "    Args:\n",
+    "        url: URL of the CSV file\n",
+    "        num_samples: Number of samples to load\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple of (train_df, test_df)\n",
+    "    \"\"\"\n",
+    "    df = pd.read_csv(url)\n",
+    "    df = df.sample(n=num_samples, random_state=42)  # Sample and shuffle data\n",
+    "\n",
+    "    # Split into 80% train, 20% test\n",
+    "    train_size = int(0.8 * len(df))\n",
+    "    train_df = df[:train_size]\n",
+    "    test_df = df[train_size:]\n",
+    "\n",
+    "    return train_df, test_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "medical_dataset_url = \"https://raw.githubusercontent.com/wyim/aci-bench/main/data/challenge_data/train.csv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_df, test_df = load_medical_data(medical_dataset_url)\n",
+    "train_samples = train_df.to_dict(\"records\")\n",
+    "test_samples = test_df.to_dict(\"records\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_samples[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_samples[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_to_jsonl(df: pd.DataFrame, output_file: str = \"medical_conversations.jsonl\"):\n",
+    "    \"\"\"\n",
+    "    Convert medical dataset to JSONL format with conversation structure\n",
+    "\n",
+    "    Args:\n",
+    "        df: DataFrame to convert\n",
+    "        output_file: Output JSONL filename\n",
+    "    \"\"\"\n",
+    "\n",
+    "    with open(output_file, 'w', encoding='utf-8') as f:\n",
+    "        for _, row in df.iterrows():\n",
+    "            # Create the conversation structure\n",
+    "            conversation = {\n",
+    "                \"messages\": [\n",
+    "                    {\n",
+    "                        \"role\": \"system\",\n",
+    "                        \"content\": \"You are a medical scribe assistant. Your task is to accurately document medical conversations between doctors and patients, creating detailed medical notes that capture all relevant clinical information.\"\n",
+    "                    },\n",
+    "                    {\n",
+    "                        \"role\": \"user\",\n",
+    "                        \"content\": row['dialogue']\n",
+    "                    },\n",
+    "                    {\n",
+    "                        \"role\": \"assistant\",\n",
+    "                        \"content\": row['note']\n",
+    "                    }\n",
+    "                ]\n",
+    "            }\n",
+    "\n",
+    "            # Write as JSON line\n",
+    "            json_line = json.dumps(conversation, ensure_ascii=False)\n",
+    "            f.write(json_line + '\\n')\n",
+    "\n",
+    "    print(f\"Converted {len(df)} records to {output_file}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "convert_to_jsonl(train_df, \"medical_conversations_train.jsonl\")\n",
+    "convert_to_jsonl(test_df, \"medical_conversations_test.jsonl\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utils.prompts import medical_task, medical_system_prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def format_dialogue(dialogue: str):\n",
+    "    dialogue = dialogue.replace(\"\\n\", \" \")\n",
+    "    transcript = f\"Dialogue: {dialogue}\"\n",
+    "    return transcript\n",
+    "\n",
+    "\n",
+    "@weave.op()\n",
+    "def process_medical_record(dialogue: str) -> Dict:\n",
+    "    transcript = format_dialogue(dialogue)\n",
+    "    prompt = medical_task.format(transcript=transcript)\n",
+    "\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=\"gpt-3.5-turbo\",\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": medical_system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": prompt},\n",
+    "        ],\n",
+    "    )\n",
+    "\n",
+    "    extracted_info = response.choices[0].message.content\n",
+    "\n",
+    "    return {\n",
+    "        \"input\": transcript,\n",
+    "        \"output\": extracted_info,\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the LLM scoring function\n",
+    "@weave.op()\n",
+    "async def medical_note_accuracy(note: str, output: dict) -> dict:\n",
+    "    scoring_prompt = \"\"\"Compare the generated medical note with the ground truth note and evaluate accuracy.\n",
+    "    Score as 1 if the generated note captures the key medical information accurately, 0 if not.\n",
+    "    Output in valid JSON format with just a \"score\" field.\n",
+    "\n",
+    "    Ground Truth Note:\n",
+    "    {ground_truth}\n",
+    "\n",
+    "    Generated Note:\n",
+    "    {generated}\"\"\"\n",
+    "\n",
+    "    prompt = scoring_prompt.format(\n",
+    "        ground_truth=note,\n",
+    "        generated=output['output']\n",
+    "    )\n",
+    "\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=\"gpt-4o\",\n",
+    "        messages=[{\"role\": \"user\", \"content\": prompt}],\n",
+    "        response_format={\"type\": \"json_object\"}\n",
+    "    )\n",
+    "    return json.loads(response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create evaluation for test samples\n",
+    "test_evaluation = weave.Evaluation(\n",
+    "    name='medical_record_extraction_test',\n",
+    "    dataset=test_samples,\n",
+    "    scorers=[medical_note_accuracy]\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+ "outputs": [], + "source": [ + "try:\n", + " in_jupyter = True\n", + "except ImportError:\n", + " in_jupyter = False\n", + "if in_jupyter:\n", + " import nest_asyncio\n", + "\n", + " nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_results = asyncio.run(test_evaluation.evaluate(process_medical_record))\n", + "print(f\"Completed test evaluation\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from openai import AzureOpenAI\n", + "\n", + "# Initialize Azure client\n", + "azure_client = AzureOpenAI(\n", + " azure_endpoint = os.getenv(\"AZURE_OPENAI_ENDPOINT\"), \n", + " api_key=os.getenv(\"AZURE_OPENAI_API_KEY\"), \n", + " api_version=\"2024-02-01\"\n", + ")\n", + "\n", + "@weave.op()\n", + "def process_medical_record_azure(dialogue: str) -> Dict:\n", + "\n", + " response = azure_client.chat.completions.create(\n", + " model=\"gpt-35-turbo-0125-ft-d30b3aee14864c29acd9ac54eb92457f\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a medical scribe assistant. Your task is to accurately document medical conversations between doctors and patients, creating detailed medical notes that capture all relevant clinical information.\"},\n", + " {\"role\": \"user\", \"content\": dialogue},\n", + " ],\n", + " )\n", + "\n", + " extracted_info = response.choices[0].message.content\n", + "\n", + " return {\n", + " \"input\": dialogue,\n", + " \"output\": extracted_info,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "test_results_azure = asyncio.run(test_evaluation.evaluate(process_medical_record_azure))\n", + "print(f\"Completed test evaluation\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}