diff --git a/notebooks/.gitignore b/notebooks/.gitignore
new file mode 100644
index 0000000..0f44316
--- /dev/null
+++ b/notebooks/.gitignore
@@ -0,0 +1,18 @@
+# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks
+# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks
+
+### JupyterNotebooks ###
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Remove previous ipynb_checkpoints
+#   git rm -r .ipynb_checkpoints/
+
+# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks
\ No newline at end of file
diff --git a/notebooks/doc-preprocessing-to-sdg/requirements.txt b/notebooks/doc-preprocessing-to-sdg/requirements.txt
deleted file mode 100644
index 3e31635..0000000
--- a/notebooks/doc-preprocessing-to-sdg/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-docling
-elyra
-jupyterlab
diff --git a/notebooks/doc-preprocessing-to-sdg/chunking.ipynb b/notebooks/instructlab-knowledge/chunking.ipynb
similarity index 99%
rename from notebooks/doc-preprocessing-to-sdg/chunking.ipynb
rename to notebooks/instructlab-knowledge/chunking.ipynb
index 3bb61d6..4229232 100644
--- a/notebooks/doc-preprocessing-to-sdg/chunking.ipynb
+++ b/notebooks/instructlab-knowledge/chunking.ipynb
@@ -5,12 +5,20 @@
    "id": "7b33678f-67d2-48a1-801f-302622e43e0f",
    "metadata": {},
    "source": [
-    "## Goal\n",
+    "## Chunking\n",
     "The goal of chunking for InstructLab SDG is to provide the teacher model small and logical pieces of the source document to generate data off of.\n",
     "\n",
     "In this notebook we are doing chunking with Docling[https://docling-project.github.io/docling/examples/hybrid_chunking/#hybrid-chunking].\n",
     "\n",
-    "First let's ensure docling is installed."
+    "The input to this notebook is a Docling JSON file created by a Docling conversion, or a directory of Docling JSON files."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9f268fd-35d2-4c7a-8cfa-47630de00837",
+   "metadata": {},
+   "source": [
+    "### Dependencies"
    ]
   },
   {
@@ -272,8 +280,7 @@
     "            c = dict(chunk=chunk, file=file.stem)\n",
     "            all_chunks.append(c)\n",
     "    except ConversionError as e:\n",
-    "        print(f\"Skipping file {file}\")\n",
-    "# print(all_chunks)"
+    "        print(f\"Skipping file {file}\")"
    ]
   },
   {
@@ -286,6 +293,16 @@
     "To view the chunks, run through the following cell. As you can see the document is broken into small pieces with metadata about the chunk based on the document's format"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "ff88cf5c-1315-4eca-afcd-25706eaf7d6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print(all_chunks)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "84826055-a7f1-4334-a12b-bbc07a523199",
@@ -293,7 +310,9 @@
     "tags": []
    },
    "source": [
-    "## Save the chunks to a text file each"
+    "## Save each chunk to a text file\n",
+    "\n",
+    "Each chunk is saved to an individual text file named `{docling-json-file-name}-{chunk #}.txt` (e.g. `mydoc-0.txt`). Having the chunks in this format is required as input to the create-sdg-seed-data notebook."
    ]
   },
   {
diff --git a/notebooks/doc-preprocessing-to-sdg/docling-conversion.ipynb b/notebooks/instructlab-knowledge/docling-conversion.ipynb
similarity index 100%
rename from notebooks/doc-preprocessing-to-sdg/docling-conversion.ipynb
rename to notebooks/instructlab-knowledge/docling-conversion.ipynb
diff --git a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb
new file mode 100644
index 0000000..954b64e
--- /dev/null
+++ b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb
@@ -0,0 +1,678 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "af99f876-0ffd-4079-aeb7-4cead05daaf4",
+   "metadata": {},
+   "source": [
+    "# 🐶 Data Pre-Processing: From source PDF to SDG-ready\n",
+    "\n",
+    "This notebook goes through each of the stages of data pre-processing. Directory-based conventions are used to save intermediate results as a PDF is converted and chunked, QA generation is performed to create a `qna.yaml` file, and finally everything is combined into the inputs for SDG.\n",
+    "\n",
+    "Once an SDG seed dataset is created, a user can run through an SDG notebook and generate samples.\n",
+    "\n",
+    "**NOTE**: Starting the notebook with Python 3.11 is recommended. Python 3.12 or later is not yet supported.\n",
+    "\n",
+    "1. [Document Conversion](#Document-Conversion)\n",
+    "1. [Chunking](#Chunking)\n",
+    "1. [Authoring](#Authoring)\n",
+    "1. [Create Seed Dataset](#Create-Seed-Dataset-for-SDG)\n",
+    "\n",
+    "***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0acd026f-65bd-4393-bb40-f8aa8bd6828b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "WORKSPACE_NAME = \"default\"\n",
+    "\n",
+    "WORKSPACE_ROOT = Path(\"workspaces\")\n",
+    "WORKSPACE_ROOT.mkdir(exist_ok=True)\n",
+    "\n",
+    "WORKSPACE_DIR = WORKSPACE_ROOT / WORKSPACE_NAME\n",
+    "WORKSPACE_DIR.mkdir(exist_ok=True)\n",
+    "\n",
+    "SOURCE_DOCUMENT = None # to process a specific document, set its path here; otherwise, the source documents directory will be used\n",
+    "SOURCE_DOCUMENT_DIR = WORKSPACE_DIR / \"source_documents\"\n",
+    "SOURCE_DOCUMENT_DIR.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "CONVERSION_OUTPUT_DIR = WORKSPACE_DIR / \"conversion\"\n",
+    "CONVERSION_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "CHUNKING_OUTPUT_DIR = WORKSPACE_DIR / \"chunking\"\n",
+    "CHUNKING_OUTPUT_DIR.mkdir(exist_ok=True)\n",
+    "\n",
+    "AUTHORING_OUTPUT_DIR = WORKSPACE_DIR / \"authoring\"\n",
+    "AUTHORING_OUTPUT_DIR.mkdir(exist_ok=True)\n",
+    "\n",
+    "SEED_EXAMPLE_INPUT_DIR = WORKSPACE_DIR / \"sdg_inputs\"\n",
+    "SEED_EXAMPLE_INPUT_DIR.mkdir(exist_ok=True)\n",
+    "\n",
+    "SEED_EXAMPLE_OUTPUT_DIR = WORKSPACE_DIR / \"seed_examples\"\n",
+    "SEED_EXAMPLE_OUTPUT_DIR.mkdir(exist_ok=True)\n",
+    "\n",
+    "SDG_OUTPUT_DIR = WORKSPACE_DIR / \"sdg\"\n",
+    "SDG_OUTPUT_DIR.mkdir(exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "344b7ac5-fc2a-40a8-8e1f-e8dd8b1153e7",
+   "metadata": {},
+   "source": [
+    "## Document Conversion\n",
+    "\n",
+    "This notebook uses [Docling](https://github.com/docling-project/docling) to convert any type of document into a Docling document: the representation of the document after conversion, which can be exported as JSON. The exported JSON can be inspected or reused later, for example by Docling's chunking methods in the next step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b91d4b2e-19cd-46e7-a912-ba9b2904c7cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -qq docling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f3804ef-4961-44b1-91c9-62929f422702",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = []\n",
+    "\n",
+    "if SOURCE_DOCUMENT:\n",
+    "    files.append(Path(SOURCE_DOCUMENT))\n",
+    "else:\n",
+    "    print(\"***** WARNING! Only one file is currently supported.\")\n",
+    "    files = list(SOURCE_DOCUMENT_DIR.rglob(\"*.pdf\"))\n",
+    "    print(f\"***** Using {files[0]}\")\n",
+    "\n",
+    "print(f\"Files to convert: {files}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "749fb64b-d089-4844-9330-7f3639819e7a",
+   "metadata": {},
+   "source": [
+    "Next we set the configuration options for our conversion pipeline. The PDF conversion options set here are the defaults. More information about pipeline configuration can be found in the Docling documentation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "157c5e02-edd1-44f6-b20f-f6b4bda1aae7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docling.document_converter import DocumentConverter, PdfFormatOption\n",
+    "from docling.datamodel.base_models import InputFormat\n",
+    "from docling.datamodel.pipeline_options import PdfPipelineOptions\n",
+    "\n",
+    "pipeline_options = PdfPipelineOptions()\n",
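+    "\n",
+    "# Commonly used PdfPipelineOptions fields, shown commented out with their usual defaults.\n",
+    "# Verify the exact names and defaults against your installed Docling version:\n",
+    "# pipeline_options.do_ocr = True               # OCR for scanned/bitmap content\n",
+    "# pipeline_options.do_table_structure = True   # recover table structure\n",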
+    "\n",
+    "doc_converter = DocumentConverter(\n",
+    "    format_options={\n",
+    "        InputFormat.PDF: PdfFormatOption(\n",
+    "            pipeline_options=pipeline_options\n",
+    "        )\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "73400c74-dead-4998-aee2-ddb00ddaa276",
+   "metadata": {},
+   "source": [
+    "Finally, we convert each document to Docling JSON, as long as it is a supported file type, and save the result for inspection."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a200039c-b8b2-4087-88ba-7bfb0e393cc9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "for file in files:\n",
+    "    doc = doc_converter.convert(source=file).document\n",
+    "    doc_dict = doc.export_to_dict()\n",
+    "\n",
+    "    json_output_path = CONVERSION_OUTPUT_DIR / f\"{file.stem}.json\"\n",
+    "    with open(json_output_path, \"w\") as f:\n",
+    "        json.dump(doc_dict, f)\n",
+    "        print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cafad55e-a4c0-4d6e-9da0-49519fa9bf74",
+   "metadata": {},
+   "source": [
+    "## Chunking\n",
+    "\n",
+    "The goal of chunking the converted documents is to give the teacher model small, logical pieces of the source document from which to generate data.\n",
+    "\n",
+    "In this notebook we are doing chunking with [Docling](https://docling-project.github.io/docling/examples/hybrid_chunking/#hybrid-chunking).\n",
+    "\n",
+    "The input to this step is the set of documents converted in the conversion step above."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2482060c-a49f-4345-aa47-d54301939387",
+   "metadata": {},
+   "source": [
+    "### Initialize the Chunker\n",
+    "\n",
+    "Docling provides two chunkers, the `HierarchicalChunker` and the `HybridChunker`.\n",
+    "The `HierarchicalChunker` creates chunks based on the hierarchy in the Docling document.\n",
+    "\n",
+    "The `HybridChunker` builds on the `HierarchicalChunker` by making it tokenization-aware.\n",
+    "\n",
+    "The `HybridChunker` has options for a `tokenizer`, the `max_tokens` in a chunk, and whether to merge undersized peer chunks. Uncomment the commented out code to configure these."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50df9d91-add4-46a1-a69d-0f7f9f69542e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer\n",
+    "#from transformers import AutoTokenizer\n",
+    "\n",
+    "from docling.chunking import HybridChunker\n",
+    "\n",
+    "#EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "#MAX_TOKENS = 1024\n",
+    "#\n",
+    "# tokenizer = HuggingFaceTokenizer(\n",
+    "#     tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),\n",
+    "#     max_tokens=MAX_TOKENS,  # optional, by default derived from `tokenizer` for HF case\n",
+    "#     merge_peers=True # \n",
+    "# )\n",
+    "\n",
+    "chunker = HybridChunker(\n",
+    "    #tokenizer=tokenizer,\n",
+    "    #merge_peers=True,  # whether to merge undersized chunks - defaults to True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "54ce1d6f-b8d3-470c-b3c9-675911f0ee92",
+   "metadata": {},
+   "source": [
+    "### Load and chunk the converted docling document\n",
+    "\n",
+    "Next, let's convert each document we want to chunk into a Docling document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db983c05-4aa6-4261-9283-2adab69bfbd3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_chunks = []\n",
+    "docs = []\n",
+    "for file in files:\n",
+    "    doc = DocumentConverter().convert(source=file)\n",
+    "    docs.append(doc)\n",
+    "    \n",
+    "    chunk_iter = chunker.chunk(dl_doc=doc.document)\n",
+    "    chunk_objs = list(chunk_iter)\n",
+    "    chunks = [chunker.contextualize(chunk=chunk) for chunk in chunk_objs]\n",
+    "\n",
+    "    print(f\"Extracted {len(chunks)} chunks from {doc.document.name}\")\n",
+    "    \n",
+    "    for chunk in chunks:\n",
+    "        c = dict(chunk=chunk, file=file.stem)\n",
+    "        all_chunks.append(c)\n",
+    "\n",
+    "# TODO: support multiple files; save all chunks to a single file for review"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0fb38545-eb84-4923-8fc4-d10ed08eab26",
+   "metadata": {},
+   "source": [
+    "### View the Chunks\n",
+    "\n",
+    "To view the chunks, run the following cell. As you can see, the document is broken into small pieces, with metadata about each chunk based on the document's format."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fdf34c7-9829-43d2-bf9f-7d1d55bb6a4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#print(all_chunks)\n",
+    "print(chunks[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42c4160f-7508-4c72-b28d-b56aa4975b26",
+   "metadata": {},
+   "source": [
+    "### Save each chunk to a text file\n",
+    "\n",
+    "Each chunk is saved to an individual text file named `{docling-json-file-name}-{chunk #}.txt` (e.g. `mydoc-0.txt`). Having the chunks in this format is required as input to the Create Seed Dataset step below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e70d576-a2bc-4274-b660-1cbe051968b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i, chunk in enumerate(all_chunks):\n",
+    "    chunk_path = CHUNKING_OUTPUT_DIR / f\"{chunk['file']}-{i}.txt\"\n",
+    "    with open(chunk_path, \"w\") as file:\n",
+    "        file.write(chunk[\"chunk\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a510f8c7-8cd3-4867-8742-9f4f9cda9e9f",
+   "metadata": {},
+   "source": [
+    "## Authoring\n",
+    "\n",
+    "In this stage we generate question-and-answer pairs for a subset of the chunks and assemble them into a `qna.yaml` file that you can review and revise."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "86c48e52-cda7-48ac-84dc-0b844aed5f98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -qq docling-sdg\n",
+    "\n",
+    "# TODO: replace with above after https://github.com/docling-project/docling-sdg/pull/31 merges\n",
+    "#!pip install -qq git+https://github.com/anastasds/docling-sdg@d15de2c5a81bfe166f66f412fc4b23728065f396"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a165c38-843b-4c89-a8ad-6195b998e284",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docling_sdg.qa.utils import get_qa_chunks\n",
+    "\n",
+    "filters = [\n",
+    "    lambda chunk: len(str(chunk.text)) > 500\n",
+    "]\n",
+    "\n",
+    "dataset = {}\n",
+    "for doc in docs:\n",
+    "    print(f\"Chunking and filtering document {doc.document.name}\")\n",
+    "\n",
+    "    chunks = list(chunker.chunk(dl_doc=doc.document))\n",
+    "    qa_chunks = list(get_qa_chunks(doc.document.name, chunks, filters))\n",
+    "    dataset[doc.document.name] = qa_chunks\n",
+    "    \n",
+    "    print(f\"Created dataset {doc.document.name} with {len(qa_chunks)} QA chunks\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d65ec755-e3de-40ab-bf3a-23ebb29a705d",
+   "metadata": {},
+   "source": [
+    "### Initialize QA generator, supplying details for which model to use\n",
+    "\n",
+    "`GenerateOptions` controls which model is used for QA generation by setting `generate_options.provider` below. Three options are available:\n",
+    "\n",
+    "* `LlmProvider.WATSONX` for watsonx\n",
+    "* `LlmProvider.OPENAI` for OpenAI\n",
+    "* `LlmProvider.OPENAI_LIKE` for any model provider with an OpenAI-compatible API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b702267e-f550-4bc2-bce4-c0fcecbbd292",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docling_sdg.qa.generate import Generator\n",
+    "from docling_sdg.qa.base import GenerateOptions, LlmProvider\n",
+    "from pydantic import SecretStr\n",
+    "\n",
+    "API_KEY = \"your-api-key-here\"   # set to any placeholder string if no API key is needed, such as when using a local Ollama server\n",
+    "\n",
+    "generate_options = GenerateOptions(api_key=API_KEY, project_id=\"project_id\")\n",
+    "generate_options.provider = LlmProvider.OPENAI_LIKE\n",
+    "generate_options.url = SecretStr(API_KEY)\n",
+    "generate_options.api_base = \"http://127.0.0.1:11434/v1\"    # Ollama's default local endpoint\n",
+    "generate_options.model_id = \"granite3.3\""
+   ]
+  },
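+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "provider-switch-sketch",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch: the same options object can target the other providers listed above.\n",
+    "# Only fields already used in this notebook (provider, project_id, model_id) are set;\n",
+    "# the string values are placeholders, not real IDs.\n",
+    "USE_WATSONX = False  # flip to True to target watsonx instead of an OpenAI-compatible endpoint\n",
+    "\n",
+    "if USE_WATSONX:\n",
+    "    generate_options.provider = LlmProvider.WATSONX\n",
+    "    generate_options.project_id = \"your-watsonx-project-id\"  # placeholder\n",
+    "    generate_options.model_id = \"your-watsonx-model-id\"      # placeholder"
+   ]
+  },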
+  {
+   "cell_type": "markdown",
+   "id": "919199c0-3747-409a-85ab-0155ef3ebe9d",
+   "metadata": {},
+   "source": [
+    "### Configure subset selection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1197d4e-8354-45e3-9ec9-85c78ba36548",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NUM_CHUNKS_TO_SELECT_FOR_AUTHORING = 5"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d2421d07-3e6c-4355-95f4-da8e157557c7",
+   "metadata": {},
+   "source": [
+    "### Run QA generation on selected chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e57edff5-9a13-47fb-9248-9140ae5baaca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random #TODO: replace random sampling with subset selection\n",
+    "\n",
+    "for doc, chunks in dataset.items(): # TODO: multiple file support\n",
+    "    generate_options.generated_file = AUTHORING_OUTPUT_DIR / f\"qagen-{doc}.json\" \n",
+    "    gen = Generator(generate_options=generate_options)\n",
+    "    \n",
+    "    print(f\"Processing chunks that look like:\\n{chunks[0].text}\")\n",
+    "    selected_chunks = random.sample(chunks, min(NUM_CHUNKS_TO_SELECT_FOR_AUTHORING, len(chunks)))\n",
+    "    print(f\"Selected {len(selected_chunks)} contexts\")\n",
+    "\n",
+    "    Path.unlink(generate_options.generated_file, missing_ok=True)\n",
+    "    results = gen.generate_from_chunks(selected_chunks) # automatically saves to file\n",
+    "    \n",
+    "    print(f\"{doc}: {results.status}\")\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea64b8f0-dd6c-4776-8646-9731433f909b",
+   "metadata": {},
+   "source": [
+    "### Read generated QAs and restructure"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9df2c533-30d7-4c30-9907-7c5655fd2246",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import yaml\n",
+    "from textwrap import wrap\n",
+    "\n",
+    "qnas = {}\n",
+    "chunk_id_to_text = {}\n",
+    "with open(generate_options.generated_file, \"rt\") as f:\n",
+    "    for line in f.readlines():\n",
+    "        entry = json.loads(line)\n",
+    "        chunk_id = entry['chunk_id']\n",
+    "        if chunk_id not in chunk_id_to_text:\n",
+    "            chunk_id_to_text[chunk_id] = entry['context']\n",
+    "        if chunk_id not in qnas:\n",
+    "            qnas[chunk_id] = []\n",
+    "        qnas[chunk_id].append({'question': entry['question'], 'answer': entry['answer']})\n",
+    "\n",
+    "print(f\"Generated QA pairs for {len(qnas)} contexts\")\n",
+    "print(list(qnas.values())[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b6d6c26-f4d5-420d-ae78-ac28cf39efd3",
+   "metadata": {},
+   "source": [
+    "### Define metadata for qna.yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7130e90-2b65-4008-86f7-194da74a9523",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DOCUMENT_OUTLINE = \"A Probabilistic Inference Approach to Inference-Time Scaling of LLMs using Particle-Based Monte Carlo Methods\"\n",
+    "DOMAIN = \"artificial intelligence\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dafa8927-e56c-448b-b88b-f8d854c25d4d",
+   "metadata": {},
+   "source": [
+    "### Output qna.yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7f26460-737f-4940-b58a-ef6caea313d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qna_output_path = AUTHORING_OUTPUT_DIR / \"qna.yaml\"\n",
+    "\n",
+    "# The following creates a data structure for outputting in the expected format for qna.yaml\n",
+    "# TODO: extract into utils library\n",
+    "\n",
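+    "# Render multiline or long strings as YAML block scalars (style '|'), wrapping long lines at 80 characters\n",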
+    "def str_presenter(dumper, data):\n",
+    "    if len(data.splitlines()) > 1:  # check for multiline string\n",
+    "        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')\n",
+    "    elif len(data) > 80:\n",
+    "        data = \"\\n\".join(wrap(data, 80))\n",
+    "        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')\n",
+    "    return dumper.represent_scalar('tag:yaml.org,2002:str', data)\n",
+    "\n",
+    "yaml.add_representer(str, str_presenter)\n",
+    "\n",
+    "# to use with safe_dump:\n",
+    "yaml.representer.SafeRepresenter.add_representer(str, str_presenter)\n",
+    "\n",
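+    "# Dumper subclass that indents nested block sequences for readability\n",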
+    "class IndentedDumper(yaml.Dumper):\n",
+    "    def increase_indent(self, flow=False, indentless=False):\n",
+    "        return super(IndentedDumper, self).increase_indent(flow, False)\n",
+    "\n",
+    "data = {'seed_examples': []}\n",
+    "for chunk_id, context in chunk_id_to_text.items():\n",
+    "    data['seed_examples'].append({\n",
+    "        'context': context,\n",
+    "        'questions_and_answers': [\n",
+    "            {\n",
+    "                'question': example['question'],\n",
+    "                'answer': example['answer'],\n",
+    "            } for example in qnas[chunk_id]\n",
+    "        ]\n",
+    "    })\n",
+    "\n",
+    "data['document_outline'] = DOCUMENT_OUTLINE\n",
+    "data['domain'] = DOMAIN\n",
+    "\n",
+    "qna_output_path.unlink(missing_ok=True)  # remove any stale qna.yaml from a previous run\n",
+    "with open(qna_output_path, 'w') as yaml_file:\n",
+    "    yaml.dump(data, yaml_file, Dumper=IndentedDumper, default_flow_style=False, sort_keys=False, width=80)\n",
+    "\n",
+    "print(f\"qna.yaml saved to: {qna_output_path}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed9ea149-844b-4330-90ec-d0ca7ab12b90",
+   "metadata": {},
+   "source": [
+    "### View generated qna.yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1293d445-b826-4b92-ad20-9b121ac60e20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(qna_output_path) as yaml_file:\n",
+    "    print(yaml_file.read())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6c574f96-5860-48b9-b4ac-01d367c7717b",
+   "metadata": {},
+   "source": [
+    "### Revise QAs\n",
+    "\n",
+    "Open the generated `qna.yaml` in your preferred text editor to check the quality of the generated questions and answers. If the generation step failed to produce three question-and-answer pairs for each of five contexts, add your own until that required number of QA pairs is reached."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1f101076-a50f-49ea-a83b-46eaa8b39cc4",
+   "metadata": {},
+   "source": [
+    "## Create Seed Dataset for SDG\n",
+    "\n",
+    "This step combines the contents of the `qna.yaml` and the chunks from the source document to create a seed dataset for the synthetic data generation process.\n",
+    "\n",
+    "This step needs a directory that contains N chunk files named `{original-file-name}-{N}.txt` along with a `qna.yaml` in the same directory.\n",
+    "\n",
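+    "For example (hypothetical file names following the convention above), the input directory might contain:\n",
+    "\n",
+    "```\n",
+    "sdg_inputs/\n",
+    "├── mydoc-0.txt\n",
+    "├── mydoc-1.txt\n",
+    "├── mydoc-2.txt\n",
+    "└── qna.yaml\n",
+    "```\n",
+    "\n",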
+    "This step writes a `seed_data.jsonl` file to the `SEED_EXAMPLE_OUTPUT_DIR` set at the top of this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2c6e31b-e8a9-406c-b2dc-27433c8fd8ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -qq datasets transformers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab2c9ed2-8ba8-4959-8e01-81625b81d286",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import shutil\n",
+    "\n",
+    "from utils.create_seed_dataset import get_seed_dataset\n",
+    "\n",
+    "src_files = os.listdir(CHUNKING_OUTPUT_DIR)\n",
+    "\n",
+    "for file_name in src_files:\n",
+    "    full_file_name = os.path.join(CHUNKING_OUTPUT_DIR, file_name)\n",
+    "    if os.path.isfile(full_file_name):\n",
+    "        shutil.copy(full_file_name, SEED_EXAMPLE_INPUT_DIR)\n",
+    "\n",
+    "shutil.copy(qna_output_path, SEED_EXAMPLE_INPUT_DIR)\n",
+    "\n",
+    "seed_data = get_seed_dataset(SEED_EXAMPLE_INPUT_DIR)\n",
+    "output_path = f'{SEED_EXAMPLE_OUTPUT_DIR}/seed_data.jsonl'\n",
+    "seed_data.to_json(output_path, orient='records', lines=True)\n",
+    "\n",
+    "print(f\"Generated {seed_data.data.num_rows} rows\")\n",
+    "print(f\"Results saved to: {output_path}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "50ff36f4-19fc-4a27-b51a-3688e7b630e4",
+   "metadata": {},
+   "source": [
+    "### Inspect the generated data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6936825-31c1-4b46-a1af-2fb46f50158d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(seed_data.data.table.slice(length=1))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "24a8fcdb-8035-4f30-b856-46afe9f928a1",
+   "metadata": {},
+   "source": [
+    "# Summary\n",
+    "\n",
+    "To recap, given a source document in PDF format, this notebook:\n",
+    "\n",
+    "1. Converted the document using Docling and saved it to JSON for inspection\n",
+    "2. Split the extracted text into chunks\n",
+    "3. Generated QA pairs for a subset of those chunks\n",
+    "4. Created a `qna.yaml` available for inspection and revision\n",
+    "5. Combined the chunks and `qna.yaml` to create a `seed_data.jsonl` for use with SDG\n",
+    "\n",
+    "The next step is to use the resulting `seed_data.jsonl` for SDG, as illustrated in [this notebook](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/examples/instructlab/knowledge/knowledge_generation_and_mixing.ipynb)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/instructlab-knowledge/utils/create_seed_dataset.py b/notebooks/instructlab-knowledge/utils/create_seed_dataset.py
new file mode 100644
index 0000000..f1c8da0
--- /dev/null
+++ b/notebooks/instructlab-knowledge/utils/create_seed_dataset.py
@@ -0,0 +1,182 @@
+# Standard
+from pathlib import Path
+import json
+import re
+from typing import List, Dict
+
+# Third Party
+from datasets import Dataset, concatenate_datasets
+from transformers import AutoTokenizer
+import yaml
+
+def get_seed_dataset(path: str) -> Dataset:
+    """
+    Creates a seed dataset from a path
+    Args:
+        path (str):   Path to directory of qna.yaml and chunks
+    Returns:
+        ds (Dataset): Hugging Face Dataset used to create a jsonl of seed
+                      data for the knowledge generation pipeline in SDG.
+    """
+    valid_path = is_dir_valid(path)
+    ds = create_dataset_from_dir(valid_path)
+
+    return ds
+
+def is_dir_valid(path: str) -> Path:
+    """
+    Validates that a directory contains a qna.yaml and one or more .txt chunk files
+    Args:
+        path (str):       Path to directory of qna.yaml and chunks
+    Returns:
+        base_path (Path): pathlib.Path to the validated directory from which a
+                          jsonl of seed data can be created
+    """
+    base_path = Path(path)
+    if not base_path.is_dir():
+        raise ValueError("Base path must be a directory")
+
+    files = list(base_path.iterdir())
+    has_qna = any(f.name == 'qna.yaml' for f in files)
+    has_txt = any(f.suffix == '.txt' for f in files)
+    if not has_qna or not has_txt:
+        raise ValueError("Directory does not contain a qna.yaml and chunks")
+
+    return base_path
+
+def read_chunks(path: Path) -> Dict[str, str]:
+    """
+    Returns a dictionary with all of the .txt chunks in a directory
+    The chunks may originate from one or more different files
+    Args:
+        path (Path): Path to directory of chunks
+    Returns:
+        chunks_dict (Dict[str, List[str]]): Dictionary keyed by the original file
+                                            name, with that file's chunks as the value
+    """
+    chunk_files = path.glob('*.txt')
+
+    chunks_dict = {}
+    for file in chunk_files:
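+        # Chunk file names look like "<original-name>-<N>.txt" (or "_<N>"); capture the original name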
+        match = re.match(r"^(.*?)[-_]\d+\.txt$", file.name)
+        if match:
+            orig_filename = match.group(1)
+
+            with file.open('r', encoding='utf-8') as f:
+                chunk = f.read()
+
+            if orig_filename not in chunks_dict:
+                chunks_dict[orig_filename] = []
+            chunks_dict[orig_filename].append(chunk)
+
+        else:
+            print(f"Ignoring .txt file {file}: file name does not match the expected format")
+
+    return chunks_dict
+
+def create_dataset_from_dir(path: Path) -> Dataset:
+    """
+    Process a directory with chunks and a qna.yaml and return a dataset.
+    Args:
+        path (Path): Path to directory of chunks and qna.yaml.
+    Returns:
+        Dataset: Dataset object.
+    """
+
+    qna_yaml_path = path / "qna.yaml"
+
+    with open(qna_yaml_path, 'r') as f:
+        qna_yaml = yaml.safe_load(f)
+
+    # Check for required fields
+    if not all(key in qna_yaml for key in ['document_outline', 'domain', 'seed_examples']):
+        raise ValueError("qna.yaml file is missing document_outline, domain, or seed_examples fields")
+
+    chunks_dict = read_chunks(path)
+    
+    datasets = []
+    for filename, chunks in chunks_dict.items():
+        chunk_ds = Dataset.from_dict(
+            {
+                "document": chunks,
+                "document_outline": [qna_yaml["document_outline"]] * len(chunks),
+                "document_title": [filename] * len(chunks),  # TODO: is this really a necessary field?
+                "domain": [qna_yaml["domain"]] * len(chunks),
+            }
+        )
+        chunk_ds_with_icls = add_icls(qna_yaml, chunk_ds)
+        datasets.append(chunk_ds_with_icls)
+
+    return safe_concatenate_datasets(datasets)
+
+def safe_concatenate_datasets(datasets: list[Dataset]) -> Dataset:
+    """
+    Concatenate datasets safely, ignoring any datasets that are None or empty.
+    Args:
+        datasets (list[Dataset]): List of Dataset objects to concatenate.
+    Returns:
+        Dataset: Dataset object with concatenated datasets.
+    """
+    filtered_datasets = [ds for ds in datasets if ds is not None and ds.num_rows > 0]
+
+    if not filtered_datasets:
+        return None
+
+    return concatenate_datasets(filtered_datasets)
+
+def get_token_count(text, tokenizer):
+    return len(tokenizer.tokenize(text))
+
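+# Hedged stand-in for chunk_document(), which add_icls() below calls but which is not
+# defined or imported elsewhere in this module: a minimal word-count-based splitter,
+# not the chunking helper from any external SDG library.
+def chunk_document(documents: List[str], server_ctx_size: int, chunk_word_count: int) -> List[str]:
+    """
+    Split documents into chunks of at most chunk_word_count words.
+    Args:
+        documents (List[str]):  Texts to split.
+        server_ctx_size (int):  Context-size hint; unused by this simple implementation.
+        chunk_word_count (int): Maximum number of words per chunk.
+    Returns:
+        List[str]: Word-bounded chunks covering the input documents.
+    """
+    chunks = []
+    for text in documents:
+        words = text.split()
+        for start in range(0, len(words), chunk_word_count):
+            chunks.append(" ".join(words[start : start + chunk_word_count]))
+    return chunks
+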
+def add_icls(qna_yaml: Dict[str, str], chunked_document: Dataset) -> Dataset:
+    """
+    Add the in-context learning (ICL) examples from the qna.yaml to the dataset.
+    Args:
+        qna_yaml (Dict): Object representing the qna.yaml file.
+        chunked_document (Dataset): Dataset of document chunks.
+    Returns:
+        Dataset: Dataset object with ICL example columns added.
+    """
+    # TODO: make the tokenizer configurable at some level
+    tokenizer = AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")
+    icl = qna_yaml["seed_examples"]
+    chunked_document_all_icl = []
+    for icl_ in icl:
+        chunked_document_all_icl.append(
+            chunked_document.map(
+                lambda x: {
+                    "icl_document": icl_["context"],
+                    "icl_query_1": icl_["questions_and_answers"][0]["question"],
+                    "icl_response_1": icl_["questions_and_answers"][0]["answer"],
+                    "icl_query_2": icl_["questions_and_answers"][1]["question"],
+                    "icl_response_2": icl_["questions_and_answers"][1]["answer"],
+                    "icl_query_3": icl_["questions_and_answers"][2]["question"],
+                    "icl_response_3": icl_["questions_and_answers"][2]["answer"],
+                }
+            )
+        )
+    chunked_document_all_icl = safe_concatenate_datasets(chunked_document_all_icl)
+    chunked_document_all_icl = chunked_document_all_icl.map(
+        lambda x: {
+            "chunks": chunk_document(
+                [x["document"]], server_ctx_size=4096, chunk_word_count=1024
+            )
+            if get_token_count(x["document"], tokenizer) > 1024
+            else [x["document"]]
+        }
+    )
+    df = chunked_document_all_icl.to_pandas()
+    df_exploded = df.explode("chunks").reset_index(drop=True)
+    new_ds = Dataset.from_pandas(df_exploded)
+    new_ds = new_ds.remove_columns("document").rename_columns(
+        {"chunks": "document"}
+    )
+
+    # Only keep documents with more than 100 tokens
+    new_ds = new_ds.filter(
+        lambda x: get_token_count(x["document"], tokenizer) > 100
+    )
+    return new_ds
diff --git a/notebooks/instructlab-knowledge/workspaces/default/source_documents/2502.01618v3.pdf b/notebooks/instructlab-knowledge/workspaces/default/source_documents/2502.01618v3.pdf
new file mode 100644
index 0000000..f0ccc3d
Binary files /dev/null and b/notebooks/instructlab-knowledge/workspaces/default/source_documents/2502.01618v3.pdf differ