Skip to content

Commit 3ffc30a

Browse files
committed
Initial commit
Signed-off-by: Anastas Stoyanovsky <[email protected]>
1 parent 8199513 commit 3ffc30a

File tree

5 files changed

+392
-0
lines changed

5 files changed

+392
-0
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 5
6+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 5
6+
}
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 12,
6+
"id": "e98f3b99-5996-47a2-ac5c-b6ed9ca19c1c",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"%pip install -qq llama_index llama-index-llms-openai-like"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 42,
16+
"id": "33c2a1c7-aa33-4261-8e0f-857377b97409",
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"from llama_index.llms.openai_like import OpenAILike\n",
21+
"\n",
22+
"# client = OpenAI()  # FIXME: OpenAI is not imported and `client` is unused — remove or import it\n",
23+
"\n",
24+
"llm = OpenAILike(\n",
25+
" model=\"granite3.3\",\n",
26+
" api_base=\"http://localhost:11434/v1\",\n",
27+
" temperature=0,\n",
28+
")\n",
29+
"\n",
30+
"llm = OpenAILike(\n",
31+
" model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
32+
" api_base=\"https://mixtral-8x7b-instruct-v0-1-maas-apicast-production.apps.prod.rhoai.rh-aiservices-bu.com:443/v1\",\n",
33+
"    api_key=\"REDACTED\",  # SECURITY: never commit real API keys — rotate this key and load from an env var; comma was also missing here (SyntaxError)\n",
34+
" temperature=0,\n",
35+
")\n",
36+
" "
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": 47,
42+
"id": "faef8d7a-aa25-46a9-9598-7413e149d4bc",
43+
"metadata": {},
44+
"outputs": [
45+
{
46+
"name": "stdout",
47+
"output_type": "stream",
48+
"text": [
49+
"{\"fact_single\": \"What type of document can be verified for authenticity using the given method?\", \"fact_single_answer\": \"Government-issued photo identification documents\", \"summary\": \"What aspects should be checked to ensure a government-issued photo identification document is valid and current?\", \"summary_answer\": \"The original physical document's characteristics, security features, and its status in relation to alterations or expiration\", \"reasoning\": \"Why is it important to verify the authenticity, validity, and currency of a government-issued photo identification document in person with the person being identified?\"}\n"
50+
]
51+
}
52+
],
53+
"source": [
54+
"from llama_index.core.prompts import PromptTemplate\n",
55+
"\n",
56+
"context_str = \"\"\"You can determine whether a government-issued photo identification\n",
57+
"document is authentic, valid and current by viewing it in person, and by\n",
58+
"looking at the characteristics of the original physical document and its\n",
59+
"security features (or markers, as applicable) in the presence of the person\n",
60+
"being identified. This will allow you to be satisfied that the identification\n",
61+
"document is authentic, as issued by the competent authority (federal,\n",
62+
"provincial, or territorial government), valid (unaltered, not counterfeit) and\n",
63+
"current (not expired).\"\"\"\n",
64+
"\n",
65+
"prompt_template = (\n",
66+
" \"I will provide you a text passage. I need you to generate three questions that \"\n",
67+
" \"must be answered only with information contained in this passage, and nothing \"\n",
68+
" \"else.\\n\"\n",
69+
" 'The first question is of type \"fact_single\", which means that the answer to this '\n",
70+
" \"question is a simple, single piece of factual information contained in the \"\n",
71+
" \"context.\\n\"\n",
72+
" 'The second question is of type \"summary\", which means that the answer to this '\n",
73+
" \"question summarizes different pieces of factual information contained in the \"\n",
74+
" \"context.\\n\"\n",
75+
" 'The third question is of type \"reasoning\", which is a question that requires the '\n",
76+
" \"reader to think critically and make an inference or draw a conclusion based on \"\n",
77+
" \"the information provided in the passage.\\n\"\n",
78+
" \"Make sure that the three questions are different.\\n\"\n",
79+
" \"\\n\"\n",
80+
" \"You will format your generation as a python dictionary, such as:\\n\"\n",
81+
" '{\"fact_single\": <The \"fact_single\" type question you thought of>, '\n",
82+
"    '\"fact_single_answer\": <Answer to the \"fact_single\" question>, \"summary\": <the '\n",
83+
" '\"summary\" type question you thought of>, \"summary_answer\": <Answer to the '\n",
84+
" '\"summary\" question>, \"reasoning\": <the \"reasoning\" type question you thought '\n",
85+
" 'of>, \"reasoning_answer\": <Answer to the \"reasoning\" question>}\\n'\n",
86+
" \"\\n\"\n",
87+
" \"Only provide the python dictionary as your output.\\n\"\n",
88+
" \"\\n\"\n",
89+
" \"Context: {context_str}\"\n",
90+
")\n",
91+
"\n",
92+
"prompt = PromptTemplate(prompt_template).format(context_str=context_str)  # customization_statement is not defined yet at this point and this template has no placeholder for it\n",
93+
"\n",
94+
"#print(prompt)\n",
95+
"response = llm.complete(prompt)\n",
96+
"print(str(response))"
97+
]
98+
},
99+
{
100+
"cell_type": "code",
101+
"execution_count": 51,
102+
"id": "98fcd2b4-498a-4af2-bdce-88b3d6dd21d0",
103+
"metadata": {},
104+
"outputs": [
105+
{
106+
"name": "stdout",
107+
"output_type": "stream",
108+
"text": [
109+
"{\"fact_single\": \"What must I do to check if a government ID is real?\", \"fact_single_answer\": \"You need to view the ID in person and examine its characteristics and security features.\", \"summary\": \"How can I ensure an ID isn't fake or expired?\", \"summary_answer\": \"Check for original physical document traits, unaltered state, and that it's not past its expiration date.\", \"reasoning\": \"If I see these traits and features, can I be sure the ID is legitimate and still in use?\" , \"reasoning_answer\": \"Yes, by verifying the document's authenticity through its characteristics and security features in the person's presence, you can confirm it's a valid, current, and genuine government-issued ID.\"}\n"
110+
]
111+
}
112+
],
113+
"source": [
114+
"domain = \"banking\"\n",
115+
"audience = \"junior internal employees\"\n",
116+
"use_case = \"chatbot\"\n",
117+
"\n",
118+
"customization_statement = f\"\"\"\n",
119+
"Stylize the generated questions and answers in the format that {audience} for a {domain} organization might write when using a {use_case}.\n",
120+
"Write at the fifth grade level. Prefer casual style and the first or second person.\n",
121+
"\"\"\"\n",
122+
"\n",
123+
"# customization_statement = f\"\"\"\n",
124+
"# Stylize the generated questions and answers in the format that {audience} for a {domain} organization might write when using a {use_case}.\n",
125+
"# Adjust the writing style and level of sophistication accordingly. Prefer casual style and the first or second person.\n",
126+
"# \"\"\"\n",
127+
"\n",
128+
"prompt_template = (\n",
129+
" \"I will provide you a text passage. I need you to generate three questions that \"\n",
130+
" \"must be answered only with information contained in this passage, and nothing \"\n",
131+
" \"else.\\n\"\n",
132+
" 'The first question is of type \"fact_single\", which means that the answer to this '\n",
133+
" \"question is a simple, single piece of factual information contained in the \"\n",
134+
" \"context.\\n\"\n",
135+
" 'The second question is of type \"summary\", which means that the answer to this '\n",
136+
" \"question summarizes different pieces of factual information contained in the \"\n",
137+
" \"context.\\n\"\n",
138+
" 'The third question is of type \"reasoning\", which is a question that requires the '\n",
139+
" \"reader to think critically and make an inference or draw a conclusion based on \"\n",
140+
" \"the information provided in the passage.\\n\"\n",
141+
" \"Make sure that the three questions are different.\\n\"\n",
142+
" \"\\n\"\n",
143+
" \"You will format your generation as a python dictionary, such as:\\n\"\n",
144+
" '{\"fact_single\": <The \"fact_single\" type question you thought of>, '\n",
145+
"    '\"fact_single_answer\": <Answer to the \"fact_single\" question>, \"summary\": <the '\n",
146+
" '\"summary\" type question you thought of>, \"summary_answer\": <Answer to the '\n",
147+
" '\"summary\" question>, \"reasoning\": <the \"reasoning\" type question you thought '\n",
148+
" 'of>, \"reasoning_answer\": <Answer to the \"reasoning\" question>}\\n'\n",
149+
" \"\\n\"\n",
150+
" \"Only provide the python dictionary as your output. Make sure you provide an answer for each question.\\n\"\n",
151+
" \"\\n\"\n",
152+
" \"{customization_statement}\\n\"\n",
153+
" \"\\n\"\n",
154+
" \"Context: {context_str}\"\n",
155+
")\n",
156+
"\n",
157+
"prompt = PromptTemplate(prompt_template).format(context_str=context_str, customization_statement=customization_statement)\n",
158+
"\n",
159+
"#print(prompt)\n",
160+
"response = llm.complete(prompt)\n",
161+
"print(str(response))"
162+
]
163+
}
164+
],
165+
"metadata": {
166+
"kernelspec": {
167+
"display_name": "Python 3 (ipykernel)",
168+
"language": "python",
169+
"name": "python3"
170+
},
171+
"language_info": {
172+
"codemirror_mode": {
173+
"name": "ipython",
174+
"version": 3
175+
},
176+
"file_extension": ".py",
177+
"mimetype": "text/x-python",
178+
"name": "python",
179+
"nbconvert_exporter": "python",
180+
"pygments_lexer": "ipython3",
181+
"version": "3.11.9"
182+
}
183+
},
184+
"nbformat": 4,
185+
"nbformat_minor": 5
186+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "5da70a08-1895-4d1f-8f50-93e2134b2e23",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"# design goals:\n",
11+
"#\n",
12+
"# - understandability\n",
13+
"# - modularity\n",
14+
"# - configurability"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": null,
20+
"id": "0acd026f-65bd-4393-bb40-f8aa8bd6828b",
21+
"metadata": {},
22+
"outputs": [],
23+
"source": [
24+
"# config\n",
25+
"\n",
26+
"WORKSPACE_DIR=\"workspaces/default\"\n",
27+
"# replace with pathlib\n",
28+
"\n",
29+
"# mkdir etc"
30+
]
31+
},
32+
{
33+
"cell_type": "code",
34+
"execution_count": null,
35+
"id": "012dbcdf-93d7-474f-9826-77866ceb815e",
36+
"metadata": {},
37+
"outputs": [],
38+
"source": [
39+
"# conversion\n",
40+
"\n",
41+
"CONVERSION_OUTPUT_DIR = f\"{WORKSPACE_DIR}/conversion\"\n",
42+
"# replace with pathlib"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": null,
48+
"id": "be9bbcbd-9bdd-474e-9d44-5c9d5a2c03e5",
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"# chunking"
53+
]
54+
},
55+
{
56+
"cell_type": "code",
57+
"execution_count": null,
58+
"id": "2a165c38-843b-4c89-a8ad-6195b998e284",
59+
"metadata": {},
60+
"outputs": [],
61+
"source": [
62+
"# authoring"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": null,
68+
"id": "5149dbbf-5601-4aa5-b1e9-e454ea0a4529",
69+
"metadata": {},
70+
"outputs": [],
71+
"source": [
72+
"# sdg"
73+
]
74+
}
75+
],
76+
"metadata": {
77+
"kernelspec": {
78+
"display_name": "Python 3 (ipykernel)",
79+
"language": "python",
80+
"name": "python3"
81+
},
82+
"language_info": {
83+
"codemirror_mode": {
84+
"name": "ipython",
85+
"version": 3
86+
},
87+
"file_extension": ".py",
88+
"mimetype": "text/x-python",
89+
"name": "python",
90+
"nbconvert_exporter": "python",
91+
"pygments_lexer": "ipython3",
92+
"version": "3.11.9"
93+
}
94+
},
95+
"nbformat": 4,
96+
"nbformat_minor": 5
97+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "5da70a08-1895-4d1f-8f50-93e2134b2e23",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"# design goals:\n",
11+
"#\n",
12+
"# - understandability\n",
13+
"# - modularity\n",
14+
"# - configurability"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": null,
20+
"id": "0acd026f-65bd-4393-bb40-f8aa8bd6828b",
21+
"metadata": {},
22+
"outputs": [],
23+
"source": [
24+
"# config\n",
25+
"\n",
26+
"WORKSPACE_DIR=\"workspaces/default\"\n",
27+
"# replace with pathlib\n",
28+
"\n",
29+
"# mkdir etc"
30+
]
31+
},
32+
{
33+
"cell_type": "code",
34+
"execution_count": null,
35+
"id": "012dbcdf-93d7-474f-9826-77866ceb815e",
36+
"metadata": {},
37+
"outputs": [],
38+
"source": [
39+
"# conversion\n",
40+
"\n",
41+
"CONVERSION_OUTPUT_DIR = f\"{WORKSPACE_DIR}/conversion\"\n",
42+
"# replace with pathlib"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": null,
48+
"id": "be9bbcbd-9bdd-474e-9d44-5c9d5a2c03e5",
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"# chunking"
53+
]
54+
},
55+
{
56+
"cell_type": "code",
57+
"execution_count": null,
58+
"id": "2a165c38-843b-4c89-a8ad-6195b998e284",
59+
"metadata": {},
60+
"outputs": [],
61+
"source": [
62+
"# authoring"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": null,
68+
"id": "5149dbbf-5601-4aa5-b1e9-e454ea0a4529",
69+
"metadata": {},
70+
"outputs": [],
71+
"source": [
72+
"# sdg"
73+
]
74+
}
75+
],
76+
"metadata": {
77+
"kernelspec": {
78+
"display_name": "Python 3 (ipykernel)",
79+
"language": "python",
80+
"name": "python3"
81+
},
82+
"language_info": {
83+
"codemirror_mode": {
84+
"name": "ipython",
85+
"version": 3
86+
},
87+
"file_extension": ".py",
88+
"mimetype": "text/x-python",
89+
"name": "python",
90+
"nbconvert_exporter": "python",
91+
"pygments_lexer": "ipython3",
92+
"version": "3.11.9"
93+
}
94+
},
95+
"nbformat": 4,
96+
"nbformat_minor": 5
97+
}

0 commit comments

Comments
 (0)