Merge pull request #892 from khoj-ai/upgrade-offline-chat-models-support

Upgrade offline chat model support. Default to Llama 3.1
khoj-ai · Aug 20, 2024 · 4808ce7 · 4808ce7
2 parents 2d9dd81 + 58c8068
commit 4808ce7
Show file tree

Hide file tree

Showing 16 changed files with 42 additions and 15 deletions.
diff --git a/documentation/docs/features/chat.md b/documentation/docs/features/chat.md
@@ -25,7 +25,7 @@ Offline chat stays completely private and can work without internet using open-s
 >  - An Nvidia, AMD GPU or a Mac M1+ machine would significantly speed up chat response times
 
 1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration.
-2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Hermes-2-Pro-Mistral-7B by NousResearch](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) by default.
+2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware, we recommend using [Llama 3.1 by Meta](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) by default.
 
 
 :::tip[Note]

diff --git a/documentation/docs/get-started/setup.mdx b/documentation/docs/get-started/setup.mdx
@@ -222,7 +222,7 @@ Using Ollama? See the [Ollama Integration](/advanced/ollama) section for more cu
 Any chat model on Huggingface in GGUF format can be used for local chat. Here's how you can set it up:
 
 1. No need to setup a conversation processor config!
-2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `NousResearch/Hermes-2-Pro-Mistral-7B-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
+2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `bartowski/Meta-Llama-3.1-8B-Instruct-GGUF`, but [any GGUF model on Hugging Face](https://huggingface.co/models?library=gguf) should work.
   - Make sure to set the `model-type` to `Offline`. Do not set `openai config`.
   - The `tokenizer` and `max-prompt-size` fields are optional. You can set these for non-standard models (i.e not Mistral or Llama based models) or when you know the token limit of the model to improve context stuffing.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -66,7 +66,7 @@ dependencies = [
     "pymupdf >= 1.23.5",
     "django == 5.0.7",
     "authlib == 1.2.1",
-    "llama-cpp-python == 0.2.82",
+    "llama-cpp-python == 0.2.88",
     "itsdangerous == 2.1.2",
     "httpx == 0.25.0",
     "pgvector == 0.2.4",

diff --git a/src/interface/web/app/components/chatMessage/chatMessage.tsx b/src/interface/web/app/components/chatMessage/chatMessage.tsx
@@ -262,7 +262,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
     let markdownRendered = DOMPurify.sanitize(md.render(props.message));
     return (
         <div
-            className={`${styles.trainOfThoughtElement} items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}
+            className={`${styles.trainOfThoughtElement} break-all items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}
         >
             {icon}
             <div dangerouslySetInnerHTML={{ __html: markdownRendered }} />

diff --git a/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py b/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py
@@ -0,0 +1,17 @@
+# Generated by Django 5.0.7 on 2024-08-19 12:37
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0057_remove_serverchatsettings_default_model_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="chatmodeloptions",
+            name="chat_model",
+            field=models.CharField(default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_length=200),
+        ),
+    ]
diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
@@ -91,7 +91,7 @@ class ModelType(models.TextChoices):
     max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     subscribed_max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True)
-    chat_model = models.CharField(max_length=200, default="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF")
+    chat_model = models.CharField(max_length=200, default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF")
     model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
     openai_config = models.ForeignKey(
         OpenAIProcessorConversationConfig, on_delete=models.CASCADE, default=None, null=True, blank=True

diff --git a/src/khoj/processor/conversation/offline/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py
@@ -24,7 +24,7 @@
 
 def extract_questions_offline(
     text: str,
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
@@ -103,6 +103,9 @@ def extract_questions_offline(
             .replace("']", '"]')
             .replace("', '", '", "')
         )
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json"):
+            response = response[7:-3]
         questions: List[str] = json.loads(questions_str)
         questions = filter_questions(questions)
     except:
@@ -138,7 +141,7 @@ def converse_offline(
     references=[],
     online_results=[],
     conversation_log={},
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_commands=[ConversationCommand.Default],
@@ -237,7 +240,7 @@ def llm_thread(g, messages: List[ChatMessage], model: Any, max_prompt_size: int
 def send_message_to_model_offline(
     messages: List[ChatMessage],
     loaded_model=None,
-    model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     temperature: float = 0.2,
     streaming=False,
     stop=[],

diff --git a/src/khoj/processor/conversation/offline/utils.py b/src/khoj/processor/conversation/offline/utils.py
@@ -75,6 +75,6 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"):
 def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int:
     """Infer max prompt size based on device memory and max context window supported by the model"""
     configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
-    vram_based_n_ctx = int(get_device_memory() / 2e6)  # based on heuristic
+    vram_based_n_ctx = int(get_device_memory() / 1e6)  # based on heuristic
     configured_max_tokens = configured_max_tokens or math.inf  # do not use if set to None
     return min(configured_max_tokens, vram_based_n_ctx, model_context_window)
diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py
@@ -587,7 +587,7 @@
 - Official, up-to-date information about you, Khoj, is available at site:khoj.dev, github or pypi.
 
 What Google searches, if any, will you need to perform to answer the user's question?
-Provide search queries as a list of strings in a JSON object. Do not wrap the json in a codeblock.
+Provide search queries as a list of strings in a JSON object.
 Current Date: {current_date}
 User's Location: {location}
 {username}

diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py
@@ -25,6 +25,7 @@
     "gpt-4-turbo-preview": 20000,
     "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500,
     "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500,
+    "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
 }
 model_to_tokenizer: Dict[str, str] = {}
 

diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py
@@ -279,6 +279,9 @@ async def aget_relevant_information_sources(query: str, conversation_history: di
 
     try:
         response = response.strip()
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json"):
+            response = response[7:-3]
         response = json.loads(response)
         response = [q.strip() for q in response["source"] if q.strip()]
         if not isinstance(response, list) or not response or len(response) == 0:
@@ -401,6 +404,9 @@ async def generate_online_subqueries(
     # Validate that the response is a non-empty, JSON-serializable list
     try:
         response = response.strip()
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json") and response.endswith("```"):
+            response = response[7:-3]
         response = json.loads(response)
         response = [q.strip() for q in response["queries"] if q.strip()]
         if not isinstance(response, list) or not response or len(response) == 0:

diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py
@@ -70,7 +70,7 @@ class OfflineChatProcessorConfig:
 
 
 class OfflineChatProcessorModel:
-    def __init__(self, chat_model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", max_tokens: int = None):
+    def __init__(self, chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_tokens: int = None):
         self.chat_model = chat_model
         self.loaded_model = None
         try:

diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py
@@ -8,7 +8,7 @@
 app_env_filepath = "~/.khoj/env"
 telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
 content_directory = "~/.khoj/content/"
-default_offline_chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+default_offline_chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
 default_online_chat_model = "gpt-4-turbo-preview"
 
 empty_config = {

diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):
 
 
 class OfflineChatProcessorConfig(ConfigBase):
-    chat_model: Optional[str] = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model: Optional[str] = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
 
 
 class ConversationProcessorConfig(ConfigBase):

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -378,7 +378,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):
 
     # Initialize Processor from Config
     ChatModelOptionsFactory(
-        chat_model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+        chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
         tokenizer=None,
         max_prompt_size=None,
         model_type="offline",

diff --git a/tests/helpers.py b/tests/helpers.py
@@ -49,7 +49,7 @@ class Meta:
 
     max_prompt_size = 3500
     tokenizer = None
-    chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
     model_type = "offline"
     openai_config = factory.LazyAttribute(
         lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None