|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 1, |
| 5 | + "execution_count": 344, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [ |
| 8 | + { |
| 9 | + "name": "stdout", |
| 10 | + "output_type": "stream", |
| 11 | + "text": [ |
| 12 | + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
| 13 | + "To disable this warning, you can either:\n", |
| 14 | + "\t- Avoid using `tokenizers` before the fork if possible\n", |
| 15 | + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
| 16 | + "Collecting thefuzz\n", |
| 17 | + " Downloading thefuzz-0.19.0-py2.py3-none-any.whl (17 kB)\n", |
| 18 | + "Installing collected packages: thefuzz\n", |
| 19 | + "Successfully installed thefuzz-0.19.0\n" |
| 20 | + ] |
| 21 | + } |
| 22 | + ], |
| 23 | + "source": [ |
| 24 | + "#!pip install thefuzz" |
| 25 | + ] |
| 26 | + }, |
| 27 | + { |
| 28 | + "cell_type": "code", |
| 29 | + "execution_count": 345, |
6 | 30 | "metadata": {},
|
7 | 31 | "outputs": [],
|
8 | 32 | "source": [
|
9 | 33 | "from gramformer import Gramformer\n",
|
10 | 34 | "import torch\n",
|
11 | 35 | "import spacy\n",
|
12 |
| - "import random" |
| 36 | + "import random\n", |
| 37 | + "from thefuzz import fuzz\n" |
13 | 38 | ]
|
14 | 39 | },
|
15 | 40 | {
|
16 | 41 | "cell_type": "markdown",
|
17 | 42 | "metadata": {},
|
18 | 43 | "source": [
|
19 |
| - "### Original GrammarModel" |
| 44 | + "## Original GrammarModel" |
20 | 45 | ]
|
21 | 46 | },
|
22 | 47 | {
|
|
210 | 235 | "cell_type": "markdown",
|
211 | 236 | "metadata": {},
|
212 | 237 | "source": [
|
213 |
| - "### Remove correction for error types ORTH, OTHER (and PUNCT?)" |
| 238 | + "## Improvements to grammar correction" |
214 | 239 | ]
|
215 | 240 | },
|
216 | 241 | {
|
217 | 242 | "cell_type": "code",
|
218 |
| - "execution_count": 327, |
| 243 | + "execution_count": 413, |
219 | 244 | "metadata": {},
|
220 | 245 | "outputs": [],
|
221 | 246 | "source": [
|
|
256 | 281 | " last_user_input = chat_history[-1].get('text')\n",
|
257 | 282 | " corrected_sentence, correction_message = self.grammar_correction(last_user_input)\n",
|
258 | 283 | " error_types = self.get_edits(last_user_input, corrected_sentence)\n",
|
259 |
| - " overlap_ignore_errors = any(item in error_types for item in self.ignore_errors)\n", |
260 |
| - "\n", |
261 |
| - " if correction_message and (overlap_ignore_errors is False):\n", |
| 284 | + " relevant_error = any(error not in self.ignore_errors for error in error_types) # check if there is an error in the sentence which is not in the ignore list \n", |
| 285 | + " token_sort_ratio = fuzz.token_sort_ratio(corrected_sentence, last_user_input) # calculate token similarity (ignoring punctuation and casing)\n", |
| 286 | + " print(f\"correction_message: {correction_message}\\nErrors detected: {error_types}\\nPresence of a relevant error: {relevant_error}\\nSimilarity Score: {token_sort_ratio}\") # for debugging only\n", |
| 287 | + " \n", |
| 288 | + " if correction_message and relevant_error and token_sort_ratio != 100:\n", |
262 | 289 | " chat_history.append(\n",
|
263 | 290 | " {\n",
|
264 | 291 | " 'sender': 'bot',\n",
|
265 | 292 | " 'text': correction_message,\n",
|
266 |
| - " 'correction': True,\n", |
267 |
| - " 'error_type': error_types\n", |
| 293 | + " 'correction': True\n", |
268 | 294 | " }\n",
|
269 | 295 | " )\n",
|
| 296 | + " \n", |
270 | 297 | " return chat_history \n",
|
271 | 298 | "\n",
|
272 | 299 | "\n",
|
|
295 | 322 | },
|
296 | 323 | {
|
297 | 324 | "cell_type": "code",
|
298 |
| - "execution_count": 328, |
| 325 | + "execution_count": 414, |
299 | 326 | "metadata": {},
|
300 | 327 | "outputs": [
|
301 | 328 | {
|
|
310 | 337 | "gm2 = GrammarModel2(models=1, use_gpu=False)"
|
311 | 338 | ]
|
312 | 339 | },
|
| 340 | + { |
| 341 | + "cell_type": "markdown", |
| 342 | + "metadata": {}, |
| 343 | + "source": [ |
| 344 | + "### 1. Remove correction when no relevant errors are detected(other than those in self.ignore_errors)" |
| 345 | + ] |
| 346 | + }, |
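| + { |
| + "cell_type": "markdown", |
| + "metadata": {}, |
| + "source": [ |
| + "A minimal sketch of the relevant-error check in isolation. The `ignore_errors` contents and the helper name below are illustrative assumptions only; in `GrammarModel2` the actual list lives in `self.ignore_errors`." |
| + ] |
| + }, |
| + { |
| + "cell_type": "code", |
| + "execution_count": null, |
| + "metadata": {}, |
| + "outputs": [], |
| + "source": [ |
| + "# Sketch only: the relevant-error check on its own (ignore list assumed for illustration)\n", |
| + "ignore_errors = ['ORTH', 'OTHER']  # assumed contents; GrammarModel2 keeps this in self.ignore_errors\n", |
| + "\n", |
| + "def has_relevant_error(error_types, ignore_errors=ignore_errors):\n", |
| + "    # True if at least one detected error type is not in the ignore list\n", |
| + "    return any(error not in ignore_errors for error in error_types)\n", |
| + "\n", |
| + "print(has_relevant_error(['ORTH']))          # False: only ignorable error types\n", |
| + "print(has_relevant_error(['ORTH', 'VERB']))  # True: 'VERB' is not ignored\n", |
| + "print(has_relevant_error([]))                # False: no errors detected" |
| + ] |
| + }, |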
313 | 347 | {
|
314 | 348 | "cell_type": "code",
|
315 |
| - "execution_count": 341, |
| 349 | + "execution_count": 450, |
316 | 350 | "metadata": {},
|
317 | 351 | "outputs": [],
|
318 | 352 | "source": [
|
319 |
| - "chat_history_ex1 = [{'sender': 'User', 'text': 'Hi bot'}]" |
| 353 | + "chat_history_ex1 = [{'sender': 'User', 'text': 'where are you goin?'}]" |
320 | 354 | ]
|
321 | 355 | },
|
322 | 356 | {
|
323 | 357 | "cell_type": "code",
|
324 |
| - "execution_count": 342, |
| 358 | + "execution_count": 451, |
325 | 359 | "metadata": {},
|
326 |
| - "outputs": [], |
| 360 | + "outputs": [ |
| 361 | + { |
| 362 | + "name": "stdout", |
| 363 | + "output_type": "stream", |
| 364 | + "text": [ |
| 365 | + "correction_message: I think you meant: \"where are you going?\" \n", |
| 366 | + "Errors detected: ['PUNCT']\n", |
| 367 | + "Presence of a relevant error: True\n", |
| 368 | + "Similarity Score: 97\n" |
| 369 | + ] |
| 370 | + } |
| 371 | + ], |
327 | 372 | "source": [
|
328 | 373 | "chat_history = gm2.add_correction_to_chat_history(chat_history_ex1)"
|
329 | 374 | ]
|
330 | 375 | },
|
331 | 376 | {
|
332 | 377 | "cell_type": "code",
|
333 |
| - "execution_count": 343, |
| 378 | + "execution_count": 452, |
334 | 379 | "metadata": {},
|
335 | 380 | "outputs": [
|
336 | 381 | {
|
337 | 382 | "data": {
|
338 | 383 | "text/plain": [
|
339 |
| - "[{'sender': 'User', 'text': 'Hi bot'},\n", |
| 384 | + "[{'sender': 'User', 'text': 'where are you goin?'},\n", |
340 | 385 | " {'sender': 'bot',\n",
|
341 |
| - " 'text': 'This would be better said like this: \"Hi booch!\" ',\n", |
342 |
| - " 'correction': True,\n", |
343 |
| - " 'error_type': ['NOUN']}]" |
| 386 | + " 'text': 'I think you meant: \"where are you going?\" ',\n", |
| 387 | + " 'correction': True}]" |
344 | 388 | ]
|
345 | 389 | },
|
346 |
| - "execution_count": 343, |
| 390 | + "execution_count": 452, |
347 | 391 | "metadata": {},
|
348 | 392 | "output_type": "execute_result"
|
349 | 393 | }
|
|
352 | 396 | "chat_history"
|
353 | 397 | ]
|
354 | 398 | },
|
| 399 | + { |
| 400 | + "cell_type": "markdown", |
| 401 | + "metadata": {}, |
| 402 | + "source": [ |
| 403 | + "### 2. Remove correction when input and correction are very similar" |
| 404 | + ] |
| 405 | + }, |
355 | 406 | {
|
356 | 407 | "cell_type": "code",
|
357 |
| - "execution_count": null, |
| 408 | + "execution_count": 369, |
358 | 409 | "metadata": {},
|
359 | 410 | "outputs": [],
|
360 |
| - "source": [] |
| 411 | + "source": [ |
| 412 | + "# Example of similar sentences which should not be corrected\n", |
| 413 | + "ex1 = \"Hi bot!\"\n", |
| 414 | + "ex2 = \"Hi bot\"\n", |
| 415 | + "ex3 = \"Hi bot.\"\n", |
| 416 | + "ex4 = \"Hi Bot\"\n", |
| 417 | + "ex5 = \"Hi Bot.\"\n", |
| 418 | + "ex6 = \"Hi Bot!\"\n", |
| 419 | + "ex7 = \"Hi Bot Bot!\" # should lead to lower token sort ratio, but same token set ratio compared to ex1" |
| 420 | + ] |
| 421 | + }, |
| 422 | + { |
| 423 | + "cell_type": "code", |
| 424 | + "execution_count": 364, |
| 425 | + "metadata": {}, |
| 426 | + "outputs": [], |
| 427 | + "source": [ |
| 428 | + "# Measure the similarity between 0 and 100 to define a threshold.\n", |
| 429 | + "\n", |
| 430 | + "def measure_similarity(sentence1, sentence2):\n", |
| 431 | + " simple_ratio = fuzz.ratio(sentence1, sentence2)\n", |
| 432 | + " print(f\"simple ratio similarity score: {simple_ratio}\")\n", |
| 433 | + "\n", |
| 434 | + " partial_ratio = fuzz.partial_ratio(sentence1, sentence2) # Return the ratio of the most similar substring.\n", |
| 435 | + " print(f\"partial ratio similarity score: {partial_ratio}\")\n", |
| 436 | + "\n", |
| 437 | + " ratio = fuzz.ratio(sentence1, sentence2)\n", |
| 438 | + " print(f\"ratio similarity score: {ratio}\")\n", |
| 439 | + "\n", |
| 440 | + " token_sort_ratio = fuzz.token_sort_ratio(sentence1, sentence2) # Return a measure of the sequences' similarity sorting the token before comparing. This is what we want to set as threshold.\n", |
| 441 | + " print(f\"token sort ratio similarity score: {token_sort_ratio}\")\n", |
| 442 | + "\n", |
| 443 | + " token_set_ratio = fuzz.token_set_ratio(sentence1, sentence2) # Measures similarity between unique tokens.\n", |
| 444 | + " print(f\"token set ratio similarity score: {token_set_ratio}\")" |
| 445 | + ] |
| 446 | + }, |
| 447 | + { |
| 448 | + "cell_type": "code", |
| 449 | + "execution_count": 378, |
| 450 | + "metadata": {}, |
| 451 | + "outputs": [ |
| 452 | + { |
| 453 | + "name": "stdout", |
| 454 | + "output_type": "stream", |
| 455 | + "text": [ |
| 456 | + "simple ratio similarity score: 67\n", |
| 457 | + "partial ratio similarity score: 71\n", |
| 458 | + "ratio similarity score: 67\n", |
| 459 | + "token sort ratio similarity score: 75\n", |
| 460 | + "token set ratio similarity score: 100\n" |
| 461 | + ] |
| 462 | + } |
| 463 | + ], |
| 464 | + "source": [ |
| 465 | + "measure_similarity(ex1, ex7)" |
| 466 | + ] |
| 467 | + }, |
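| + { |
| + "cell_type": "markdown", |
| + "metadata": {}, |
| + "source": [ |
| + "A minimal sketch of gating a correction on the token sort ratio, mirroring the `token_sort_ratio != 100` check in `add_correction_to_chat_history`; the function name and the threshold parameter are illustrative assumptions only." |
| + ] |
| + }, |
| + { |
| + "cell_type": "code", |
| + "execution_count": null, |
| + "metadata": {}, |
| + "outputs": [], |
| + "source": [ |
| + "# Sketch only: skip corrections that only differ in casing, punctuation or word order\n", |
| + "from thefuzz import fuzz\n", |
| + "\n", |
| + "def should_send_correction(original, corrected, threshold=100):\n", |
| + "    # token_sort_ratio lowercases, strips punctuation and sorts tokens before comparing,\n", |
| + "    # so purely cosmetic differences score 100 and are filtered out\n", |
| + "    return fuzz.token_sort_ratio(original, corrected) < threshold\n", |
| + "\n", |
| + "print(should_send_correction(\"Hi bot\", \"Hi Bot!\"))  # False: identical up to casing/punctuation\n", |
| + "print(should_send_correction(\"where are you goin?\", \"where are you going?\"))  # True: a real spelling fix" |
| + ] |
| + }, |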
| 468 | + { |
| 469 | + "cell_type": "code", |
| 470 | + "execution_count": 426, |
| 471 | + "metadata": {}, |
| 472 | + "outputs": [], |
| 473 | + "source": [ |
| 474 | + "## Correction accuracy check" |
| 475 | + ] |
361 | 476 | },
|
362 | 477 | {
|
363 | 478 | "cell_type": "code",
|
|