Skip to content

Commit

Permalink
Naive Bayes
Browse files Browse the repository at this point in the history
  • Loading branch information
Shivam60 authored Sep 15, 2017
1 parent 7b060eb commit c79d013
Show file tree
Hide file tree
Showing 4 changed files with 340 additions and 0 deletions.
Binary file added Naive Bayes/Bayes_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Naive Bayes/Bayes_rule.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
340 changes: 340 additions & 0 deletions Naive Bayes/Naive Bayes.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's have a look at a new classifier:\n",
"![alt image](another.gif 'another')\n",
"\n",
"# Naive Bayes classifier:\n",
"\n",
"In machine learning, naive Bayes classifiers are a family of simple probabilistic classifiers based on applying Bayes' theorem with strong (naive) independence assumptions between the features.\n",
"\n",
"The Naive Bayesian classifier is based on Bayes’ theorem with independence assumptions between predictors. A Naive Bayesian model is easy to build, with no complicated iterative parameter estimation which makes it particularly useful for very large datasets. Despite its simplicity, the Naive Bayesian classifier often does surprisingly well and is widely used because it often outperforms more sophisticated classification methods.\n",
"\n",
"![alt image](Bayes_rule.png 'bayes rule')\n",
"\n",
"![alt image](Bayes_3.png 'bayes rule')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1]\n",
"[1]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"# Toy dataset: two well-separated classes in 2-D feature space.\n",
"X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n",
"Y = np.array([1, 1, 1, 2, 2, 2])\n",
"\n",
"# Fit a Gaussian Naive Bayes classifier on the full dataset at once.\n",
"clf = GaussianNB()\n",
"clf.fit(X, Y)\n",
"print(clf.predict([[-0.8, -1]]))\n",
"\n",
"# The same model trained incrementally; partial_fit needs all class labels up front.\n",
"clf_pf = GaussianNB()\n",
"clf_pf.partial_fit(X, Y, np.unique(Y))\n",
"print(clf_pf.predict([[-2, 1]]))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# use natural language toolkit\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.lancaster import LancasterStemmer\n",
"# word stemmer\n",
"stemmer = LancasterStemmer()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12 sentences of training data\n"
]
}
],
"source": [
"# 3 classes of training data\n",
"training_data = []\n",
"training_data.append({\"class\":\"greeting\", \"sentence\":\"how are you?\"})\n",
"training_data.append({\"class\":\"greeting\", \"sentence\":\"how is your day?\"})\n",
"training_data.append({\"class\":\"greeting\", \"sentence\":\"good day\"})\n",
"training_data.append({\"class\":\"greeting\", \"sentence\":\"how is it going today?\"})\n",
"\n",
"training_data.append({\"class\":\"goodbye\", \"sentence\":\"have a nice day\"})\n",
"training_data.append({\"class\":\"goodbye\", \"sentence\":\"see you later\"})\n",
"training_data.append({\"class\":\"goodbye\", \"sentence\":\"have a nice day\"})\n",
"training_data.append({\"class\":\"goodbye\", \"sentence\":\"talk to you soon\"})\n",
"\n",
"training_data.append({\"class\":\"sandwich\", \"sentence\":\"make me a sandwich\"})\n",
"training_data.append({\"class\":\"sandwich\", \"sentence\":\"can you make a sandwich?\"})\n",
"training_data.append({\"class\":\"sandwich\", \"sentence\":\"having a sandwich today?\"})\n",
"training_data.append({\"class\":\"sandwich\", \"sentence\":\"what's for lunch?\"})\n",
"print (\"%s sentences of training data\" % len(training_data))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus words and counts: {'how': 3, 'ar': 1, 'you': 4, 'is': 2, 'yo': 1, 'day': 4, 'good': 1, 'it': 1, 'going': 1, 'today': 2, 'hav': 3, 'a': 5, 'nic': 2, 'see': 1, 'lat': 1, 'talk': 1, 'to': 1, 'soon': 1, 'mak': 2, 'me': 1, 'sandwich': 3, 'can': 1, 'what': 1, 'for': 1, 'lunch': 1} \n",
"\n",
"Class words: {'goodbye': ['hav', 'a', 'nic', 'day', 'see', 'you', 'lat', 'hav', 'a', 'nic', 'day', 'talk', 'to', 'you', 'soon'], 'greeting': ['how', 'ar', 'you', 'how', 'is', 'yo', 'day', 'good', 'day', 'how', 'is', 'it', 'going', 'today'], 'sandwich': ['mak', 'me', 'a', 'sandwich', 'can', 'you', 'mak', 'a', 'sandwich', 'hav', 'a', 'sandwich', 'today', 'what', 'for', 'lunch']}\n"
]
}
],
"source": [
"# Build two indexes over the training corpus:\n",
"#   corpus_words: stemmed word -> number of occurrences across all sentences\n",
"#   class_words:  class label  -> list of stemmed words seen in that class\n",
"corpus_words = {}\n",
"class_words = {}\n",
"# turn a list into a set (of unique items) and then a list again (this removes duplicates)\n",
"classes = list(set([a['class'] for a in training_data]))\n",
"for c in classes:\n",
"    # prepare a list of words within each class\n",
"    class_words[c] = []\n",
"\n",
"# loop through each sentence in our training data\n",
"for data in training_data:\n",
"    # tokenize each sentence into words\n",
"    for word in nltk.word_tokenize(data['sentence']):\n",
"        # ignore punctuation / possessive tokens that carry no meaning\n",
"        if word not in [\"?\", \"'s\"]:\n",
"            # stem and lowercase each word so inflected forms collapse together\n",
"            stemmed_word = stemmer.stem(word.lower())\n",
"            # count occurrences of the stem across the whole corpus\n",
"            corpus_words[stemmed_word] = corpus_words.get(stemmed_word, 0) + 1\n",
"\n",
"            # record the stem under its sentence's class\n",
"            class_words[data['class']].append(stemmed_word)\n",
"\n",
"# we now have each stemmed word and the number of occurrences of the word in our training corpus (the word's commonality)\n",
"print (\"Corpus words and counts: %s \\n\" % corpus_words)\n",
"# also we have all words in each class\n",
"print (\"Class words: %s\" % class_words)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# we can now calculate a score for a new sentence\n",
"sentence = \"good day for us to have lunch?\"\n",
"\n",
"# calculate a score for a given class\n",
"def calculate_class_score(sentence, class_name, show_details=True):\n",
"    \"\"\"Count how many stemmed words of `sentence` occur in `class_name`'s word list.\"\"\"\n",
"    score = 0\n",
"    for token in nltk.word_tokenize(sentence):\n",
"        # stem once; skip tokens never seen in this class\n",
"        stem = stemmer.stem(token.lower())\n",
"        if stem not in class_words[class_name]:\n",
"            continue\n",
"        # every matching word contributes the same weight\n",
"        score += 1\n",
"        if show_details:\n",
"            print (\" match: %s\" % stem)\n",
"    return score"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" match: day\n",
" match: to\n",
" match: hav\n",
"Class: goodbye Score: 3 \n",
"\n",
" match: good\n",
" match: day\n",
"Class: greeting Score: 2 \n",
"\n",
" match: for\n",
" match: hav\n",
" match: lunch\n",
"Class: sandwich Score: 3 \n",
"\n"
]
}
],
"source": [
"# score the test sentence against every class and show the matching words\n",
"for label in class_words.keys():\n",
"    print (\"Class: %s Score: %s \\n\" % (label, calculate_class_score(sentence, label)))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# calculate a score for a given class taking into account word commonality\n",
"def calculate_class_score_commonality(sentence, class_name, show_details=True):\n",
"    \"\"\"Score `sentence` against `class_name`, weighting rare words above common ones.\"\"\"\n",
"    score = 0\n",
"    for token in nltk.word_tokenize(sentence):\n",
"        # stem once and reuse the result below\n",
"        stem = stemmer.stem(token.lower())\n",
"        if stem not in class_words[class_name]:\n",
"            continue\n",
"        # each match is weighted by the inverse of its corpus frequency\n",
"        weight = 1 / corpus_words[stem]\n",
"        score += weight\n",
"        if show_details:\n",
"            print (\" match: %s (%s)\" % (stem, weight))\n",
"    return score"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" match: day (0.25)\n",
" match: to (1.0)\n",
" match: hav (0.3333333333333333)\n",
"Class: goodbye Score: 1.5833333333333333 \n",
"\n",
" match: good (1.0)\n",
" match: day (0.25)\n",
"Class: greeting Score: 1.25 \n",
"\n",
" match: for (1.0)\n",
" match: hav (0.3333333333333333)\n",
" match: lunch (1.0)\n",
"Class: sandwich Score: 2.333333333333333 \n",
"\n"
]
}
],
"source": [
"\n",
"# rank every class by its commonality-weighted score\n",
"for label in class_words.keys():\n",
"    print (\"Class: %s Score: %s \\n\" % (label, calculate_class_score_commonality(sentence, label)))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n",
"# return the class with highest score for sentence\n",
"def classify(sentence):\n",
"    \"\"\"Return (best_class, best_score); best_class is None when nothing scores above zero.\"\"\"\n",
"    best_class, best_score = None, 0\n",
"    # evaluate the sentence against every known class, keeping the top scorer\n",
"    for candidate in class_words.keys():\n",
"        candidate_score = calculate_class_score_commonality(sentence, candidate, show_details=False)\n",
"        # strictly greater: the first of equally-scored classes wins\n",
"        if candidate_score > best_score:\n",
"            best_class, best_score = candidate, candidate_score\n",
"\n",
"    return best_class, best_score"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('sandwich', 2.5)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classify(\"make me some lunch?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Binary file added Naive Bayes/another.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit c79d013

Please sign in to comment.