From 56295a9ed9a053fb166846232e2c138a21dd069c Mon Sep 17 00:00:00 2001 From: sagar-sehgal Date: Wed, 13 Mar 2019 03:20:47 +0530 Subject: [PATCH 1/2] added text classification in nlp_apps --- nlp_apps.ipynb | 371 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 293 insertions(+), 78 deletions(-) diff --git a/nlp_apps.ipynb b/nlp_apps.ipynb index 458c55700..928e38a5f 100644 --- a/nlp_apps.ipynb +++ b/nlp_apps.ipynb @@ -17,7 +17,8 @@ "\n", "* Language Recognition\n", "* Author Recognition\n", - "* The Federalist Papers" + "* The Federalist Papers\n", + "* Text Classification" ] }, { @@ -37,10 +38,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "from utils import open_data\n", @@ -68,10 +67,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "from learning import NaiveBayesLearner\n", @@ -92,10 +89,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "def recognize(sentence, nBS, n):\n", @@ -122,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -138,7 +133,7 @@ "'German'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -149,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -165,7 +160,7 @@ "'English'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -176,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -192,7 +187,7 @@ "'German'" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -203,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -219,7 +214,7 @@ "'English'" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -254,10 +249,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, + "execution_count": 8, + "metadata": {}, "outputs": [], "source": [ "from utils import open_data\n", @@ -285,10 +278,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, + "execution_count": 9, + "metadata": {}, "outputs": [], "source": [ "from learning import NaiveBayesLearner\n", @@ -307,10 +298,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, + "execution_count": 10, + "metadata": {}, "outputs": [], "source": [ "def recognize(sentence, nBS):\n", @@ -329,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -338,7 +327,7 @@ "'Abbott'" ] }, - "execution_count": 4, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -358,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -367,7 +356,7 @@ "'Austen'" ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -402,10 +391,8 @@ }, { "cell_type": "code", - "execution_count": 1, - 
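A side note on the language-recognition recipe above: the notebook relies on the repository's `NgramCharModel` and `NaiveBayesLearner`. A rough, self-contained sketch of the same idea — per-language character-bigram counts compared through add-one-smoothed log-probabilities — could look like the following. The toy corpora and helper names here are purely illustrative assumptions, not the repository's API.

```python
import math
from collections import Counter

def char_ngrams(text, n=2):
    """Lower-case the text and slide a window of n characters over it."""
    text = text.lower()
    return [text[i:i + n] for i in range(len(text) - n + 1)]

def train(corpus, n=2):
    """Count character n-grams in a training corpus."""
    return Counter(char_ngrams(corpus, n))

def log_score(sentence, model, n=2):
    """Sum of smoothed log-probabilities of the sentence's n-grams."""
    total = sum(model.values())
    vocab = len(model)
    score = 0.0
    for gram in char_ngrams(sentence, n):
        # add-one smoothing keeps unseen n-grams from zeroing the product
        score += math.log((model[gram] + 1) / (total + vocab))
    return score

# Toy corpora -- real models would be trained on whole books.
english = train("the quick brown fox jumps over the lazy dog")
german = train("der schnelle braune fuchs springt ueber den faulen hund")

sentence = "the fox is quick"
candidates = [("English", english), ("German", german)]
print(max(candidates, key=lambda kv: log_score(sentence, kv[1]))[0])  # English
```

Summing log-probabilities instead of multiplying raw probabilities avoids floating-point underflow on longer inputs.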
"metadata": { - "collapsed": true - }, + "execution_count": 13, + "metadata": {}, "outputs": [], "source": [ "from utils import open_data\n", @@ -423,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -432,7 +419,7 @@ "'The Project Gutenberg EBook of The Federalist Papers, by \\nAlexander Hamilton and John Jay and James Madison\\n\\nThis eBook is for the use of anyone anywhere at no cost and with\\nalmost no restrictions whatsoever. You may copy it, give it away or\\nre-use it under the terms of the Project Gutenberg License included\\nwith this eBook or online at www.gutenberg.net\\n\\n\\nTitle: The Federalist Papers\\n\\nAuthor: Alexander Hamilton\\n John Jay\\n James Madison\\n\\nPosting Date: December 12, 2011 [EBook #18]'" ] }, - "execution_count": 2, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -450,10 +437,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, + "execution_count": 15, + "metadata": {}, "outputs": [], "source": [ "wordseq = words(federalist)\n", @@ -469,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -478,7 +463,7 @@ "'federalist no 1 general introduction for the independent journal hamilton to the people of the state of new york after an unequivocal experience of the inefficacy of the subsisting federal government you are called upon to deliberate on a new constitution for the united states of america the subject speaks its own importance comprehending in its consequences nothing less than the existence of the union the safety and welfare of the parts of which it is composed the fate of an empire in many respects the most interesting in the world it has been frequently remarked that it seems to'" ] }, - "execution_count": 4, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -500,10 +485,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "wordseq = [w for w in wordseq if w != 'publius']" @@ -522,7 +505,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -531,7 +514,7 @@ "(4, 16, 52)" ] }, - "execution_count": 6, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -568,10 +551,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, + "execution_count": 19, + "metadata": {}, "outputs": [], "source": [ "hamilton = ''.join(hamilton)\n", @@ -603,10 +584,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true - }, + "execution_count": 20, + "metadata": {}, "outputs": [], "source": [ "import random\n", @@ -687,10 +666,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": true - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "dist = {('Madison', 1): P_madison, ('Hamilton', 1): P_hamilton, ('Jay', 1): P_jay}\n", @@ -707,10 +684,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true - }, + "execution_count": 22, + "metadata": {}, "outputs": [], "source": [ "def recognize(sentence, nBS):\n", @@ -726,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -737,7 +712,7 @@ "Straightforward Naive Bayes 
Learner\n", "\n", "Paper No. 49: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", - "Paper No. 50: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", + "Paper No. 50: Hamilton: 0.0000 Madison: 0.0000 Jay: 1.0000\n", "Paper No. 51: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 52: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 53: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", @@ -746,8 +721,8 @@ "Paper No. 56: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 57: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 58: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", - "Paper No. 18: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", - "Paper No. 19: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", + "Paper No. 18: Hamilton: 0.0000 Madison: 0.0000 Jay: 1.0000\n", + "Paper No. 19: Hamilton: 0.0000 Madison: 0.0000 Jay: 1.0000\n", "Paper No. 20: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 64: Hamilton: 1.0000 Madison: 0.0000 Jay: 0.0000\n", "\n", @@ -797,11 +772,251 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { "collapsed": true }, + "source": [ + "## Text Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Text Classification** is assigning a category to a document based on the content of the document. Text Classification is one of the most popular and fundamental tasks of Natural Language Processing. Text classification can be applied on a variety of texts like *Short Documents* (like tweets, customer reviews, etc.) and *Long Document* (like emails, media articles, etc.).\n", + "\n", + "We already have seen an example of Text Classification in the above tasks like Language Identification and Author Identification and Federalist Paper Identification.\n", + "\n", + "### Applications\n", + "Some of the broad applications of Text Classification are:-\n", + "- Language Identification\n", + "- Author Identification\n", + "- Sentiment Analysis\n", + "- Spam Mail Detection\n", + "- Topic Labelling \n", + "- Word Sense Disambiguation\n", + "\n", + "### Use Cases\n", + "Some of the use cases of Text classification are:-\n", + "- Social Media Monitoring\n", + "- Brand Monitoring\n", + "- Auto-tagging of user queries\n", + "\n", + "For Text Classification, we would be using Naive Bayes Classifier. The reason for using Naive Bayes Classifier is:-\n", + "- Being a probabilistic classifier, therefore will calculate the probability of each category\n", + "- It is fast, reliable and accurate \n", + "- Naive Bayes Classifiers have already been used to solve many Natural Language Processing(NLP) applications.\n", + "\n", + "Here we would here be covering an example of **Word Sense Disambiguation** as an application of Text Classification. It is used to remove the ambiquity of a given word, if the word has 2 different meanings.\n", + "\n", + "As we know that we would be working on determining weather the word *apple* in a sentence reffers to `fruit` or to a `company`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 1:- Defining the dataset** \n", + "\n", + "The dataset has been defined here itself so that everything is clear and can be tested with other things as well." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = [\n", + " \"Apple targets big business with new iOS 7 features. Finally... 
A corp iTunes account!\",\n", + " \"apple inc is searching for people to help and try out all their upcoming tablet within our own net page No.\",\n", + " \"Microsoft to bring Xbox and PC games to Apple, Android phones: Report: Microsoft Corp\",\n", + " \"When did green skittles change from lime to green apple?\",\n", + " \"Myra Oltman is the best. I told her I wanted to learn how to make apple pie, so she made me a kit!\",\n", + " \"Surreal Sat in a sewing room, surrounded by crap, listening to beautiful music eating apple pie.\"\n", + "]\n", + "\n", + "train_target = [\n", + " \"company\",\n", + " \"company\",\n", + " \"company\",\n", + " \"fruit\",\n", + " \"fruit\",\n", + " \"fruit\",\n", + "]\n", + "\n", + "class_0 = \"company\"\n", + "class_1 = \"fruit\"\n", + "\n", + "test_data = [\n", + " \"Apple Inc. supplier Foxconn demos its own iPhone-compatible smartwatch\",\n", + " \"I now know how to make a delicious apple pie thanks to the best teachers ever\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 2:- Preprocessing the dataset**\n", + "\n", + "In this step, we would be doing some preprocessing on the dataset like breaking the sentence into words and converting to lower case.\n", + "\n", + "We already have a `words(sent)` function defined in `text.py` which does the task of splitting the sentence into words." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "train_data_processed = [words(i) for i in train_data]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 3:- Feature Extraction from the text**\n", + "\n", + "Now we would be extracting features from the text like extracting the set of words used in both the categories i.e. `company` and `fruit`.\n", + "\n", + "This frequency of a word would help in calculating the probability of that word being in a particular class. " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of words in `company` class: 49\n", + "Number of words in `fruit` class: 49\n" + ] + } + ], + "source": [ + "words_0 = []\n", + "words_1 = []\n", + "\n", + "for sent, tag in zip(train_data_processed, train_target):\n", + " if(tag == class_0):\n", + " words_0 += sent\n", + " elif(tag == class_1):\n", + " words_1 += sent\n", + " \n", + "print(\"Number of words in `\" + class_0 + \"` class:\", len(words_0))\n", + "print(\"Number of words in `\" + class_1 + \"` class:\", len(words_1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you might have observed that our dataset is equally balanced i.e. we have an equal number of words in both the classes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 4:-Making the Naive Bayes Model**\n", + "\n", + "Using Naive Bayes classifier we can calculate the probability of a word in `company` and `fruit` class and then multiplying all of them to get the probability of that sentence belonging each of the given classes. But if a word is not in our dictionary then this leads to the probability of that class becoming zero. For eg:- the word *Foxconn* is not in the dictionary of any of the classes. Due to this the \n", + "\n", + "To solve the problem we need to use **smoothing**, i.e. 
providing a minimum non-zero threshold probability to every word that we come across.\n", + "\n", + "The `UnigramWordModel` class has implemented smoothing by taking an additional argument from the user i.e. the minimum frequency that we would be giving to every word even if it is new to the dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "model_words_0 = UnigramWordModel(words_0, 1)\n", + "model_words_1 = UnigramWordModel(words_1, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we would be making the Naive Bayes model. For that, we would be making `dist` as we had done earlier in the Authorship Recognition Task." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from learning import NaiveBayesLearner\n", + "\n", + "dist = {('company', 1): model_words_0, ('fruit', 1): model_words_1}\n", + "\n", + "nBS = NaiveBayesLearner(dist, simple = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 5:- Predict the class of the label**\n", + "\n", + "Now we would be making a function that does pre-procesing of the sentences which we have taken for testing. And then predicting the class of every sentence in the document." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def recognize(sentence, nBS):\n", + " sentence_words = words(sentence)\n", + " return nBS(sentence_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apple Inc. supplier Foxconn demos its own iPhone-compatible smartwatch\t-company\n", + "I now know how to make a delicious apple pie thanks to the best teachers ever\t-fruit\n" + ] + } + ], + "source": [ + "# prediction the class of every sentence in the test set\n", + "for i in test_data:\n", + " print(i + \"\\t-\" + recognize(i, nBS))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You might have observed that our predictions are correct. Though they might not give correct results because of lack of data. And we are clearly able to differentiate between sentences in a much better way. \n", + "\n", + "As you might have observed that the above method is very much similar to the Authorship prediction, which is also a type of Text Classification. Like this most of Text Classification have the same underlying structure and follow a similar procedure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [] } @@ -822,7 +1037,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.6.7" } }, "nbformat": 4, From a81a43e41a96366df0c57a8c1910c0deb0b51e03 Mon Sep 17 00:00:00 2001 From: sagar-sehgal Date: Fri, 15 Mar 2019 19:00:25 +0530 Subject: [PATCH 2/2] updated as per the changes suggested. --- nlp_apps.ipynb | 51 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/nlp_apps.ipynb b/nlp_apps.ipynb index 928e38a5f..2f4796b7a 100644 --- a/nlp_apps.ipynb +++ b/nlp_apps.ipynb @@ -786,12 +786,12 @@ "source": [ "**Text Classification** is assigning a category to a document based on the content of the document. Text Classification is one of the most popular and fundamental tasks of Natural Language Processing. 
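To make the word-sense pipeline above concrete end to end — tokenize, count words per class, then compare add-one-smoothed log-probabilities — here is a self-contained sketch. It deliberately uses plain `Counter`s and made-up toy sentences rather than the repository's `UnigramWordModel` and `NaiveBayesLearner`, so treat it as an illustration of the technique only.

```python
import math
import re
from collections import Counter

def tokenize(sentence):
    """Lower-case and split into word-like tokens."""
    return re.findall(r"[a-z']+", sentence.lower())

# Toy labelled sentences standing in for train_data/train_target.
training_pairs = [("apple unveils a new iphone and ipad", "company"),
                  ("microsoft and apple report quarterly earnings", "company"),
                  ("a warm apple pie with cinnamon", "fruit"),
                  ("green apple slices in a fresh salad", "fruit")]

counts = {"company": Counter(), "fruit": Counter()}
for sentence, label in training_pairs:
    counts[label].update(tokenize(sentence))

vocab = set().union(*counts.values())

def predict(sentence):
    scores = {}
    for label, c in counts.items():
        total = sum(c.values())
        # log P(class) + sum over words of log P(word | class), add-one smoothed
        score = math.log(1 / len(counts))
        for w in tokenize(sentence):
            score += math.log((c[w] + 1) / (total + len(vocab)))
        scores[label] = score
    return max(scores, key=scores.get)

print(predict("a delicious apple pie recipe"))   # fruit
print(predict("apple and microsoft earnings"))   # company
```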
Text classification can be applied on a variety of texts like *Short Documents* (like tweets, customer reviews, etc.) and *Long Document* (like emails, media articles, etc.).\n",
 "\n",
- "We already have seen an example of Text Classification in the above tasks like Language Identification and Author Identification and Federalist Paper Identification.\n",
+ "We have already seen examples of Text Classification in the above tasks: Language Identification, Author Recognition and Federalist Paper Identification.\n",
 "\n",
 "### Applications\n",
 "Some of the broad applications of Text Classification are:-\n",
 "- Language Identification\n",
- "- Author Identification\n",
+ "- Author Recognition\n",
 "- Sentiment Analysis\n",
 "- Spam Mail Detection\n",
 "- Topic Labelling \n",
@@ -803,14 +803,14 @@
 "- Social Media Monitoring\n",
 "- Brand Monitoring\n",
 "- Auto-tagging of user queries\n",
 "\n",
- "For Text Classification, we would be using Naive Bayes Classifier. The reason for using Naive Bayes Classifier is:-\n",
- "- Being a probabilistic classifier, therefore will calculate the probability of each category\n",
+ "For Text Classification, we will be using the Naive Bayes Classifier. The reasons for using the Naive Bayes Classifier are:-\n",
+ "- It is a probabilistic classifier and therefore calculates the probability of each category\n",
 "- It is fast, reliable and accurate \n",
- "- Naive Bayes Classifiers have already been used to solve many Natural Language Processing(NLP) applications.\n",
+ "- Naive Bayes Classifiers have already been used to solve many Natural Language Processing (NLP) applications.\n",
 "\n",
- "Here we would here be covering an example of **Word Sense Disambiguation** as an application of Text Classification. It is used to remove the ambiquity of a given word, if the word has 2 different meanings.\n",
+ "Here we will be covering an example of **Word Sense Disambiguation** as an application of Text Classification. It is used to remove the ambiguity of a given word if the word has two different meanings.\n",
 "\n",
- "As we know that we would be working on determining weather the word *apple* in a sentence reffers to `fruit` or to a `company`.\n"
+ "We will be working on determining whether the word *apple* in a sentence refers to `fruit` or to a `company`."
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "**Step 1:- Defining the dataset** \n",
 "\n",
- "The dataset has been defined here itself so that everything is clear and can be tested with other things as well."
+ "The dataset has been defined here so that everything is clear and can easily be tested with other examples as well."
 ]
 },
@@ -883,7 +883,7 @@
 "\n",
 "Now we would be extracting features from the text like extracting the set of words used in both the categories i.e. `company` and `fruit`.\n",
 "\n",
- "This frequency of a word would help in calculating the probability of that word being in a particular class. "
+ "The frequency of a word would help in calculating the probability of that word being in a particular class. "
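Since the per-class word frequencies of Step 3 are exactly what feeds the estimate of P(word | class), a short sketch of that computation with `collections.Counter` may help. It reuses the notebook's `train_data_processed`, `train_target`, `class_0` and `class_1` from the cells above, and assumes `words()` lower-cases its input.

```python
from collections import Counter

# Same tallies as in Step 3, but with Counter so we can read off frequencies.
counter_0 = Counter()
counter_1 = Counter()
for sent, tag in zip(train_data_processed, train_target):
    if tag == class_0:
        counter_0.update(sent)
    elif tag == class_1:
        counter_1.update(sent)

# Unsmoothed relative frequency of 'apple' in each class,
# i.e. the raw estimate of P('apple' | class).
for name, counter in [(class_0, counter_0), (class_1, counter_1)]:
    total = sum(counter.values())
    print("P('apple' | {}) = {:.3f}".format(name, counter['apple'] / total))
```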
" ] }, { @@ -910,28 +910,28 @@ " elif(tag == class_1):\n", " words_1 += sent\n", " \n", - "print(\"Number of words in `\" + class_0 + \"` class:\", len(words_0))\n", - "print(\"Number of words in `\" + class_1 + \"` class:\", len(words_1))" + "print(\"Number of words in `{}` class: {}\".format(class_0, len(words_0)))\n", + "print(\"Number of words in `{}` class: {}\".format(class_1, len(words_1)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As you might have observed that our dataset is equally balanced i.e. we have an equal number of words in both the classes." + "As you might have observed, that our dataset is equally balanced, i.e. we have an equal number of words in both the classes." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Step 4:-Making the Naive Bayes Model**\n", + "**Step 4:- Building the Naive Bayes Model**\n", "\n", - "Using Naive Bayes classifier we can calculate the probability of a word in `company` and `fruit` class and then multiplying all of them to get the probability of that sentence belonging each of the given classes. But if a word is not in our dictionary then this leads to the probability of that class becoming zero. For eg:- the word *Foxconn* is not in the dictionary of any of the classes. Due to this the \n", + "Using the Naive Bayes classifier we can calculate the probability of a word in `company` and `fruit` class and then multiplying all of them to get the probability of that sentence belonging each of the given classes. But if a word is not in our dictionary then this leads to the probability of that word belonging to that class becoming zero. For example:- the word *Foxconn* is not in the dictionary of any of the classes. Due to this, the probability of word *Foxconn* being in any of these classes becomes zero, and since all the probabilities are multiplied, this leads to the probability of that sentence belonging to any of the classes becoming zero. \n", "\n", "To solve the problem we need to use **smoothing**, i.e. providing a minimum non-zero threshold probability to every word that we come across.\n", "\n", - "The `UnigramWordModel` class has implemented smoothing by taking an additional argument from the user i.e. the minimum frequency that we would be giving to every word even if it is new to the dictionary." + "The `UnigramWordModel` class has implemented smoothing by taking an additional argument from the user, i.e. the minimum frequency that we would be giving to every word even if it is new to the dictionary." ] }, { @@ -948,7 +948,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we would be making the Naive Bayes model. For that, we would be making `dist` as we had done earlier in the Authorship Recognition Task." + "Now we would be building the Naive Bayes model. For that, we would be making `dist` as we had done earlier in the Authorship Recognition Task." ] }, { @@ -961,16 +961,16 @@ "\n", "dist = {('company', 1): model_words_0, ('fruit', 1): model_words_1}\n", "\n", - "nBS = NaiveBayesLearner(dist, simple = True)" + "nBS = NaiveBayesLearner(dist, simple=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Step 5:- Predict the class of the label**\n", + "**Step 5:- Predict the class of a sentence**\n", "\n", - "Now we would be making a function that does pre-procesing of the sentences which we have taken for testing. And then predicting the class of every sentence in the document." 
+ "Now we will be writing a function that does pre-process of the sentences which we have taken for testing. And then predicting the class of every sentence in the document." ] }, { @@ -999,7 +999,7 @@ } ], "source": [ - "# prediction the class of every sentence in the test set\n", + "# predicting the class of sentences in the test set\n", "for i in test_data:\n", " print(i + \"\\t-\" + recognize(i, nBS))" ] @@ -1008,17 +1008,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You might have observed that our predictions are correct. Though they might not give correct results because of lack of data. And we are clearly able to differentiate between sentences in a much better way. \n", + "You might have observed that the predictions made by the model are correct and we are able to differentiate between sentences of different classes. You can try more sentences on your own. Unfortunately though, since the datasets are pretty small, chances are the guesses will not always be correct.\n", "\n", - "As you might have observed that the above method is very much similar to the Authorship prediction, which is also a type of Text Classification. Like this most of Text Classification have the same underlying structure and follow a similar procedure." + "As you might have observed, the above method is very much similar to the Author Recognition, which is also a type of Text Classification. Like this most of Text Classification have the same underlying structure and follow a similar procedure." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {