diff --git a/nlp_apps.ipynb b/nlp_apps.ipynb index 458c55700..9cc7e2ef4 100644 --- a/nlp_apps.ipynb +++ b/nlp_apps.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "collapsed": true }, @@ -122,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -138,7 +138,7 @@ "'German'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -149,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -165,7 +165,7 @@ "'English'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -176,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -192,7 +192,7 @@ "'German'" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -203,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -219,7 +219,7 @@ "'English'" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -254,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "metadata": { "collapsed": true }, @@ -285,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "metadata": { "collapsed": true }, @@ -307,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": { "collapsed": true }, @@ -329,7 +329,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -338,7 +338,7 @@ "'Abbott'" ] }, - "execution_count": 4, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -358,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -367,7 +367,7 @@ "'Austen'" ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -402,10 +402,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, + "execution_count": 34, + "metadata": {}, "outputs": [], "source": [ "from utils import open_data\n", @@ -423,16 +421,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'The Project Gutenberg EBook of The Federalist Papers, by \\nAlexander Hamilton and John Jay and James Madison\\n\\nThis eBook is for the use of anyone anywhere at no cost and with\\nalmost no restrictions whatsoever. You may copy it, give it away or\\nre-use it under the terms of the Project Gutenberg License included\\nwith this eBook or online at www.gutenberg.net\\n\\n\\nTitle: The Federalist Papers\\n\\nAuthor: Alexander Hamilton\\n John Jay\\n James Madison\\n\\nPosting Date: December 12, 2011 [EBook #18]'" + "'The Project Gutenberg EBook of The Federalist Papers, by Alexander Hamilton and John Jay and James Madison\\n\\nThis eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net\\n\\nTitle: The Federalist Papers\\n\\nAuthor: Alexander Hamilton\\n John Jay\\n James Madison\\nPosting Date: December 12, 2011 [EBook #18] Rel'" ] }, - "execution_count": 2, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -450,14 +448,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "wordseq = words(federalist)\n", - "wordseq = wordseq[114:-3098]" + "wordseqs = words(federalist)\n", + "wordseqs = wordseqs[114:-3098]" ] }, { @@ -469,7 +467,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -478,13 +476,13 @@ "'federalist no 1 general introduction for the independent journal hamilton to the people of the state of new york after an unequivocal experience of the inefficacy of the subsisting federal government you are called upon to deliberate on a new constitution for the united states of america the subject speaks its own importance comprehending in its consequences nothing less than the existence of the union the safety and welfare of the parts of which it is composed the fate of an empire in many respects the most interesting in the world it has been frequently remarked that it seems to'" ] }, - "execution_count": 4, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "' '.join(wordseq[:100])" + "' '.join(wordseqs[:100])" ] }, { @@ -500,13 +498,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 38, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "wordseq = [w for w in wordseq if w != 'publius']" + "wordseqs = [w for w in wordseqs if w != 'publius']" ] }, { @@ -522,7 +520,37 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 39, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#removing stopwords\n", + "from nltk.corpus import stopwords\n", + "stop_words = set(stopwords.words('english')) \n", + "\n", + "wordseqs = [w for w in wordseqs if not w in stop_words] " + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#stemming and lemmatization\n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "lmtzr = WordNetLemmatizer()\n", + "\n", + "wordseq = [lmtzr.lemmatize(w) for w in wordseqs ]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -531,7 +559,7 @@ "(4, 16, 52)" ] }, - "execution_count": 6, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -568,7 +596,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 42, "metadata": { "collapsed": true }, @@ -603,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 43, "metadata": { "collapsed": true }, @@ -687,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 44, "metadata": { "collapsed": true }, @@ -707,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 45, "metadata": { "collapsed": true }, @@ -726,7 +754,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -736,8 +764,8 @@ "\n", "Straightforward Naive Bayes Learner\n", "\n", - "Paper No. 49: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", - "Paper No. 50: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", + "Paper No. 49: Hamilton: 0.0000 Madison: 0.0000 Jay: 1.0000\n", + "Paper No. 50: Hamilton: 0.0000 Madison: 0.0000 Jay: 1.0000\n", "Paper No. 51: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 52: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 53: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", @@ -746,27 +774,27 @@ "Paper No. 56: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 57: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", "Paper No. 58: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", - "Paper No. 18: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", - "Paper No. 19: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", - "Paper No. 20: Hamilton: 0.0000 Madison: 1.0000 Jay: 0.0000\n", + "Paper No. 18: Hamilton: 0.0000 Madison: 0.0000 Jay: 1.0000\n", + "Paper No. 19: Hamilton: 0.0000 Madison: 0.0000 Jay: 1.0000\n", + "Paper No. 20: Hamilton: 0.0000 Madison: 0.0000 Jay: 1.0000\n", "Paper No. 64: Hamilton: 1.0000 Madison: 0.0000 Jay: 0.0000\n", "\n", "Logarithmic Naive Bayes Learner\n", "\n", - "Paper No. 49: Hamilton: -0.330591 Madison: -0.327717 Jay: -0.341692\n", - "Paper No. 50: Hamilton: -0.333119 Madison: -0.328454 Jay: -0.338427\n", - "Paper No. 51: Hamilton: -0.330246 Madison: -0.325758 Jay: -0.343996\n", - "Paper No. 52: Hamilton: -0.331094 Madison: -0.327491 Jay: -0.341415\n", - "Paper No. 53: Hamilton: -0.330942 Madison: -0.328364 Jay: -0.340693\n", - "Paper No. 54: Hamilton: -0.329566 Madison: -0.327157 Jay: -0.343277\n", - "Paper No. 55: Hamilton: -0.330821 Madison: -0.328143 Jay: -0.341036\n", - "Paper No. 56: Hamilton: -0.330333 Madison: -0.327496 Jay: -0.342171\n", - "Paper No. 57: Hamilton: -0.330625 Madison: -0.328602 Jay: -0.340772\n", - "Paper No. 58: Hamilton: -0.330271 Madison: -0.327215 Jay: -0.342515\n", - "Paper No. 18: Hamilton: -0.337781 Madison: -0.330932 Jay: -0.331287\n", - "Paper No. 19: Hamilton: -0.335635 Madison: -0.331774 Jay: -0.332590\n", - "Paper No. 20: Hamilton: -0.334911 Madison: -0.331866 Jay: -0.333223\n", - "Paper No. 64: Hamilton: -0.331004 Madison: -0.332968 Jay: -0.336028\n" + "Paper No. 49: Hamilton: -0.332458 Madison: -0.327181 Jay: -0.340361\n", + "Paper No. 50: Hamilton: -0.334297 Madison: -0.328498 Jay: -0.337205\n", + "Paper No. 51: Hamilton: -0.331115 Madison: -0.325023 Jay: -0.343862\n", + "Paper No. 52: Hamilton: -0.332564 Madison: -0.326864 Jay: -0.340573\n", + "Paper No. 53: Hamilton: -0.332436 Madison: -0.328330 Jay: -0.339234\n", + "Paper No. 54: Hamilton: -0.331439 Madison: -0.326235 Jay: -0.342326\n", + "Paper No. 55: Hamilton: -0.332681 Madison: -0.327443 Jay: -0.339876\n", + "Paper No. 56: Hamilton: -0.331028 Madison: -0.326893 Jay: -0.342079\n", + "Paper No. 57: Hamilton: -0.331728 Madison: -0.328400 Jay: -0.339872\n", + "Paper No. 58: Hamilton: -0.332311 Madison: -0.327076 Jay: -0.340613\n", + "Paper No. 18: Hamilton: -0.341690 Madison: -0.333232 Jay: -0.325078\n", + "Paper No. 19: Hamilton: -0.338835 Madison: -0.334063 Jay: -0.327101\n", + "Paper No. 20: Hamilton: -0.336637 Madison: -0.333725 Jay: -0.329637\n", + "Paper No. 64: Hamilton: -0.331532 Madison: -0.334943 Jay: -0.333525\n" ] } ], @@ -795,15 +823,6 @@ "\n", "Unfortunately, it misses paper 64. Consensus is that the paper was written by John Jay, while our classifier believes it was written by Hamilton. The classifier is wrong there because it does not have much information on Jay's writing; only 4 papers. This is one of the problems with using unbalanced datasets such as this one, where information on some classes is sparser than information on the rest. To avoid this, we can add more writings for Jay and Madison to end up with an equal amount of data for each author." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -822,7 +841,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.6.3" } }, "nbformat": 4,