From a3d7e5f5f397b39ff2807661112d0c35099c9866 Mon Sep 17 00:00:00 2001 From: Anthony Marakis Date: Fri, 11 Aug 2017 01:28:08 +0300 Subject: [PATCH 1/3] Update nlp.py --- nlp.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nlp.py b/nlp.py index 2810d9910..f34d088b5 100644 --- a/nlp.py +++ b/nlp.py @@ -1,8 +1,5 @@ """Natural Language Processing; Chart Parsing and PageRanking (Chapter 22-23)""" -# (Written for the second edition of AIMA; expect some discrepanciecs -# from the third edition until this gets reviewed.) - from collections import defaultdict from utils import weighted_choice import urllib.request @@ -274,7 +271,7 @@ def __repr__(self): class Chart: - """Class for parsing sentences using a chart data structure. [Figure 22.7] + """Class for parsing sentences using a chart data structure. >>> chart = Chart(E0); >>> len(chart.parses('the stench is in 2 2')) 1 From 3d596db7c5c0a26c0cbb6e0b7d438e3d727ea2ef Mon Sep 17 00:00:00 2001 From: Anthony Marakis Date: Fri, 11 Aug 2017 01:29:00 +0300 Subject: [PATCH 2/3] add chart parsing test --- tests/test_nlp.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index ae7c52822..1d8320cdc 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -5,7 +5,7 @@ from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks from nlp import getOutlinks, Page, determineInlinks, HITS from nlp import Rules, Lexicon, Grammar, ProbRules, ProbLexicon, ProbGrammar -from nlp import CYK_parse +from nlp import Chart, CYK_parse # Clumsy imports because we want to access certain nlp.py globals explicitly, because # they are accessed by functions within nlp.py @@ -101,6 +101,12 @@ def test_prob_generation(): assert len(sentence) == 2 +def test_chart_parsing(): + chart = Chart(nlp.E0) + parses = chart.parses('the stench is in 2 2') + assert len(parses) == 1 + + def test_CYK_parse(): grammar = nlp.E_Prob_Chomsky words = ['the', 
'robot', 'is', 'good'] From 37b71cbba86101b75c2c033c22c68c0bb3f7d175 Mon Sep 17 00:00:00 2001 From: Anthony Marakis Date: Fri, 11 Aug 2017 01:29:40 +0300 Subject: [PATCH 3/3] add chart parsing section --- nlp.ipynb | 253 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 250 insertions(+), 3 deletions(-) diff --git a/nlp.ipynb b/nlp.ipynb index 432107673..fba613ef7 100644 --- a/nlp.ipynb +++ b/nlp.ipynb @@ -22,7 +22,7 @@ "import nlp\n", "from nlp import Page, HITS\n", "from nlp import Lexicon, Rules, Grammar, ProbLexicon, ProbRules, ProbGrammar\n", - "from nlp import CYK_parse" + "from nlp import CYK_parse, Chart" ] }, { @@ -36,7 +36,9 @@ "* Overview\n", "* Languages\n", "* HITS\n", - "* Question Answering" + "* Question Answering\n", + "* CYK Parse\n", + "* Chart Parsing" ] }, { @@ -45,7 +47,11 @@ "source": [ "## OVERVIEW\n", "\n", - "`TODO...`" + "**Natural Language Processing (NLP)** is a field of AI concerned with understanding, analyzing and using natural languages. This field is considered a difficult yet intriguing field of study, since it is connected to how humans and their languages work.\n", + "\n", + "Applications of the field include translation, speech recognition, topic segmentation, information extraction and retrieval, and a lot more.\n", + "\n", + "Below we take a look at some algorithms in the field. Before we get right into it though, we will take a look at a very useful form of language, **context-free** languages. Even though they are a bit restrictive, they have been used a lot in research in natural language processing." ] }, { @@ -908,6 +914,247 @@ "\n", "Notice how the probability for the whole string (given by the key `('S', 0, 4)`) is 0.015. This means the most probable parsing of the sentence has a probability of 0.015." ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CHART PARSING\n", + "\n", + "### Overview\n", + "\n", + "Let's now take a look at a more general chart parsing algorithm. 
Given a non-probabilistic grammar and a sentence, this algorithm builds a parse tree in a top-down manner, with the words of the sentence as the leaves. It works with a dynamic programming approach, building a chart to store parses for substrings so that it doesn't have to analyze them again (just like the CYK algorithm). Each non-terminal, starting from S, gets replaced by its right-hand side rules in the chart, until we end up with the correct parses.\n", + "\n", + "### Implementation\n", + "\n", + "A parse is in the form `[start, end, non-terminal, sub-tree, expected-transformation]`, where `sub-tree` is a tree with the corresponding `non-terminal` as its root and `expected-transformation` is a right-hand side rule of the `non-terminal`.\n", + "\n", + "The chart parsing is implemented in a class, `Chart`. It is initialized with a grammar and can return the list of all the parses of a sentence with the `parses` function.\n", + "\n", + "The chart is a list of lists. The lists correspond to the lengths of substrings (including the empty string), from start to finish. When we say 'a point in the chart', we refer to the list that corresponds to a certain substring length.\n", + "\n", + "A quick rundown of the class functions:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "* `parses`: Returns a list of parses for a given sentence. If the sentence can't be parsed, it will return an empty list. Initializes the process by calling `parse` from the starting symbol.\n", + "\n", + "\n", + "* `parse`: Parses the list of words and builds the chart.\n", + "\n", + "\n", + "* `add_edge`: Adds another edge to the chart at a given point. Also, examines whether the edge extends or predicts another edge. 
If the edge itself is not expecting a transformation, it will extend other edges; otherwise, it will predict edges.\n", + "\n", + "\n", + "* `scanner`: Given a word and a point in the chart, it extends edges that were expecting a transformation that can result in the given word. For example, if the word 'the' is an 'Article' and we are examining two edges at a chart's point, with one expecting an 'Article' and the other a 'Verb', the first one will be extended while the second one will not.\n", + "\n", + "\n", + "* `predictor`: If an edge can't extend other edges (because it is expecting a transformation itself), we will add to the chart rules/transformations that can help extend the edge. The new edges come from the right-hand side of the expected transformation's rules. For example, if an edge is expecting the transformation 'Adjective Noun', we will add to the chart an edge for each right-hand side rule of the non-terminal 'Adjective'.\n", + "\n", + "\n", + "* `extender`: Extends edges given an edge (called `E`). If `E`'s non-terminal is the same as the expected transformation of another edge (let's call it `A`), add to the chart a new edge with the non-terminal of `A` and the transformations of `A` minus the non-terminal that matched with `E`'s non-terminal. For example, if an edge `E` has 'Article' as its non-terminal and is expecting no transformation, we need to see what edges it can extend. Let's examine the edge `N`. This expects a transformation of 'Noun Verb'. 'Noun' does not match with 'Article', so we move on. Another edge, `A`, expects a transformation of 'Article Noun' and has a non-terminal of 'NP'. We have a match! A new edge will be added with 'NP' as its non-terminal (the non-terminal of `A`) and 'Noun' as the expected transformation (the rest of the expected transformation of `A`)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example\n", + "\n", + "We will use the grammar `E0` to parse the sentence \"the stench is in 2 2\".\n", + "\n", + "First we need to build a `Chart` object:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "chart = Chart(nlp.E0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then we simply call the `parses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0, 6, 'S', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []], [2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]], []]]\n" + ] + } + ], + "source": [ + "print(chart.parses('the stench is in 2 2'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see which edges get added by setting the optional initialization argument `trace` to `True`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Chart: added [0, 0, 'S_', [], ['S']]\n", + "Chart: added [0, 0, 'S', [], ['NP', 'VP']]\n", + "Chart: added [0, 0, 'NP', [], ['Pronoun']]\n", + "Chart: added [0, 0, 'NP', [], ['Name']]\n", + "Chart: added [0, 0, 'NP', [], ['Noun']]\n", + "Chart: added [0, 0, 'NP', [], ['Article', 'Noun']]\n", + "Chart: added [0, 0, 'NP', [], ['Digit', 'Digit']]\n", + "Chart: added [0, 0, 'NP', [], ['NP', 'PP']]\n", + "Chart: added [0, 0, 'NP', [], ['NP', 'RelClause']]\n", + "Chart: added [0, 0, 'S', [], ['S', 'Conjunction', 'S']]\n", + "Chart: added [0, 1, 'NP', [('Article', 'the')], ['Noun']]\n", + "Chart: added [0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []]\n", + "Chart: added [0, 2, 'S', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []]], ['VP']]\n", + "Chart: added [2, 2, 'VP', [], ['Verb']]\n", + "Chart: added [2, 2, 'VP', [], ['VP', 'NP']]\n", + "Chart: added [2, 2, 'VP', [], ['VP', 'Adjective']]\n", + "Chart: added [2, 2, 'VP', [], ['VP', 'PP']]\n", + "Chart: added [2, 2, 'VP', [], ['VP', 'Adverb']]\n", + "Chart: added [0, 2, 'NP', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []]], ['PP']]\n", + "Chart: added [2, 2, 'PP', [], ['Preposition', 'NP']]\n", + "Chart: added [0, 2, 'NP', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []]], ['RelClause']]\n", + "Chart: added [2, 2, 'RelClause', [], ['That', 'VP']]\n", + "Chart: added [2, 3, 'VP', [('Verb', 'is')], []]\n", + "Chart: added [0, 3, 'S', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []], [2, 3, 'VP', [('Verb', 'is')], []]], []]\n", + "Chart: added [0, 3, 'S_', [[0, 3, 'S', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []], [2, 3, 'VP', [('Verb', 'is')], []]], []]], []]\n", + "Chart: added [0, 3, 'S', [[0, 3, 'S', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []], [2, 3, 
'VP', [('Verb', 'is')], []]], []]], ['Conjunction', 'S']]\n", + "Chart: added [2, 3, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []]], ['NP']]\n", + "Chart: added [3, 3, 'NP', [], ['Pronoun']]\n", + "Chart: added [3, 3, 'NP', [], ['Name']]\n", + "Chart: added [3, 3, 'NP', [], ['Noun']]\n", + "Chart: added [3, 3, 'NP', [], ['Article', 'Noun']]\n", + "Chart: added [3, 3, 'NP', [], ['Digit', 'Digit']]\n", + "Chart: added [3, 3, 'NP', [], ['NP', 'PP']]\n", + "Chart: added [3, 3, 'NP', [], ['NP', 'RelClause']]\n", + "Chart: added [2, 3, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []]], ['Adjective']]\n", + "Chart: added [2, 3, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []]], ['PP']]\n", + "Chart: added [3, 3, 'PP', [], ['Preposition', 'NP']]\n", + "Chart: added [2, 3, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []]], ['Adverb']]\n", + "Chart: added [3, 4, 'PP', [('Preposition', 'in')], ['NP']]\n", + "Chart: added [4, 4, 'NP', [], ['Pronoun']]\n", + "Chart: added [4, 4, 'NP', [], ['Name']]\n", + "Chart: added [4, 4, 'NP', [], ['Noun']]\n", + "Chart: added [4, 4, 'NP', [], ['Article', 'Noun']]\n", + "Chart: added [4, 4, 'NP', [], ['Digit', 'Digit']]\n", + "Chart: added [4, 4, 'NP', [], ['NP', 'PP']]\n", + "Chart: added [4, 4, 'NP', [], ['NP', 'RelClause']]\n", + "Chart: added [4, 5, 'NP', [('Digit', '2')], ['Digit']]\n", + "Chart: added [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]\n", + "Chart: added [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]\n", + "Chart: added [2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]\n", + "Chart: added [0, 6, 'S', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []], [2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]], []]\n", + "Chart: added [0, 6, 'S_', [[0, 6, 'S', [[0, 2, 'NP', [('Article', 'the'), 
('Noun', 'stench')], []], [2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]], []]], []]\n", + "Chart: added [0, 6, 'S', [[0, 6, 'S', [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []], [2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]], []]], ['Conjunction', 'S']]\n", + "Chart: added [2, 6, 'VP', [[2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]], ['NP']]\n", + "Chart: added [6, 6, 'NP', [], ['Pronoun']]\n", + "Chart: added [6, 6, 'NP', [], ['Name']]\n", + "Chart: added [6, 6, 'NP', [], ['Noun']]\n", + "Chart: added [6, 6, 'NP', [], ['Article', 'Noun']]\n", + "Chart: added [6, 6, 'NP', [], ['Digit', 'Digit']]\n", + "Chart: added [6, 6, 'NP', [], ['NP', 'PP']]\n", + "Chart: added [6, 6, 'NP', [], ['NP', 'RelClause']]\n", + "Chart: added [2, 6, 'VP', [[2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]], ['Adjective']]\n", + "Chart: added [2, 6, 'VP', [[2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]], ['PP']]\n", + "Chart: added [6, 6, 'PP', [], ['Preposition', 'NP']]\n", + "Chart: added [2, 6, 'VP', [[2, 6, 'VP', [[2, 3, 'VP', [('Verb', 'is')], []], [3, 6, 'PP', [('Preposition', 'in'), [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], []]], []]], ['Adverb']]\n", + "Chart: added [4, 6, 'NP', [[4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], ['PP']]\n", + "Chart: added [4, 6, 'NP', [[4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]], ['RelClause']]\n", + "Chart: added [6, 6, 'RelClause', [], ['That', 'VP']]\n" + ] + }, + { + "data": { + "text/plain": [ + 
"[[0,\n", + " 6,\n", + " 'S',\n", + " [[0, 2, 'NP', [('Article', 'the'), ('Noun', 'stench')], []],\n", + " [2,\n", + " 6,\n", + " 'VP',\n", + " [[2, 3, 'VP', [('Verb', 'is')], []],\n", + " [3,\n", + " 6,\n", + " 'PP',\n", + " [('Preposition', 'in'),\n", + " [4, 6, 'NP', [('Digit', '2'), ('Digit', '2')], []]],\n", + " []]],\n", + " []]],\n", + " []]]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chart_trace = Chart(nlp.E0, trace=True)\n", + "chart_trace.parses('the stench is in 2 2')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try and parse a sentence that is not recognized by the grammar:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], + "source": [ + "print(chart.parses('the stench 2 2'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An empty list was returned." + ] } ], "metadata": {