# from the third edition until this gets reviewed.)

from collections import defaultdict
import urllib.request
import re

# ______________________________________________________________________________
# Grammars and Lexicons

@@ -206,3 +208,178 @@ def CYK_parse(words, grammar):
                P[X, start, length] = max(P[X, start, length],
                                          P[Y, start, len1] * P[Z, start+len1, len2] * p)
    return P


# ______________________________________________________________________________
# Page Ranking

# The first entry in the list is the base URL; the entries that follow are page
# names relative to that base.
examplePagesSet = ["https://en.wikipedia.org/wiki/", "Aesthetics", "Analytic_philosophy",
                   "Ancient_Greek", "Aristotle", "Astrology", "Atheism", "Baruch_Spinoza",
                   "Belief", "Bertrand_Russell", "Confucius", "Consciousness",
                   "Continental_philosophy", "Dialectic", "Eastern_Philosophy",
                   "Epistemology", "Ethics", "Existentialism", "Friedrich_Nietzsche",
                   "Idealism", "Immanuel_Kant", "List_of_political_philosophers", "Logic",
                   "Metaphysics", "Philosophers", "Philosophy", "Philosophy_of_mind", "Physics",
                   "Plato", "Political_philosophy", "Pythagoras", "Rationalism", "Social_philosophy",
                   "Socrates", "Subjectivity", "Theology", "Truth", "Western_philosophy"]


def loadPageHTML( addressList ):
    """Download the HTML page content for every URL address passed as argument"""
    contentDict = {}
    for addr in addressList:
        with urllib.request.urlopen(addr) as response:
            raw_html = response.read().decode('utf-8')
            # Strip the raw HTML of unnecessary content: basically everything that isn't a link or text
            html = stripRawHTML(raw_html)
            contentDict[addr] = html
    return contentDict

def initPages( addressList ):
    """Create a dictionary of pages from a list of URL addresses"""
    pages = {}
    for addr in addressList:
        pages[addr] = Page(addr)
    return pages

def stripRawHTML( raw_html ):
    """Remove the <head> section of the HTML, which contains links to stylesheets etc.,
    and remove all other unnecessary HTML"""
    # TODO: Strip more out of the raw html
    return re.sub("<head>.*?</head>", "", raw_html, flags=re.DOTALL)  # remove <head> section

def determineInlinks( page ):
    """Given a set of pages that have their outlinks determined, we can fill
    out a page's inlinks by looking through all the other pages' outlinks"""
    inlinks = []
    for addr, indexPage in pagesIndex.items():
        if page.address == indexPage.address:
            continue
        elif page.address in indexPage.outlinks:
            inlinks.append(addr)
    return inlinks

def findOutlinks( page, handleURLs=None ):
    """Search a page's HTML content for URL links to other pages"""
    urls = re.findall(r'href=[\'"]?([^\'" >]+)', pagesContent[page.address])
    if handleURLs:
        urls = handleURLs(urls)
    return urls
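
# For example (illustrative input only): on HTML containing '<a href="/wiki/Logic">Logic</a>'
# the pattern above captures '/wiki/Logic'; a handleURLs callback such as onlyWikipediaURLS
# (below) can then expand that to 'https://en.wikipedia.org/wiki/Logic'.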

def onlyWikipediaURLS( urls ):
    """Some example HTML page data is from Wikipedia. This function converts
    relative Wikipedia links to full Wikipedia URLs"""
    wikiURLs = [url for url in urls if url.startswith('/wiki/')]
    return ["https://en.wikipedia.org"+url for url in wikiURLs]
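
# The sketch below is only an illustration of how the helpers above are meant to be
# wired together; it is not called anywhere in this module. It assumes network access,
# uses the examplePagesSet defined above, and fills the module-level pagesContent and
# pagesIndex dictionaries that the HITS code further down relies on. The function name
# examplePageSetup is purely for demonstration.
def examplePageSetup():
    """Illustrative sketch: download the example pages and index their links."""
    global pagesContent, pagesIndex
    addresses = [examplePagesSet[0] + page for page in examplePagesSet[1:]]
    pagesContent = loadPageHTML(addresses)    # URL -> stripped HTML content
    pagesIndex = initPages(addresses)         # URL -> Page object
    for page in pagesIndex.values():          # outlinks must be known before inlinks
        page.outlinks = findOutlinks(page, handleURLs=onlyWikipediaURLS)
    for page in pagesIndex.values():
        page.inlinks = determineInlinks(page)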


# ______________________________________________________________________________
# HITS Helper Functions

def expand_pages( pages ):
    """From the textbook: adds in every page that links to or is linked from one of
    the relevant pages."""
    expanded = {}
    for addr, page in pages.items():
        if addr not in expanded:
            expanded[addr] = page
        for inlink in page.inlinks:
            if inlink not in expanded:
                expanded[inlink] = pagesIndex[inlink]
        for outlink in page.outlinks:
            if outlink not in expanded:
                expanded[outlink] = pagesIndex[outlink]
    return expanded

def relevant_pages(query):
    """Relevant pages are pages whose content contains the query in its entirety.
    If a page's content contains the query, that page is returned by the function."""
    relevant = {}
    for addr, page in pagesIndex.items():
        if query.lower() in pagesContent[addr].lower():
            relevant[addr] = page
    return relevant

def normalize( pages ):
    """Divide each page's score by the square root of the sum of the squares of all
    pages' scores (separately for the authority and the hub scores), so that each set
    of scores is normalized to a unit-length vector, as in the pseudocode's NORMALIZE step."""
    summed_hub = sum(page.hub**2 for _, page in pages.items())
    summed_auth = sum(page.authority**2 for _, page in pages.items())
    for _, page in pages.items():
        page.hub /= summed_hub**0.5
        page.authority /= summed_auth**0.5
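
# Worked example (illustrative numbers only): with hub scores 3 and 4 the sum of
# squares is 25, so each hub is divided by sqrt(25) = 5, giving 0.6 and 0.8, and
# 0.6**2 + 0.8**2 == 1 as expected for a unit-length vector.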

class ConvergenceDetector(object):
    """If the hub and authority values of the pages are no longer changing, we have
    reached convergence and further iterations will have no effect. This detects convergence
    so that we can stop the HITS algorithm as early as possible."""
    def __init__(self):
        self.hub_history = None
        self.auth_history = None

    def __call__(self):
        return self.detect()

    def detect(self):
        curr_hubs = [page.hub for addr, page in pagesIndex.items()]
        curr_auths = [page.authority for addr, page in pagesIndex.items()]
        if self.hub_history is None:
            self.hub_history, self.auth_history = [], []
        else:
            diffsHub = [abs(x-y) for x, y in zip(curr_hubs, self.hub_history[-1])]
            diffsAuth = [abs(x-y) for x, y in zip(curr_auths, self.auth_history[-1])]
            aveDeltaHub = sum(diffsHub)/float(len(pagesIndex))
            aveDeltaAuth = sum(diffsAuth)/float(len(pagesIndex))
            if aveDeltaHub < 0.01 and aveDeltaAuth < 0.01:  # threshold may need tweaking
                return True
        if len(self.hub_history) > 2:  # prevent the history lists from growing without bound
            del self.hub_history[0]
            del self.auth_history[0]
        self.hub_history.append(list(curr_hubs))
        self.auth_history.append(list(curr_auths))
        return False


def getInlinks( page ):
    if not page.inlinks:
        page.inlinks = determineInlinks(page)
    return [p for addr, p in pagesIndex.items() if addr in page.inlinks]

def getOutlinks( page ):
    if not page.outlinks:
        page.outlinks = findOutlinks(page)
    return [p for addr, p in pagesIndex.items() if addr in page.outlinks]


# ______________________________________________________________________________
# HITS Algorithm

class Page(object):
    """A web page, with its address, hub and authority scores, and its in/outlinks."""
    def __init__(self, address, hub=0, authority=0, inlinks=None, outlinks=None):
        self.address = address
        self.hub = hub
        self.authority = authority
        self.inlinks = inlinks
        self.outlinks = outlinks

pagesContent = {}  # maps a page's relative or absolute URL/location to that page's HTML content
pagesIndex = {}    # maps a page's URL/location to its Page object
convergence = ConvergenceDetector()  # assign the detector to a variable to mimic the pseudocode's syntax

def HITS(query):
    """The HITS algorithm for computing hubs and authorities with respect to a query."""
    pages = expand_pages(relevant_pages(query))  # in order to 'map' faithfully to the pseudocode
    for p in pages.values():                     # we don't pass the set of pages as an argument
        p.authority = 1
        p.hub = 1
    while True:  # repeat until... convergence
        for p in pages.values():
            p.authority = sum(x.hub for x in getInlinks(p))   # p.authority ← ∑i Inlinki(p).Hub
            p.hub = sum(x.authority for x in getOutlinks(p))  # p.hub ← ∑i Outlinki(p).Authority
        normalize(pages)
        if convergence():
            break
    return pages
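

# A minimal usage sketch (illustrative only, not called anywhere): it assumes that
# pagesContent and pagesIndex have already been populated, for example by the
# examplePageSetup() sketch above, and 'Philosophy' is just a sample query string.
def exampleHITSQuery(query='Philosophy', n=5):
    """Illustrative sketch: run HITS for query and return the n highest-authority pages."""
    ranked = sorted(HITS(query).values(), key=lambda p: p.authority, reverse=True)
    return ranked[:n]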