
Commit 04c7d51

Jonathon Belotti authored and norvig committed
Implementing HITS algorithm (aimacode#244)
* Implementing HITS algorithm
* Moving HITS work to nlp.py and test_nlp.py
1 parent 61ef267 commit 04c7d51

2 files changed: +296 -2 lines changed


nlp.py

Lines changed: 177 additions & 0 deletions
@@ -4,6 +4,8 @@
 # from the third edition until this gets reviewed.)
 
 from collections import defaultdict
+import urllib.request
+import re
 
 # ______________________________________________________________________________
 # Grammars and Lexicons
@@ -206,3 +208,178 @@ def CYK_parse(words, grammar):
                     P[X, start, length] = max(P[X, start, length],
                                               P[Y, start, len1] * P[Z, start+len1, len2] * p)
     return P
+
+
+# ______________________________________________________________________________
+# Page Ranking
+
+# First entry in list is the base URL, and then following are relative URL pages
+examplePagesSet = ["https://en.wikipedia.org/wiki/", "Aesthetics", "Analytic_philosophy",
+                   "Ancient_Greek", "Aristotle", "Astrology", "Atheism", "Baruch_Spinoza",
+                   "Belief", "Bertrand_Russell", "Confucius", "Consciousness",
+                   "Continental_philosophy", "Dialectic", "Eastern_Philosophy",
+                   "Epistemology", "Ethics", "Existentialism", "Friedrich_Nietzsche",
+                   "Idealism", "Immanuel_Kant", "List_of_political_philosophers", "Logic",
+                   "Metaphysics", "Philosophers", "Philosophy", "Philosophy_of_mind", "Physics",
+                   "Plato", "Political_philosophy", "Pythagoras", "Rationalism", "Social_philosophy",
+                   "Socrates", "Subjectivity", "Theology", "Truth", "Western_philosophy"]
+
+
+def loadPageHTML(addressList):
+    """Download HTML page content for every URL address passed as argument"""
+    contentDict = {}
+    for addr in addressList:
+        with urllib.request.urlopen(addr) as response:
+            raw_html = response.read().decode('utf-8')
+            # Strip raw html of unnecessary content. Basically everything that isn't a link or text
+            html = stripRawHTML(raw_html)
+            contentDict[addr] = html
+    return contentDict
+
+def initPages(addressList):
+    """Create a dictionary of pages from a list of URL addresses"""
+    pages = {}
+    for addr in addressList:
+        pages[addr] = Page(addr)
+    return pages
+
+def stripRawHTML(raw_html):
+    """Remove the <head> section of the HTML which contains links to stylesheets etc.,
+    and remove all other unnecessary HTML"""
+    # TODO: Strip more out of the raw html
+    return re.sub("<head>.*?</head>", "", raw_html, flags=re.DOTALL)  # remove <head> section
+
+def determineInlinks(page):
+    """Given a set of pages that have their outlinks determined, we can fill
+    out a page's inlinks by looking through all other pages' outlinks"""
+    inlinks = []
+    for addr, indexPage in pagesIndex.items():
+        if page.address == indexPage.address:
+            continue
+        elif page.address in indexPage.outlinks:
+            inlinks.append(addr)
+    return inlinks
+
+def findOutlinks(page, handleURLs=None):
+    """Search a page's HTML content for URL links to other pages"""
+    urls = re.findall(r'href=[\'"]?([^\'" >]+)', pagesContent[page.address])
+    if handleURLs:
+        urls = handleURLs(urls)
+    return urls
+
+def onlyWikipediaURLS(urls):
+    """Some example HTML page data is from wikipedia. This function converts
+    relative wikipedia links to full wikipedia URLs"""
+    wikiURLs = [url for url in urls if url.startswith('/wiki/')]
+    return ["https://en.wikipedia.org" + url for url in wikiURLs]
+
+
+# ______________________________________________________________________________
+# HITS Helper Functions
+
+def expand_pages(pages):
+    """From Textbook: adds in every page that links to or is linked from one of
+    the relevant pages."""
+    expanded = {}
+    for addr, page in pages.items():
+        if addr not in expanded:
+            expanded[addr] = page
+        for inlink in page.inlinks:
+            if inlink not in expanded:
+                expanded[inlink] = pagesIndex[inlink]
+        for outlink in page.outlinks:
+            if outlink not in expanded:
+                expanded[outlink] = pagesIndex[outlink]
+    return expanded
+
+def relevant_pages(query):
+    """Relevant pages are pages that contain the query in its entirety.
+    If a page's content contains the query it is returned by the function"""
+    relevant = {}
+    print("pagesContent in function: ", pagesContent)
+    for addr, page in pagesIndex.items():
+        if query.lower() in pagesContent[addr].lower():
+            relevant[addr] = page
+    return relevant
+
+def normalize(pages):
+    """From the pseudocode: Normalize divides each page's score by the sum of
+    the squares of all pages' scores (separately for both the authority and hub scores).
+    """
+    summed_hub = sum(page.hub**2 for _, page in pages.items())
+    summed_auth = sum(page.authority**2 for _, page in pages.items())
+    for _, page in pages.items():
+        page.hub /= summed_hub
+        page.authority /= summed_auth
+
+class ConvergenceDetector(object):
+    """If the hub and authority values of the pages are no longer changing, we have
+    reached a convergence and further iterations will have no effect. This detects convergence
+    so that we can stop the HITS algorithm as early as possible."""
+    def __init__(self):
+        self.hub_history = None
+        self.auth_history = None
+
+    def __call__(self):
+        return self.detect()
+
+    def detect(self):
+        curr_hubs = [page.hub for addr, page in pagesIndex.items()]
+        curr_auths = [page.authority for addr, page in pagesIndex.items()]
+        if self.hub_history is None:
+            self.hub_history, self.auth_history = [], []
+        else:
+            diffsHub = [abs(x - y) for x, y in zip(curr_hubs, self.hub_history[-1])]
+            diffsAuth = [abs(x - y) for x, y in zip(curr_auths, self.auth_history[-1])]
+            aveDeltaHub = sum(diffsHub) / float(len(pagesIndex))
+            aveDeltaAuth = sum(diffsAuth) / float(len(pagesIndex))
+            if aveDeltaHub < 0.01 and aveDeltaAuth < 0.01:  # may need tweaking
+                return True
+        if len(self.hub_history) > 2:  # prevent list from getting long
+            del self.hub_history[0]
+            del self.auth_history[0]
+        self.hub_history.append([x for x in curr_hubs])
+        self.auth_history.append([x for x in curr_auths])
+        return False
+
+
+def getInlinks(page):
+    if not page.inlinks:
+        page.inlinks = determineInlinks(page)
+    return [p for addr, p in pagesIndex.items() if addr in page.inlinks]
+
+def getOutlinks(page):
+    if not page.outlinks:
+        page.outlinks = findOutlinks(page)
+    return [p for addr, p in pagesIndex.items() if addr in page.outlinks]
+
+
+# ______________________________________________________________________________
+# HITS Algorithm
+
+class Page(object):
+    def __init__(self, address, hub=0, authority=0, inlinks=None, outlinks=None):
+        self.address = address
+        self.hub = hub
+        self.authority = authority
+        self.inlinks = inlinks
+        self.outlinks = outlinks
+
+pagesContent = {}  # maps a Page's relative or absolute URL/location to the page's HTML content
+pagesIndex = {}
+convergence = ConvergenceDetector()  # assign function to variable to mimic pseudocode's syntax
+
+def HITS(query):
+    """The HITS algorithm for computing hubs and authorities with respect to a query."""
+    pages = expand_pages(relevant_pages(query))  # in order to 'map' faithfully to pseudocode we
+    for p in pages.values():                     # won't pass the list of pages as an argument
+        p.authority = 1
+        p.hub = 1
+    while True:  # repeat until... convergence
+        for p in pages.values():
+            p.authority = sum(x.hub for x in getInlinks(p))   # p.authority ← ∑i Inlinki(p).Hub
+            p.hub = sum(x.authority for x in getOutlinks(p))  # p.hub ← ∑i Outlinki(p).Authority
+        normalize(pages)
+        if convergence():
+            break
+    return pages
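
A minimal end-to-end sketch of how these new pieces might fit together (illustrative only, not part of the committed diff). It assumes the caller populates the module globals introduced above, pagesContent and pagesIndex, much as tests/test_nlp.py does below; the query "philosophy" and the use of examplePagesSet are examples, and downloading all of the listed Wikipedia pages over the network is slow:

import nlp
from nlp import loadPageHTML, initPages, findOutlinks, onlyWikipediaURLS, HITS

# Build absolute URLs: base URL + relative page names from examplePagesSet
addresses = [nlp.examplePagesSet[0] + page for page in nlp.examplePagesSet[1:]]

# Download and index the pages, then record each page's outgoing links;
# inlinks are determined lazily by getInlinks() while HITS runs
nlp.pagesContent = loadPageHTML(addresses)   # address -> stripped HTML
nlp.pagesIndex = initPages(addresses)        # address -> Page object
for page in nlp.pagesIndex.values():
    page.outlinks = findOutlinks(page, handleURLs=onlyWikipediaURLS)

# Rank pages for a query and show the five strongest authorities
ranked = HITS("philosophy")
top = sorted(ranked.values(), key=lambda p: p.authority, reverse=True)[:5]
for page in top:
    print(page.address, page.authority)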

tests/test_nlp.py

Lines changed: 119 additions & 2 deletions
@@ -1,10 +1,127 @@
 import pytest
-from nlp import *
-
+import nlp
+from nlp import loadPageHTML, stripRawHTML, determineInlinks, findOutlinks, onlyWikipediaURLS
+from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
+from nlp import getOutlinks, Page, HITS
+from nlp import Rules, Lexicon
+# Clumsy imports because we want to access certain nlp.py globals explicitly, because
+# they are accessed by functions within nlp.py
 
 def test_rules():
     assert Rules(A="B C | D E") == {'A': [['B', 'C'], ['D', 'E']]}
 
 
 def test_lexicon():
     assert Lexicon(Art="the | a | an") == {'Art': ['the', 'a', 'an']}
+
+
+# ______________________________________________________________________________
+# Data Setup
+
+testHTML = """Keyword String 1: A man is a male human.
+Keyword String 2: Like most other male mammals, a man inherits an
+X from his mom and a Y from his dad.
+Links:
+href="https://google.com.au"
+< href="/wiki/TestThing" > href="/wiki/TestBoy"
+href="/wiki/TestLiving" href="/wiki/TestMan" >"""
+testHTML2 = "Nothing"
+
+pA = Page("A", 1, 6, ["B", "C", "E"], ["D"])
+pB = Page("B", 2, 5, ["E"], ["A", "C", "D"])
+pC = Page("C", 3, 4, ["B", "E"], ["A", "D"])
+pD = Page("D", 4, 3, ["A", "B", "C", "E"], [])
+pE = Page("E", 5, 2, [], ["A", "B", "C", "D", "F"])
+pF = Page("F", 6, 1, ["E"], [])
+pageDict = {pA.address: pA, pB.address: pB, pC.address: pC,
+            pD.address: pD, pE.address: pE, pF.address: pF}
+nlp.pagesIndex = pageDict
+nlp.pagesContent = {pA.address: testHTML, pB.address: testHTML2,
+                    pC.address: testHTML, pD.address: testHTML2,
+                    pE.address: testHTML, pF.address: testHTML2}
+
+# This test takes a long time (> 60 secs)
+# def test_loadPageHTML():
+#     # first format all the relative URLs with the base URL
+#     addresses = [examplePagesSet[0] + x for x in examplePagesSet[1:]]
+#     loadedPages = loadPageHTML(addresses)
+#     relURLs = ['Ancient_Greek', 'Ethics', 'Plato', 'Theology']
+#     fullURLs = ["https://en.wikipedia.org/wiki/" + x for x in relURLs]
+#     assert all(x in loadedPages for x in fullURLs)
+#     assert all(loadedPages.get(key, "") != "" for key in addresses)
+
+def test_stripRawHTML():
+    addr = "https://en.wikipedia.org/wiki/Ethics"
+    aPage = loadPageHTML([addr])
+    someHTML = aPage[addr]
+    strippedHTML = stripRawHTML(someHTML)
+    assert "<head>" not in strippedHTML and "</head>" not in strippedHTML
+
+def test_determineInlinks():
+    # TODO
+    assert True
+
+def test_findOutlinks_wiki():
+    testPage = pageDict[pA.address]
+    outlinks = findOutlinks(testPage, handleURLs=onlyWikipediaURLS)
+    assert "https://en.wikipedia.org/wiki/TestThing" in outlinks
+    assert "https://en.wikipedia.org/wiki/TestBoy" in outlinks
+    assert "https://google.com.au" not in outlinks
+# ______________________________________________________________________________
+# HITS Helper Functions
+
+def test_expand_pages():
+    pages = {k: pageDict[k] for k in ('F')}
+    pagesTwo = {k: pageDict[k] for k in ('A', 'E')}
+    expanded_pages = expand_pages(pages)
+    assert all(x in expanded_pages for x in ['F', 'E'])
+    assert all(x not in expanded_pages for x in ['A', 'B', 'C', 'D'])
+    expanded_pages = expand_pages(pagesTwo)
+    print(expanded_pages)
+    assert all(x in expanded_pages for x in ['A', 'B', 'C', 'D', 'E', 'F'])
+
+def test_relevant_pages():
+    pages = relevant_pages("male")
+    assert all((x in pages.keys()) for x in ['A', 'C', 'E'])
+    assert all((x not in pages) for x in ['B', 'D', 'F'])
+
+def test_normalize():
+    normalize(pageDict)
+    print(page.hub for addr, page in nlp.pagesIndex.items())
+    expected_hub = [1/91, 2/91, 3/91, 4/91, 5/91, 6/91]  # Works only for sample data above
+    expected_auth = list(reversed(expected_hub))
+    assert len(expected_hub) == len(expected_auth) == len(nlp.pagesIndex)
+    assert expected_hub == [page.hub for addr, page in sorted(nlp.pagesIndex.items())]
+    assert expected_auth == [page.authority for addr, page in sorted(nlp.pagesIndex.items())]
+
+def test_detectConvergence():
+    # run detectConvergence once to initialise history
+    convergence = ConvergenceDetector()
+    convergence()
+    assert convergence()  # values haven't changed so should return True
+    # make tiny increase/decrease to all values
+    for _, page in nlp.pagesIndex.items():
+        page.hub += 0.0003
+        page.authority += 0.0004
+    # retest function with values. Should still return True
+    assert convergence()
+    for _, page in nlp.pagesIndex.items():
+        page.hub += 3000000
+        page.authority += 3000000
+    # retest function with values. Should now return False
+    assert not convergence()
+
+def test_getInlinks():
+    inlnks = getInlinks(pageDict['A'])
+    assert sorted([page.address for page in inlnks]) == pageDict['A'].inlinks
+
+def test_getOutlinks():
+    outlnks = getOutlinks(pageDict['A'])
+    assert sorted([page.address for page in outlnks]) == pageDict['A'].outlinks
+
+def test_HITS():
+    # TODO
+    assert True  # leave for now
+
+if __name__ == '__main__':
+    pytest.main()
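
A quick sanity check on the expected values in test_normalize above (illustrative, not part of the committed diff): with the sample hub scores 1..6, the sum of squares is 1² + 2² + … + 6² = 91, so normalize, which divides by the sum of squares per its docstring, maps hub score i to i/91; the authority scores 6..1 have the same sum of squares and map to 6/91 … 1/91. A standalone check of that arithmetic:

hubs = [1, 2, 3, 4, 5, 6]                   # sample hub scores from pA..pF above
auths = list(reversed(hubs))                # sample authority scores 6..1
summed_hub = sum(h**2 for h in hubs)        # 1 + 4 + 9 + 16 + 25 + 36 = 91
summed_auth = sum(a**2 for a in auths)      # same terms, also 91
assert [h / summed_hub for h in hubs] == [i / 91 for i in range(1, 7)]
assert [a / summed_auth for a in auths] == [i / 91 for i in range(6, 0, -1)]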
