# from the third edition until this gets reviewed.)

from collections import defaultdict
import urllib.request
import re

# ______________________________________________________________________________
# Grammars and Lexicons

@@ -206,3 +208,178 @@ def CYK_parse(words, grammar):
                P[X, start, length] = max(P[X, start, length],
                                          P[Y, start, len1] * P[Z, start+len1, len2] * p)
    return P


# ______________________________________________________________________________
# Page Ranking

# The first entry in the list is the base URL; the entries that follow are page
# names relative to that base.
examplePagesSet = ["https://en.wikipedia.org/wiki/", "Aesthetics", "Analytic_philosophy",
                   "Ancient_Greek", "Aristotle", "Astrology", "Atheism", "Baruch_Spinoza",
                   "Belief", "Bertrand_Russell", "Confucius", "Consciousness",
                   "Continental_philosophy", "Dialectic", "Eastern_Philosophy",
                   "Epistemology", "Ethics", "Existentialism", "Friedrich_Nietzsche",
                   "Idealism", "Immanuel_Kant", "List_of_political_philosophers", "Logic",
                   "Metaphysics", "Philosophers", "Philosophy", "Philosophy_of_mind", "Physics",
                   "Plato", "Political_philosophy", "Pythagoras", "Rationalism", "Social_philosophy",
                   "Socrates", "Subjectivity", "Theology", "Truth", "Western_philosophy"]


def loadPageHTML( addressList ):
    """Download the HTML page content for every URL address passed as argument"""
    contentDict = {}
    for addr in addressList:
        with urllib.request.urlopen(addr) as response:
            raw_html = response.read().decode('utf-8')
            # Strip the raw HTML of unnecessary content: basically everything that isn't a link or text
            html = stripRawHTML(raw_html)
            contentDict[addr] = html
    return contentDict

def initPages( addressList ):
    """Create a dictionary of pages from a list of URL addresses"""
    pages = {}
    for addr in addressList:
        pages[addr] = Page(addr)
    return pages

def stripRawHTML( raw_html ):
    """Remove the <head> section of the HTML, which contains links to stylesheets etc.,
    and remove all other unnecessary HTML"""
    # TODO: Strip more out of the raw html
    return re.sub("<head>.*?</head>", "", raw_html, flags=re.DOTALL)  # remove <head> section

def determineInlinks( page ):
    """Given a set of pages that have their outlinks determined, we can fill
    out a page's inlinks by looking through all the other pages' outlinks"""
    inlinks = []
    for addr, indexPage in pagesIndex.items():
        if page.address == indexPage.address:
            continue
        elif page.address in indexPage.outlinks:
            inlinks.append(addr)
    return inlinks

def findOutlinks( page, handleURLs=None ):
    """Search a page's HTML content for URL links to other pages"""
    urls = re.findall(r'href=[\'"]?([^\'" >]+)', pagesContent[page.address])
    if handleURLs:
        urls = handleURLs(urls)
    return urls
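
# For example (illustrative input only): on HTML containing '<a href="/wiki/Logic">Logic</a>'
# the pattern above captures '/wiki/Logic'; a handleURLs callback such as onlyWikipediaURLS
# (below) can then expand that to 'https://en.wikipedia.org/wiki/Logic'.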

def onlyWikipediaURLS( urls ):
    """Some example HTML page data is from Wikipedia. This function converts
    relative Wikipedia links to full Wikipedia URLs"""
    wikiURLs = [url for url in urls if url.startswith('/wiki/')]
    return ["https://en.wikipedia.org"+url for url in wikiURLs]
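
# The sketch below is only an illustration of how the helpers above are meant to be
# wired together; it is not called anywhere in this module. It assumes network access,
# uses the examplePagesSet defined above, and fills the module-level pagesContent and
# pagesIndex dictionaries that the HITS code further down relies on. The function name
# examplePageSetup is purely for demonstration.
def examplePageSetup():
    """Illustrative sketch: download the example pages and index their links."""
    global pagesContent, pagesIndex
    addresses = [examplePagesSet[0] + page for page in examplePagesSet[1:]]
    pagesContent = loadPageHTML(addresses)    # URL -> stripped HTML content
    pagesIndex = initPages(addresses)         # URL -> Page object
    for page in pagesIndex.values():          # outlinks must be known before inlinks
        page.outlinks = findOutlinks(page, handleURLs=onlyWikipediaURLS)
    for page in pagesIndex.values():
        page.inlinks = determineInlinks(page)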


# ______________________________________________________________________________
# HITS Helper Functions

def expand_pages( pages ):
    """From the textbook: adds in every page that links to or is linked from one of
    the relevant pages."""
    expanded = {}
    for addr, page in pages.items():
        if addr not in expanded:
            expanded[addr] = page
        for inlink in page.inlinks:
            if inlink not in expanded:
                expanded[inlink] = pagesIndex[inlink]
        for outlink in page.outlinks:
            if outlink not in expanded:
                expanded[outlink] = pagesIndex[outlink]
    return expanded

def relevant_pages(query):
    """Relevant pages are pages whose content contains the query in its entirety.
    If a page's content contains the query, that page is returned by the function."""
    relevant = {}
    for addr, page in pagesIndex.items():
        if query.lower() in pagesContent[addr].lower():
            relevant[addr] = page
    return relevant

def normalize( pages ):
    """Divide each page's score by the square root of the sum of the squares of all
    pages' scores (separately for the authority and the hub scores), so that each set
    of scores is normalized to a unit-length vector, as in the pseudocode's NORMALIZE step."""
    summed_hub = sum(page.hub**2 for _, page in pages.items())
    summed_auth = sum(page.authority**2 for _, page in pages.items())
    for _, page in pages.items():
        page.hub /= summed_hub**0.5
        page.authority /= summed_auth**0.5
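
# Worked example (illustrative numbers only): with hub scores 3 and 4 the sum of
# squares is 25, so each hub is divided by sqrt(25) = 5, giving 0.6 and 0.8, and
# 0.6**2 + 0.8**2 == 1 as expected for a unit-length vector.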

class ConvergenceDetector(object):
    """If the hub and authority values of the pages are no longer changing, we have
    reached convergence and further iterations will have no effect. This detects convergence
    so that we can stop the HITS algorithm as early as possible."""
    def __init__(self):
        self.hub_history = None
        self.auth_history = None

    def __call__(self):
        return self.detect()

    def detect(self):
        curr_hubs = [page.hub for addr, page in pagesIndex.items()]
        curr_auths = [page.authority for addr, page in pagesIndex.items()]
        if self.hub_history is None:
            self.hub_history, self.auth_history = [], []
        else:
            diffsHub = [abs(x-y) for x, y in zip(curr_hubs, self.hub_history[-1])]
            diffsAuth = [abs(x-y) for x, y in zip(curr_auths, self.auth_history[-1])]
            aveDeltaHub = sum(diffsHub)/float(len(pagesIndex))
            aveDeltaAuth = sum(diffsAuth)/float(len(pagesIndex))
            if aveDeltaHub < 0.01 and aveDeltaAuth < 0.01:  # threshold may need tweaking
                return True
        if len(self.hub_history) > 2:  # prevent the history lists from growing without bound
            del self.hub_history[0]
            del self.auth_history[0]
        self.hub_history.append(list(curr_hubs))
        self.auth_history.append(list(curr_auths))
        return False


def getInlinks( page ):
    if not page.inlinks:
        page.inlinks = determineInlinks(page)
    return [p for addr, p in pagesIndex.items() if addr in page.inlinks]

def getOutlinks( page ):
    if not page.outlinks:
        page.outlinks = findOutlinks(page)
    return [p for addr, p in pagesIndex.items() if addr in page.outlinks]


# ______________________________________________________________________________
# HITS Algorithm

class Page(object):
    """A web page, with its address, hub and authority scores, and its in/outlinks."""
    def __init__(self, address, hub=0, authority=0, inlinks=None, outlinks=None):
        self.address = address
        self.hub = hub
        self.authority = authority
        self.inlinks = inlinks
        self.outlinks = outlinks

pagesContent = {}  # maps a page's relative or absolute URL/location to that page's HTML content
pagesIndex = {}    # maps a page's URL/location to its Page object
convergence = ConvergenceDetector()  # assign the detector to a variable to mimic the pseudocode's syntax

def HITS(query):
    """The HITS algorithm for computing hubs and authorities with respect to a query."""
    pages = expand_pages(relevant_pages(query))  # in order to 'map' faithfully to the pseudocode
    for p in pages.values():                     # we don't pass the set of pages as an argument
        p.authority = 1
        p.hub = 1
    while True:  # repeat until... convergence
        for p in pages.values():
            p.authority = sum(x.hub for x in getInlinks(p))   # p.authority ← ∑i Inlinki(p).Hub
            p.hub = sum(x.authority for x in getOutlinks(p))  # p.hub ← ∑i Outlinki(p).Authority
        normalize(pages)
        if convergence():
            break
    return pages
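

# A minimal usage sketch (illustrative only, not called anywhere): it assumes that
# pagesContent and pagesIndex have already been populated, for example by the
# examplePageSetup() sketch above, and 'Philosophy' is just a sample query string.
def exampleHITSQuery(query='Philosophy', n=5):
    """Illustrative sketch: run HITS for query and return the n highest-authority pages."""
    ranked = sorted(HITS(query).values(), key=lambda p: p.authority, reverse=True)
    return ranked[:n]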