CS 3308 - INFORMATION RETRIEVAL
UNIT 7 PROGRAMMING ASSIGNMENT
UNIVERSITY OF THE PEOPLE
SOURCE CODE OF THE ASSIGNMENT
import sqlite3
import re
import math
import time
import urllib.request as urllib2
import urllib.parse as urlparse
from html.parser import HTMLParser
# Stop words list
stopwords = ['the', 'of', 'and', 'to', 'in', 'you', 'it', 'with', 'that', 'or', 'was', 'he', 'is', 'for', 'this', 'his', 'as',
             'not', 'at', 'by', 'all', 'they', 'but', 'be', 'on', 'from', 'had', 'her', 'work', 'are', 'any', 'she', 'if', 'said', 'so',
             'which', 'have', 'do', 'we', 'no', 'my', 'were', 'them', 'their', 'him', 'one', 'will', 'me', 'there', 'who', 'up',
             'other', 'an', 'its', 'when', 'what', 'can', 'may', 'into', 'out', 'must', 'your', 'then', 'would', 'could', 'more',
             'now', 'has', 'like', 'down', 'where', 'been', 'through', 'did', 'away', 'these', 'such', 'set', 'back', 'some',
             'than', 'way', 'made', 'our', 'after', 'well', 'should', 'get', 'even', 'am', 'go', 'saw', 'just', 'put', 'while', 'ever',
             'off', 'here', 'also']
# Regular expressions
chars = re.compile(r'\W+')
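# A quick illustration of how the splitter above behaves (hypothetical input):
#   chars.split("Hello, world! It's 2024.")
#   -> ['Hello', 'world', 'It', 's', '2024', '']
# \W+ splits on every run of non-alphanumeric characters, so apostrophes
# break words apart and empty strings can appear at the edges.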
# Global counters
tokens = 0
documents = 0
terms = 0
class _TextExtractor(HTMLParser):
    """Collects the text content of an HTML page, discarding the tags."""
    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        self._chunks.append(data)

    def get_text(self):
        return ' '.join(self._chunks)

class Term:
    def __init__(self):
        self.termid = 0
        self.termfreq = 0
        self.docs = 0
        self.docids = {}

    @staticmethod
    def splitchars(line):
        return chars.split(line)

    @staticmethod
    def stripTags(s):
        # Using HTMLParser as a built-in alternative to BeautifulSoup:
        # feed the markup through a parser that keeps only the text nodes.
        # (HTMLParser.unescape only decodes entities and never removes tags,
        # so it cannot be used here.)
        extractor = _TextExtractor()
        extractor.feed(s)
        return extractor.get_text()
# Simple stemming function
def basic_stem(word):
    # Simple suffix stripping (just an example; not as robust as a Porter stemmer)
    suffixes = ['ing', 'ed', 'es', 's']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word  # Return the word unchanged if no suffix matches
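# Sample outputs of the stemmer above (note the order of the suffix list
# matters: 'ing' and 'ed' are checked before 'es' and 's'):
#   basic_stem('crawling') -> 'crawl'
#   basic_stem('indexed')  -> 'index'
#   basic_stem('queries')  -> 'queri'   # crude compared to a Porter stemmer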
def parsetoken(db, line):
    global documents
    global tokens
    global terms
    # Clean up the line
    line = line.replace('\t', ' ').strip()
    # Split the line into tokens
    tokens_list = Term.splitchars(line)
    # Process each token
    for elmt in tokens_list:
        elmt = elmt.replace('\n', '')
        lowerElmt = elmt.lower().strip()
        # Count every token encountered
        tokens += 1
        # Skip short tokens and stop words
        if len(lowerElmt) < 2 or lowerElmt in stopwords:
            continue
        # Skip tokens that are pure numbers
        try:
            int(lowerElmt)
            continue
        except ValueError:
            # Apply basic stemming to non-numeric tokens
            lowerElmt = basic_stem(lowerElmt)
        # Add a new term to the dictionary if it does not exist yet
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms
        # Update document frequency and term frequency
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0
        db[lowerElmt].docids[documents] += 1
    return tokens_list
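# Example of the in-memory index after parsing one hypothetical line
# (assuming documents == 1 at the time of the call):
#   db = {}
#   parsetoken(db, "crawling crawling links")
#   db['crawl'].docids -> {1: 2}   # 'crawling' stemmed, seen twice in doc 1
#   db['link'].docids  -> {1: 1}   # 'links' stemmed to 'link'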
def writeindex(db, cur):
    for k, term in db.items():
        cur.execute('INSERT INTO TermDictionary (Term, TermId) VALUES (?, ?)', (k, term.termid))
        # Inverse document frequency: log10 of (collection size / document frequency)
        docfreq = term.docs
        ratio = float(documents) / float(docfreq)
        idf = math.log10(ratio)
        for i, termfreq in term.docids.items():
            tfidf = float(termfreq) * float(idf)
            if tfidf > 0:
                cur.execute('INSERT INTO Posting (TermId, DocId, tfidf, docfreq, termfreq) VALUES (?, ?, ?, ?, ?)',
                            (term.termid, i, tfidf, docfreq, termfreq))
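# Worked example of the weighting above (hypothetical counts): with a
# collection of 100 documents and a term that appears in 20 of them,
#   idf   = log10(100 / 20) = log10(5) ~= 0.699
# and if that term occurs 3 times in a given document,
#   tfidf = 3 * 0.699 ~= 2.097
# A term that occurs in every document gets idf = log10(1) = 0, so its
# postings are dropped by the tfidf > 0 check.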
def main():
    global documents
    global tokens
    global terms
    # Get the starting URL to crawl
    start_url = input("Enter URL to crawl (must be in the form http://www.domain.com): ")
    # Initialize the in-memory term dictionary
    db = {}
    # Capture the start time
    t2 = time.localtime()
    print(f'Start Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')
    # Create the SQLite database
    con = sqlite3.connect("webcrawler.db")
    cur = con.cursor()
    # Create tables
    cur.execute("DROP TABLE IF EXISTS DocumentDictionary")
    cur.execute("DROP INDEX IF EXISTS idxDocumentDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS DocumentDictionary (DocumentName TEXT, DocId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxDocumentDictionary ON DocumentDictionary (DocId)")
    cur.execute("DROP TABLE IF EXISTS TermDictionary")
    cur.execute("DROP INDEX IF EXISTS idxTermDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS TermDictionary (Term TEXT, TermId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxTermDictionary ON TermDictionary (TermId)")
    cur.execute("DROP TABLE IF EXISTS Posting")
    cur.execute("DROP INDEX IF EXISTS idxPosting1")
    cur.execute("DROP INDEX IF EXISTS idxPosting2")
    cur.execute("CREATE TABLE IF NOT EXISTS Posting (TermId INTEGER, DocId INTEGER, tfidf REAL, docfreq INTEGER, termfreq INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxPosting1 ON Posting (TermId)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxPosting2 ON Posting (DocId)")
    # Initialize crawling variables
    crawled = set()
    tocrawl = [start_url]
    links_queue = 0
    crawlcomplete = True
    while crawlcomplete:
        if links_queue >= 500:
            print("URL frontier reached its limit of 500 URLs.")
            break
        try:
            crawling = tocrawl.pop(0)
        except IndexError:
            crawlcomplete = False
            continue
        # Skip non-HTML files
        if crawling.endswith(('.pdf', '.png', '.jpg', '.gif', '.asp')):
            crawled.add(crawling)
            continue
        print(f'{len(tocrawl)} URLs remaining to crawl. Crawling: {crawling}')
        # Fetch the page
        try:
            response = urllib2.urlopen(crawling).read().decode('utf-8')
        except Exception as e:
            print(f'Error fetching {crawling}: {e}')
            continue
        # Parse the page content
        text = Term.stripTags(response)
        # Assign this page the next document id before tokenizing, so the
        # postings and the DocumentDictionary rows agree
        documents += 1
        parsetoken(db, text)
        # Store document info
        cur.execute("INSERT INTO DocumentDictionary (DocumentName, DocId) VALUES (?, ?)", (crawling, documents))
        # Extract and queue links
        if links_queue < 500:
            links = re.findall(r'href=["\']([^"\']+)["\']', response, re.I)
            for link in links:
                link = urlparse.urljoin(crawling, link)
                if link not in crawled and link not in tocrawl:
                    tocrawl.append(link)
                    links_queue += 1
        crawled.add(crawling)
        links_queue -= 1
    # Finish and write the index to disk
    t2 = time.localtime()
    print(f'Indexing Complete, write to disk: {t2.tm_hour:02d}:{t2.tm_min:02d}')
    writeindex(db, cur)
    # Commit and close the database
    con.commit()
    con.close()
    # Print statistics
    print(f"Documents {documents}")
    print(f"Terms {terms}")
    print(f"Tokens {tokens}")
    t2 = time.localtime()
    print(f'End Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')
# Run the crawler when executed as a script
if __name__ == '__main__':
    main()
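As a quick sanity check of the generated index, the short script below (a sketch: it assumes the crawler above has already run and produced webcrawler.db, and the DocId value 1 is just an arbitrary example) prints the ten highest-weighted terms for one document:

import sqlite3

con = sqlite3.connect("webcrawler.db")
cur = con.cursor()
# Join the postings with the term dictionary to recover readable terms
cur.execute("""SELECT t.Term, p.tfidf
               FROM Posting p
               JOIN TermDictionary t ON p.TermId = t.TermId
               WHERE p.DocId = ?
               ORDER BY p.tfidf DESC
               LIMIT 10""", (1,))
for term, weight in cur.fetchall():
    print(f"{term}\t{weight:.3f}")
con.close()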
THE OUTPUT OF THE ASSIGNMENT
I crawled the website: http://www.thesaurus.com