CS 3308 - INFORMATION RETRIEVAL
UNIT 7 PROGRAMMING ASSIGNMENT
UNIVERSITY OF THE PEOPLE
SOURCE CODE OF THE ASSIGNMENT
import sqlite3
import re
import math
import time
import urllib.request as urllib2
import urllib.parse as urlparse
from html.parser import HTMLParser
# Stop words list
stopwords = ['the', 'of', 'and', 'to', 'in', 'you', 'it', 'with', 'that', 'or', 'was', 'he', 'is', 'for', 'this', 'his', 'as',
             'not', 'at', 'by', 'all', 'they', 'but', 'be', 'on', 'from', 'had', 'her', 'work', 'are', 'any', 'she', 'if', 'said', 'so',
             'which', 'have', 'do', 'we', 'no', 'my', 'were', 'them', 'their', 'him', 'one', 'will', 'me', 'there', 'who', 'up',
             'other', 'an', 'its', 'when', 'what', 'can', 'may', 'into', 'out', 'must', 'your', 'then', 'would', 'could', 'more',
             'now', 'has', 'like', 'down', 'where', 'been', 'through', 'did', 'away', 'these', 'such', 'set', 'back', 'some',
             'than', 'way', 'made', 'our', 'after', 'well', 'should', 'get', 'even', 'am', 'go', 'saw', 'just', 'put', 'while', 'ever',
             'off', 'here', 'also']
# Regular expressions
chars = re.compile(r'\W+')
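# A quick illustration of how the splitter above behaves (hypothetical input):
#   chars.split("Hello, world! It's 2024.")
#   -> ['Hello', 'world', 'It', 's', '2024', '']
# \W+ splits on every run of non-alphanumeric characters, so apostrophes
# break words apart and empty strings can appear at the edges.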
# Global counters
tokens = 0
documents = 0
terms = 0
class _TextExtractor(HTMLParser):
    """Collects the text content of an HTML page, discarding the tags."""
    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        self._chunks.append(data)

    def get_text(self):
        return ' '.join(self._chunks)

class Term:
    def __init__(self):
        self.termid = 0
        self.termfreq = 0
        self.docs = 0
        self.docids = {}

    @staticmethod
    def splitchars(line):
        return chars.split(line)

    @staticmethod
    def stripTags(s):
        # Using HTMLParser as a built-in alternative to BeautifulSoup:
        # feed the markup through a parser that keeps only the text nodes.
        # (HTMLParser.unescape only decodes entities and never removes tags,
        # so it cannot be used here.)
        extractor = _TextExtractor()
        extractor.feed(s)
        return extractor.get_text()
# Simple stemming function
def basic_stem(word):
    # Simple suffix stripping (just an example; not as robust as a Porter stemmer)
    suffixes = ['ing', 'ed', 'es', 's']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word  # Return the word unchanged if no suffix matches
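# Sample outputs of the stemmer above (note the order of the suffix list
# matters: 'ing' and 'ed' are checked before 'es' and 's'):
#   basic_stem('crawling') -> 'crawl'
#   basic_stem('indexed')  -> 'index'
#   basic_stem('queries')  -> 'queri'   # crude compared to a Porter stemmer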
def parsetoken(db, line):
    global documents
    global tokens
    global terms
    # Clean up the line
    line = line.replace('\t', ' ').strip()
    # Split the line into tokens
    tokens_list = Term.splitchars(line)
    # Process each token
    for elmt in tokens_list:
        elmt = elmt.replace('\n', '')
        lowerElmt = elmt.lower().strip()
        # Count every token encountered
        tokens += 1
        # Skip short tokens and stop words
        if len(lowerElmt) < 2 or lowerElmt in stopwords:
            continue
        # Skip tokens that are pure numbers
        try:
            int(lowerElmt)
            continue
        except ValueError:
            # Apply basic stemming to non-numeric tokens
            lowerElmt = basic_stem(lowerElmt)
        # Add a new term to the dictionary if it does not exist yet
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms
        # Update document frequency and term frequency
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0
        db[lowerElmt].docids[documents] += 1
    return tokens_list
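# Example of the in-memory index after parsing one hypothetical line
# (assuming documents == 1 at the time of the call):
#   db = {}
#   parsetoken(db, "crawling crawling links")
#   db['crawl'].docids -> {1: 2}   # 'crawling' stemmed, seen twice in doc 1
#   db['link'].docids  -> {1: 1}   # 'links' stemmed to 'link'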
def writeindex(db, cur):
    for k, term in db.items():
        cur.execute('INSERT INTO TermDictionary (Term, TermId) VALUES (?, ?)', (k, term.termid))
        # Inverse document frequency: log10 of (collection size / document frequency)
        docfreq = term.docs
        ratio = float(documents) / float(docfreq)
        idf = math.log10(ratio)
        for i, termfreq in term.docids.items():
            tfidf = float(termfreq) * float(idf)
            if tfidf > 0:
                cur.execute('INSERT INTO Posting (TermId, DocId, tfidf, docfreq, termfreq) VALUES (?, ?, ?, ?, ?)',
                            (term.termid, i, tfidf, docfreq, termfreq))
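# Worked example of the weighting above (hypothetical counts): with a
# collection of 100 documents and a term that appears in 20 of them,
#   idf   = log10(100 / 20) = log10(5) ~= 0.699
# and if that term occurs 3 times in a given document,
#   tfidf = 3 * 0.699 ~= 2.097
# A term that occurs in every document gets idf = log10(1) = 0, so its
# postings are dropped by the tfidf > 0 check.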
def main():
    global documents
    global tokens
    global terms
    # Get the starting URL to crawl
    start_url = input("Enter URL to crawl (must be in the form http://www.domain.com): ")
    # Initialize the in-memory term dictionary
    db = {}
    # Capture the start time
    t2 = time.localtime()
    print(f'Start Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')
    # Create the SQLite database
    con = sqlite3.connect("webcrawler.db")
    cur = con.cursor()
    # Create tables
    cur.execute("DROP TABLE IF EXISTS DocumentDictionary")
    cur.execute("DROP INDEX IF EXISTS idxDocumentDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS DocumentDictionary (DocumentName TEXT, DocId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxDocumentDictionary ON DocumentDictionary (DocId)")
    cur.execute("DROP TABLE IF EXISTS TermDictionary")
    cur.execute("DROP INDEX IF EXISTS idxTermDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS TermDictionary (Term TEXT, TermId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxTermDictionary ON TermDictionary (TermId)")
    cur.execute("DROP TABLE IF EXISTS Posting")
    cur.execute("DROP INDEX IF EXISTS idxPosting1")
    cur.execute("DROP INDEX IF EXISTS idxPosting2")
    cur.execute("CREATE TABLE IF NOT EXISTS Posting (TermId INTEGER, DocId INTEGER, tfidf REAL, docfreq INTEGER, termfreq INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxPosting1 ON Posting (TermId)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxPosting2 ON Posting (DocId)")
    # Initialize crawling variables
    crawled = set()
    tocrawl = [start_url]
    links_queue = 0
    crawlcomplete = True
    while crawlcomplete:
        if links_queue >= 500:
            print("URL frontier reached its limit of 500 URLs.")
            break
        try:
            crawling = tocrawl.pop(0)
        except IndexError:
            crawlcomplete = False
            continue
        # Skip non-HTML files
        if crawling.endswith(('.pdf', '.png', '.jpg', '.gif', '.asp')):
            crawled.add(crawling)
            continue
        print(f'{len(tocrawl)} URLs remaining to crawl. Crawling: {crawling}')
        # Fetch the page
        try:
            response = urllib2.urlopen(crawling).read().decode('utf-8')
        except Exception as e:
            print(f'Error fetching {crawling}: {e}')
            continue
        # Parse the page content
        text = Term.stripTags(response)
        # Assign this page the next document id before tokenizing, so the
        # postings and the DocumentDictionary rows agree
        documents += 1
        parsetoken(db, text)
        # Store document info
        cur.execute("INSERT INTO DocumentDictionary (DocumentName, DocId) VALUES (?, ?)", (crawling, documents))
        # Extract and queue links
        if links_queue < 500:
            links = re.findall(r'href=["\']([^"\']+)["\']', response, re.I)
            for link in links:
                link = urlparse.urljoin(crawling, link)
                if link not in crawled and link not in tocrawl:
                    tocrawl.append(link)
                    links_queue += 1
        crawled.add(crawling)
        links_queue -= 1
    # Finish and write the index to disk
    t2 = time.localtime()
    print(f'Indexing Complete, write to disk: {t2.tm_hour:02d}:{t2.tm_min:02d}')
    writeindex(db, cur)
    # Commit and close the database
    con.commit()
    con.close()
    # Print statistics
    print(f"Documents {documents}")
    print(f"Terms {terms}")
    print(f"Tokens {tokens}")
    t2 = time.localtime()
    print(f'End Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')
# Run the crawler when executed as a script
if __name__ == '__main__':
    main()
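As a quick sanity check of the generated index, the short script below (a sketch: it assumes the crawler above has already run and produced webcrawler.db, and the DocId value 1 is just an arbitrary example) prints the ten highest-weighted terms for one document:

import sqlite3

con = sqlite3.connect("webcrawler.db")
cur = con.cursor()
# Join the postings with the term dictionary to recover readable terms
cur.execute("""SELECT t.Term, p.tfidf
               FROM Posting p
               JOIN TermDictionary t ON p.TermId = t.TermId
               WHERE p.DocId = ?
               ORDER BY p.tfidf DESC
               LIMIT 10""", (1,))
for term, weight in cur.fetchall():
    print(f"{term}\t{weight:.3f}")
con.close()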
THE OUTPUT OF THE ASSIGNMENT
I crawled the website: http://www.thesaurus.com