IR Assignment 1 21/08/18, 10(35 PM
In [1]: import glob
docs=[]
content=[]
doc_list=[]
i=0
directory= "../Information Retrieval/"
for files in glob.iglob(directory + "doc*.txt"):
infile = open(files)
a=infile.readline().split(' ')
doc_list.append(i)
i+=1
content+=a
print(a)
docs.append(a)
#for k in range (0,len(a)):
#print(a[0])
infile.close()
#The following printout is the corresponding doc no i.e. doc 0,1,2,3.
['july', 'new', 'home', 'sales', 'rise']
['new', 'home', 'sales', 'top', 'forecasts']
['increase', 'in', 'home', 'sales', 'in', 'july']
['home', 'sales', 'rise', 'in', 'july']
In [2]: #content contains all the words of the corpus
print(content)
['july', 'new', 'home', 'sales', 'rise', 'new', 'home', 'sales', 'to
p', 'forecasts', 'increase', 'in', 'home', 'sales', 'in', 'july', 'h
ome', 'sales', 'rise', 'in', 'july']
In [3]: #list of lists i.e. words occuring in different documents
print(docs)
[['july', 'new', 'home', 'sales', 'rise'], ['new', 'home', 'sales',
'top', 'forecasts'], ['increase', 'in', 'home', 'sales', 'in', 'july
'], ['home', 'sales', 'rise', 'in', 'july']]
In [4]: print(doc_list)
[0, 1, 2, 3]
http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 1 of 8
IR Assignment 1 21/08/18, 10(35 PM
In [5]: #dictionary for inverted index
# the data point at index 0 for every word in inverted_index is the freque
#e.g. in('rise', [2, 0, 3])~ 2 is frequency
#and 0,3 are the documents in which it occurs.
inv_indx = {i:[] for i in content}
for word in content:
if not inv_indx[word]:
cnt=0
for i in range(len(docs)):
if word in docs[i]:
inv_indx[word].append(i)
cnt+=1
inv_indx[word].insert(0,cnt)
In [6]: #print inverted index
for k,v in inv_indx.items():
print(k,v)
This is the frequency
('in', [2, 2, 3]) This is postings list
('top', [1, 1])
('rise', [2, 0, 3])
('sales', [4, 0, 1, 2, 3])
('forecasts', [1, 1])
('increase', [1, 2])
('home', [4, 0, 1, 2, 3])
('new', [2, 0, 1])
('july', [3, 0, 2, 3])
http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 2 of 8
IR Assignment 1 21/08/18, 10(35 PM
In [7]: #AND - merge algorithm
def intersect(inv_indx,w1,w2):
answer=[]
if w1 in inv_indx:
p1=inv_indx[w1]
print(p1)
else: print(w1+" not present in any doc")
if w2 in inv_indx:
p2=inv_indx[w2]
print(p2)
else: print(w2+" not present in any doc")
i=1
j=1
num=min(len(p1),len(p2))
num=num-1
while i<=num or j<= num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
answer.append(p1[i])
i+=1
j+=1
elif p1[i]<p2[j]:
i+=1
else: j+=1
return answer
In [8]: #question "home and july"
ans=intersect(inv_indx,'home','july')
print(ans)
[4, 0, 1, 2, 3]
[3, 0, 2, 3]
(0, 0)
(1, 2)
(2, 2)
(3, 3)
[0, 2, 3]
http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 3 of 8
IR Assignment 1 21/08/18, 10(35 PM
In [9]: def OR (w1,w2):
answer=[]
if w1 in inv_indx:
p1=inv_indx[w1]
print(p1)
else: print(w1+" not present in any doc")
if w2 in inv_indx:
p2=inv_indx[w2]
print(p2)
else: print(w2+" not present in any doc")
i=1
j=1
num=min(len(p1),len(p2))
num=num-1
while i<=num or j<= num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
answer.append(p1[i])
i+=1
j+=1
elif p1[i]<p2[j]:
answer.append(p1[i])
i+=1
else: answer.append(p2[j]);j+=1
if i<j: answer.append(p2[j:])
else: answer.append(p1[i:])
return answer
In [10]: #question"sales or forecasts
ans=OR('sales','forecasts')
print(ans)
[4, 0, 1, 2, 3]
[1, 1]
(0, 1)
(1, 1)
[0, 1, [2, 3]]
http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 4 of 8
IR Assignment 1 21/08/18, 10(35 PM
In [11]: def minus (w1,w2):
answer=[]
if w1 in inv_indx:
p1=inv_indx[w1]
print(p1)
else: print(w1+" not present in any doc")
if w2 in inv_indx:
p2=inv_indx[w2]
print(p2)
else: print(w2+" not present in any doc")
i=1
j=1
num=min(len(p1),len(p2))
num=num-1
while i<=num or j<= num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
i+=1
j+=1
elif p1[i]<p2[j]:
answer.append(p1[i])
i+=1
else: answer.append(p2[j]);j+=1
answer.append(p1[i:])
return answer
In [12]: #question "sales-july"
ans=minus('sales','july')
print(ans)
[4, 0, 1, 2, 3]
[3, 0, 2, 3]
(0, 0)
(1, 2)
(2, 2)
(3, 3)
[1, []]
http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 5 of 8
IR Assignment 1 21/08/18, 10(35 PM
In [14]: def NOT (w1):
answer=[]
if w1 in inv_indx:
p1=inv_indx[w1]
print(p1)
else: print(w1+" not present in any doc")
num=len(p1)-1
i=1
temp=doc_list
while i<= num:
if p1[i] in doc_list:
temp.remove(p1[i])
i+=1
answer=temp
return answer
In [15]: #question not july
print(NOT('july'))
[3, 0, 2, 3]
[1]
In [16]: l1=[4,6,10,12,14,16,18,20,22,32,47,81,120,122,157,180]
l2=[47]
In [17]: #AND - merge algorithm
import math
def intersect_skip(p1,p2):
answer=[]
i=0
j=0
num=max(len(p1),len(p2))
print(num)
skip=int(math.sqrt(num))
while i<=num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
answer.append(i)
i+=1
return answer
elif p1[i]<p2[j]:
i+=skip
else: i-=1
return answer
http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 6 of 8
IR Assignment 1 21/08/18, 10(35 PM
In [18]: print(intersect_skip(l1,l2))
#the number of comparisons here with skip pointers is 6
#due to the fact that lists are sorted and we can easily skip
#and adjust the pointer in accordance with the value being searched.
16
(4, 47)
(14, 47)
(22, 47)
(120, 47)
(81, 47)
(47, 47)
[10] The index of first occurence
In [19]: #AND - merge algorithm
import math
def intersect_normal(p1,p2):
answer=[]
i=0
j=0
num=max(len(p1),len(p2))
print(num)
skip=int(math.sqrt(num))
while i<=num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
answer.append(i)
i+=1
return answer
elif p1[i]<p2[j]:
i+=1
else: i-=1
return answer
In [20]: print(intersect_normal(l1,l2))
#here 11 comparisons needed since we need to move in linear fashion
# no use of the soretd lists is made of
16
(4, 47)
(6, 47)
(10, 47)
(12, 47)
(14, 47)
(16, 47)
(18, 47)
(20, 47)
(22, 47)
(32, 47)
(47, 47)
[10]
http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 7 of 8
IR Assignment 1 21/08/18, 10(35 PM
In [ ]:
http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 8 of 8