1
1
import pytest
2
2
import nlp
3
- from nlp import loadPageHTML , stripRawHTML , determineInlinks , findOutlinks , onlyWikipediaURLS
3
+ from nlp import loadPageHTML , stripRawHTML , findOutlinks , onlyWikipediaURLS
4
4
from nlp import expand_pages , relevant_pages , normalize , ConvergenceDetector , getInlinks
5
- from nlp import getOutlinks , Page , HITS
5
+ from nlp import getOutlinks , Page
6
6
from nlp import Rules , Lexicon
7
7
# Clumsy imports because we want to access certain nlp.py globals explicitly, because
8
8
# they are accessed by functions within nlp.py
9
9
10
+
10
11
def test_rules():
    """Rules should split each alternative on '|' and each symbol on whitespace."""
    expected = {'A': [['B', 'C'], ['D', 'E']]}
    assert Rules(A="B C | D E") == expected
12
13
@@ -27,18 +28,18 @@ def test_lexicon():
27
28
href="/wiki/TestLiving" href="/wiki/TestMan" >"""
28
29
# Second sample page body used by the fixtures below.
testHTML2 = "Nothing"

# Six sample pages, "A" through "F".
# NOTE(review): the positional arguments look like
# Page(address, hub, authority, inlinks, outlinks) -- confirm against nlp.Page.
pA = Page("A", 1, 6, ["B", "C", "E"], ["D"])
pB = Page("B", 2, 5, ["E"], ["A", "C", "D"])
pC = Page("C", 3, 4, ["B", "E"], ["A", "D"])
pD = Page("D", 4, 3, ["A", "B", "C", "E"], [])
pE = Page("E", 5, 2, [], ["A", "B", "C", "D", "F"])
pF = Page("F", 6, 1, ["E"], [])

# Index of the sample pages keyed by their address.
pageDict = {pA.address: pA, pB.address: pB, pC.address: pC,
            pD.address: pD, pE.address: pE, pF.address: pF}

# Install the fixtures into the nlp module's globals so that the nlp
# functions under test read the sample data.
nlp.pagesIndex = pageDict
nlp.pagesContent = {pA.address: testHTML, pB.address: testHTML2,
                    pC.address: testHTML, pD.address: testHTML2,
                    pE.address: testHTML, pF.address: testHTML2}
42
43
43
44
# This test takes a long time (> 60 secs)
44
45
# def test_loadPageHTML():
@@ -50,17 +51,20 @@ def test_lexicon():
50
51
# assert all(x in loadedPages for x in fullURLs)
51
52
# assert all(loadedPages.get(key,"") != "" for key in addresses)
52
53
54
+
53
55
def test_stripRawHTML():
    """stripRawHTML should leave neither the opening nor the closing head tag."""
    addr = "https://en.wikipedia.org/wiki/Ethics"
    # loadPageHTML takes a list of addresses and returns a mapping that is
    # indexed by address here.
    # NOTE(review): presumably this fetches the page over the network -- confirm.
    aPage = loadPageHTML([addr])
    someHTML = aPage[addr]
    strippedHTML = stripRawHTML(someHTML)
    assert "<head>" not in strippedHTML and "</head>" not in strippedHTML
59
61
62
+
60
63
def test_determineInlinks():
    """Placeholder: no real assertions for determineInlinks have been written yet."""
    # TODO
    assert True
63
66
67
+
64
68
def test_findOutlinks_wiki ():
65
69
testPage = pageDict [pA .address ]
66
70
outlinks = findOutlinks (testPage , handleURLs = onlyWikipediaURLS )
@@ -70,35 +74,39 @@ def test_findOutlinks_wiki():
70
74
# ______________________________________________________________________________
71
75
# HITS Helper Functions
72
76
77
+
73
78
def test_expand_pages():
    """expand_pages should add the pages linked with the given ones."""
    # ('F',) is a real 1-tuple; a bare ('F') is just the string 'F' and only
    # worked because iterating a one-character string yields that character.
    pages = {k: pageDict[k] for k in ('F',)}
    pagesTwo = {k: pageDict[k] for k in ('A', 'E')}

    # Starting from F alone, only F and E are expected in the result.
    expanded_pages = expand_pages(pages)
    assert all(x in expanded_pages for x in ['F', 'E'])
    assert all(x not in expanded_pages for x in ['A', 'B', 'C', 'D'])

    # Starting from A and E, every sample page is expected in the result.
    expanded_pages = expand_pages(pagesTwo)
    print(expanded_pages)
    assert all(x in expanded_pages for x in ['A', 'B', 'C', 'D', 'E', 'F'])
87
+
82
88
83
89
def test_relevant_pages():
    """relevant_pages("male") should return pages A, C and E and omit B, D and F."""
    pages = relevant_pages("male")
    # Membership is tested on the result directly; going through .keys() is redundant.
    assert all(x in pages for x in ['A', 'C', 'E'])
    assert all(x not in pages for x in ['B', 'D', 'F'])
93
+
87
94
88
95
def test_normalize():
    """normalize should rescale the sample pages' hub and authority values.

    The expected values are hard-coded for the module-level sample pages
    (hub values 1..6, authority values 6..1) and do not apply to other data.
    """
    normalize(pageDict)
    # Build a list before printing: printing a bare generator expression only
    # shows the generator object's repr, never the hub values.
    print([page.hub for _, page in nlp.pagesIndex.items()])
    expected_hub = [1 / 91, 2 / 91, 3 / 91, 4 / 91, 5 / 91, 6 / 91]
    expected_auth = list(reversed(expected_hub))
    assert len(expected_hub) == len(expected_auth) == len(nlp.pagesIndex)
    # Sorting the items orders the pages by address so they line up with the
    # expected lists.
    assert expected_hub == [page.hub for _, page in sorted(nlp.pagesIndex.items())]
    assert expected_auth == [page.authority for _, page in sorted(nlp.pagesIndex.items())]
103
+
96
104
97
105
def test_detectConvergence ():
98
106
# run detectConvergence once to initialise history
99
107
convergence = ConvergenceDetector ()
100
108
convergence ()
101
- assert convergence () # values haven't changed so should return True
109
+ assert convergence () # values haven't changed so should return True
102
110
# make tiny increase/decrease to all values
103
111
for _ , page in nlp .pagesIndex .items ():
104
112
page .hub += 0.0003
@@ -111,17 +119,21 @@ def test_detectConvergence():
111
119
# retest function with values. Should now return false
112
120
assert not convergence ()
113
121
122
+
114
123
def test_getInlinks():
    """getInlinks should return pages whose sorted addresses equal page A's inlinks."""
    inlnks = getInlinks(pageDict['A'])
    assert sorted(page.address for page in inlnks) == pageDict['A'].inlinks
117
126
127
+
118
128
def test_getOutlinks():
    """getOutlinks should return pages whose sorted addresses equal page A's outlinks."""
    outlnks = getOutlinks(pageDict['A'])
    assert sorted(page.address for page in outlnks) == pageDict['A'].outlinks
121
131
132
+
122
133
def test_HITS():
    """Placeholder: no real assertions for HITS have been written yet."""
    # TODO
    assert True  # leave for now
136
+
125
137
126
138
# Run pytest when this file is executed directly as a script.
if __name__ == '__main__':
    pytest.main()
0 commit comments