1
1
import pytest
2
2
import nlp
3
+
3
4
from nlp import loadPageHTML , stripRawHTML , findOutlinks , onlyWikipediaURLS
4
5
from nlp import expand_pages , relevant_pages , normalize , ConvergenceDetector , getInlinks
5
6
from nlp import getOutlinks , Page
6
7
from nlp import Rules , Lexicon
7
8
# Clumsy imports because we want to access certain nlp.py globals explicitly, because
8
9
# they are accessed by function's within nlp.py
9
10
11
+ from unittest .mock import patch
12
+ from io import BytesIO
13
+
10
14
11
15
def test_rules():
    """Rules should split each alternative on '|' and each symbol on spaces."""
    expected = {'A': [['B', 'C'], ['D', 'E']]}
    assert Rules(A="B C | D E") == expected
@@ -27,6 +31,19 @@ def test_lexicon():
27
31
< href="/wiki/TestThing" > href="/wiki/TestBoy"
28
32
href="/wiki/TestLiving" href="/wiki/TestMan" >"""
29
33
# Minimal fixture: a page body with no markup and no outgoing links.
testHTML2 = "Nothing"
# Complete HTML document fixture. It is served by the patched
# urllib.request.urlopen in test_stripRawHTML, so that test runs
# without any real network access; "AIMA book" is the body text
# the test expects to survive stripping.
testHTML3 = """
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>

<p>AIMA book</p>

</body>
</html>
"""

# Small hand-built link graph shared by the page-ranking tests.
# NOTE(review): arguments appear to be (address, hub, authority,
# inlinks, outlinks) — confirm against nlp.Page's constructor.
pA = Page("A", 1, 6, ["B", "C", "E"], ["D"])
pB = Page("B", 2, 5, ["E"], ["A", "C", "D"])
@@ -52,12 +69,14 @@ def test_lexicon():
52
69
# assert all(loadedPages.get(key,"") != "" for key in addresses)
53
70
54
71
55
@patch('urllib.request.urlopen', return_value=BytesIO(testHTML3.encode()))
def test_stripRawHTML(html_mock):
    """stripRawHTML must remove the <head> section while keeping body text.

    urlopen is patched to serve the local testHTML3 fixture, so no real
    request is made for the Wikipedia address.
    """
    url = "https://en.wikipedia.org/wiki/Ethics"
    fetched = loadPageHTML([url])
    raw = fetched[url]
    stripped = stripRawHTML(raw)
    assert "<head>" not in stripped and "</head>" not in stripped
    assert "AIMA book" in raw and "AIMA book" in stripped
61
80
62
81
63
82
def test_determineInlinks ():
0 commit comments