Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 73fd46d

Browse files
committed
Bug 3347: robotparser failed because it didn't convert bytes to string.
The solution is to decode the bytes to text as UTF-8. I'm not entirely sure this is safe, but it looks like robots.txt is expected to be ASCII (which is a subset of UTF-8).
1 parent 48577d1 commit 73fd46d

2 files changed

Lines changed: 18 additions & 5 deletions

File tree

Lib/test/test_robotparser.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,9 @@ def RobotTest(index, robots_txt, good_urls, bad_urls,
136136

137137
RobotTest(7, doc, good, bad)
138138

139-
class TestCase(unittest.TestCase):
140-
def runTest(self):
139+
class NetworkTestCase(unittest.TestCase):
140+
141+
def testPasswordProtectedSite(self):
141142
support.requires('network')
142143
# whole site is password-protected.
143144
url = 'http://mueblesmoraleda.com'
@@ -146,9 +147,17 @@ def runTest(self):
146147
parser.read()
147148
self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
148149

150+
def testPythonOrg(self):
151+
support.requires('network')
152+
parser = urllib.robotparser.RobotFileParser(
153+
"http://www.python.org/robots.txt")
154+
parser.read()
155+
self.assertTrue(parser.can_fetch("*",
156+
"http://www.python.org/robots.txt"))
157+
149158
def test_main():
159+
support.run_unittest(NetworkTestCase)
150160
support.run_unittest(tests)
151-
TestCase().run()
152161

153162
if __name__=='__main__':
154163
support.Verbose = 1

Lib/urllib/robotparser.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ def read(self):
6060
elif err.code >= 400:
6161
self.allow_all = True
6262
else:
63-
self.parse(f.read().splitlines())
63+
raw = f.read()
64+
self.parse(raw.decode("utf-8").splitlines())
6465

6566
def _add_entry(self, entry):
6667
if "*" in entry.useragents:
@@ -123,7 +124,10 @@ def can_fetch(self, useragent, url):
123124
return True
124125
# search for given user agent matches
125126
# the first match counts
126-
url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
127+
url = urllib.parse.quote(
128+
urllib.parse.urlparse(urllib.parse.unquote(url))[2])
129+
if not url:
130+
url = "/"
127131
for entry in self.entries:
128132
if entry.applies_to(useragent):
129133
return entry.allowance(url)

0 commit comments

Comments
 (0)