Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 41e4faa

Browse files
author
Johannes Gijsbers
committed
Patch #712317: In URLs such as http://www.example.com?query=spam, treat '?' as
a delimiter. Previously, the 'network location' (<authority> in RFC 2396) would become 'www.example.com?query=spam', while RFC 2396 does not allow a '?' in <authority>. See bug #548176 for further discussion.
1 parent cdd625a commit 41e4faa

2 files changed

Lines changed: 63 additions & 41 deletions

File tree

Lib/test/test_urlparse.py

Lines changed: 51 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,41 +8,64 @@
88
RFC2396_BASE = "http://a/b/c/d;p?q"
99

1010
class UrlParseTestCase(unittest.TestCase):
11-
def test_frags(self):
12-
for url, parsed, split in [
13-
('http://www.python.org',
14-
('http', 'www.python.org', '', '', '', ''),
15-
('http', 'www.python.org', '', '', '')),
16-
('http://www.python.org#abc',
17-
('http', 'www.python.org', '', '', '', 'abc'),
18-
('http', 'www.python.org', '', '', 'abc')),
19-
('http://www.python.org/#abc',
20-
('http', 'www.python.org', '/', '', '', 'abc'),
21-
('http', 'www.python.org', '/', '', 'abc')),
22-
(RFC1808_BASE,
23-
('http', 'a', '/b/c/d', 'p', 'q', 'f'),
24-
('http', 'a', '/b/c/d;p', 'q', 'f')),
11+
12+
def checkRoundtrips(self, url, parsed, split):
13+
result = urlparse.urlparse(url)
14+
self.assertEqual(result, parsed)
15+
# put it back together and it should be the same
16+
result2 = urlparse.urlunparse(result)
17+
self.assertEqual(result2, url)
18+
19+
# check the roundtrip using urlsplit() as well
20+
result = urlparse.urlsplit(url)
21+
self.assertEqual(result, split)
22+
result2 = urlparse.urlunsplit(result)
23+
self.assertEqual(result2, url)
24+
25+
def test_roundtrips(self):
26+
testcases = [
2527
('file:///tmp/junk.txt',
2628
('file', '', '/tmp/junk.txt', '', '', ''),
2729
('file', '', '/tmp/junk.txt', '', '')),
2830
('imap://mail.python.org/mbox1',
2931
('imap', 'mail.python.org', '/mbox1', '', '', ''),
3032
('imap', 'mail.python.org', '/mbox1', '', '')),
3133
('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf',
32-
('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', '', '', ''),
33-
('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', '', '')),
34-
]:
35-
result = urlparse.urlparse(url)
36-
self.assertEqual(result, parsed)
37-
# put it back together and it should be the same
38-
result2 = urlparse.urlunparse(result)
39-
self.assertEqual(result2, url)
40-
41-
# check the roundtrip using urlsplit() as well
42-
result = urlparse.urlsplit(url)
43-
self.assertEqual(result, split)
44-
result2 = urlparse.urlunsplit(result)
45-
self.assertEqual(result2, url)
34+
('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
35+
'', '', ''),
36+
('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
37+
'', '')),
38+
]
39+
for url, parsed, split in testcases:
40+
self.checkRoundtrips(url, parsed, split)
41+
42+
def test_http_roundtrips(self):
43+
# urlparse.urlsplit treats 'http:' as an optimized special case,
44+
# so we test both 'http:' and 'https:' in all the following.
45+
# Three cheers for white box knowledge!
46+
testcases = [
47+
('://www.python.org',
48+
('www.python.org', '', '', '', ''),
49+
('www.python.org', '', '', '')),
50+
('://www.python.org#abc',
51+
('www.python.org', '', '', '', 'abc'),
52+
('www.python.org', '', '', 'abc')),
53+
('://www.python.org?q=abc',
54+
('www.python.org', '', '', 'q=abc', ''),
55+
('www.python.org', '', 'q=abc', '')),
56+
('://www.python.org/#abc',
57+
('www.python.org', '/', '', '', 'abc'),
58+
('www.python.org', '/', '', 'abc')),
59+
('://a/b/c/d;p?q#f',
60+
('a', '/b/c/d', 'p', 'q', 'f'),
61+
('a', '/b/c/d;p', 'q', 'f')),
62+
]
63+
for scheme in ('http', 'https'):
64+
for url, parsed, split in testcases:
65+
url = scheme + url
66+
parsed = (scheme,) + parsed
67+
split = (scheme,) + split
68+
self.checkRoundtrips(url, parsed, split)
4669

4770
def checkJoin(self, base, relurl, expected):
4871
self.assertEqual(urlparse.urljoin(base, relurl), expected,

Lib/urlparse.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,15 @@ def _splitparams(url):
6363
i = url.find(';')
6464
return url[:i], url[i+1:]
6565

66+
def _splitnetloc(url, start=0):
67+
for c in '/?#': # the order is important!
68+
delim = url.find(c, start)
69+
if delim >= 0:
70+
break
71+
else:
72+
delim = len(url)
73+
return url[start:delim], url[delim:]
74+
6675
def urlsplit(url, scheme='', allow_fragments=1):
6776
"""Parse a URL into 5 components:
6877
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -82,13 +91,7 @@ def urlsplit(url, scheme='', allow_fragments=1):
8291
scheme = url[:i].lower()
8392
url = url[i+1:]
8493
if url[:2] == '//':
85-
i = url.find('/', 2)
86-
if i < 0:
87-
i = url.find('#')
88-
if i < 0:
89-
i = len(url)
90-
netloc = url[2:i]
91-
url = url[i:]
94+
netloc, url = _splitnetloc(url, 2)
9295
if allow_fragments and '#' in url:
9396
url, fragment = url.split('#', 1)
9497
if '?' in url:
@@ -101,12 +104,8 @@ def urlsplit(url, scheme='', allow_fragments=1):
101104
break
102105
else:
103106
scheme, url = url[:i].lower(), url[i+1:]
104-
if scheme in uses_netloc:
105-
if url[:2] == '//':
106-
i = url.find('/', 2)
107-
if i < 0:
108-
i = len(url)
109-
netloc, url = url[2:i], url[i:]
107+
if scheme in uses_netloc and url[:2] == '//':
108+
netloc, url = _splitnetloc(url, 2)
110109
if allow_fragments and scheme in uses_fragment and '#' in url:
111110
url, fragment = url.split('#', 1)
112111
if scheme in uses_query and '?' in url:

0 commit comments

Comments
 (0)