Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 23fc041

Browse files
zooba authored and ned-deily committed
[3.6] bpo-36216: Add check for characters in netloc that normalize to separators (GH-12201) (GH-12215)
1 parent 5565b1d commit 23fc041

4 files changed

Lines changed: 61 additions & 0 deletions

File tree

Doc/library/urllib.parse.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,11 @@ or on combining URL components into a URL string.
121121
Unmatched square brackets in the :attr:`netloc` attribute will raise a
122122
:exc:`ValueError`.
123123

124+
Characters in the :attr:`netloc` attribute that decompose under NFKC
125+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
126+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
127+
decomposed before parsing, no error will be raised.
128+
124129
.. versionchanged:: 3.2
125130
Added IPv6 URL parsing capabilities.
126131

@@ -133,6 +138,10 @@ or on combining URL components into a URL string.
133138
Out-of-range port numbers now raise :exc:`ValueError`, instead of
134139
returning :const:`None`.
135140

141+
.. versionchanged:: 3.6.9
142+
Characters that affect netloc parsing under NFKC normalization will
143+
now raise :exc:`ValueError`.
144+
136145

137146
.. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None)
138147

@@ -256,10 +265,19 @@ or on combining URL components into a URL string.
256265
Unmatched square brackets in the :attr:`netloc` attribute will raise a
257266
:exc:`ValueError`.
258267

268+
Characters in the :attr:`netloc` attribute that decompose under NFKC
269+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
270+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
271+
decomposed before parsing, no error will be raised.
272+
259273
.. versionchanged:: 3.6
260274
Out-of-range port numbers now raise :exc:`ValueError`, instead of
261275
returning :const:`None`.
262276

277+
.. versionchanged:: 3.6.9
278+
Characters that affect netloc parsing under NFKC normalization will
279+
now raise :exc:`ValueError`.
280+
263281

264282
.. function:: urlunsplit(parts)
265283

Lib/test/test_urlparse.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import sys
2+
import unicodedata
13
import unittest
24
import urllib.parse
35

@@ -984,6 +986,27 @@ def test_all(self):
984986
expected.append(name)
985987
self.assertCountEqual(urllib.parse.__all__, expected)
986988

989+
def test_urlsplit_normalization(self):
990+
# Certain characters should never occur in the netloc,
991+
# including under normalization.
992+
# Ensure that ALL of them are detected and cause an error
993+
illegal_chars = '/:#?@'
994+
hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
995+
denorm_chars = [
996+
c for c in map(chr, range(128, sys.maxunicode))
997+
if (hex_chars & set(unicodedata.decomposition(c).split()))
998+
and c not in illegal_chars
999+
]
1000+
# Sanity check that we found at least one such character
1001+
self.assertIn('\u2100', denorm_chars)
1002+
self.assertIn('\uFF03', denorm_chars)
1003+
1004+
for scheme in ["http", "https", "ftp"]:
1005+
for c in denorm_chars:
1006+
url = "{}://netloc{}false.netloc/path".format(scheme, c)
1007+
with self.subTest(url=url, char='{:04X}'.format(ord(c))):
1008+
with self.assertRaises(ValueError):
1009+
urllib.parse.urlsplit(url)
9871010

9881011
class Utility_Tests(unittest.TestCase):
9891012
"""Testcase to test the various utility functions in the urllib."""

Lib/urllib/parse.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,21 @@ def _splitnetloc(url, start=0):
391391
delim = min(delim, wdelim) # use earliest delim position
392392
return url[start:delim], url[delim:] # return (domain, rest)
393393

394+
def _checknetloc(netloc):
395+
if not netloc or not any(ord(c) > 127 for c in netloc):
396+
return
397+
# looking for characters like \u2100 that expand to 'a/c'
398+
# IDNA uses NFKC equivalence, so normalize for this check
399+
import unicodedata
400+
netloc2 = unicodedata.normalize('NFKC', netloc)
401+
if netloc == netloc2:
402+
return
403+
_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
404+
for c in '/?#@:':
405+
if c in netloc2:
406+
raise ValueError("netloc '" + netloc2 + "' contains invalid " +
407+
"characters under NFKC normalization")
408+
394409
def urlsplit(url, scheme='', allow_fragments=True):
395410
"""Parse a URL into 5 components:
396411
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -420,6 +435,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
420435
url, fragment = url.split('#', 1)
421436
if '?' in url:
422437
url, query = url.split('?', 1)
438+
_checknetloc(netloc)
423439
v = SplitResult(scheme, netloc, url, query, fragment)
424440
_parse_cache[key] = v
425441
return _coerce_result(v)
@@ -443,6 +459,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
443459
url, fragment = url.split('#', 1)
444460
if '?' in url:
445461
url, query = url.split('?', 1)
462+
_checknetloc(netloc)
446463
v = SplitResult(scheme, netloc, url, query, fragment)
447464
_parse_cache[key] = v
448465
return _coerce_result(v)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Changes urlsplit() to raise ValueError when the URL contains characters that
2+
decompose under IDNA encoding (NFKC-normalization) into characters that
3+
affect how the URL is parsed.

0 commit comments

Comments
 (0)