@@ -391,6 +391,21 @@ def _splitnetloc(url, start=0):
391391 delim = min (delim , wdelim ) # use earliest delim position
392392 return url [start :delim ], url [delim :] # return (domain, rest)
393393
394+ def _checknetloc (netloc ):
395+ if not netloc or not any (ord (c ) > 127 for c in netloc ):
396+ return
397+ # looking for characters like \u2100 that expand to 'a/c'
398+ # IDNA uses NFKC equivalence, so normalize for this check
399+ import unicodedata
400+ netloc2 = unicodedata .normalize ('NFKC' , netloc )
401+ if netloc == netloc2 :
402+ return
403+ _ , _ , netloc = netloc .rpartition ('@' ) # anything to the left of '@' is okay
404+ for c in '/?#@:' :
405+ if c in netloc2 :
406+ raise ValueError ("netloc '" + netloc2 + "' contains invalid " +
407+ "characters under NFKC normalization" )
408+
394409def urlsplit (url , scheme = '' , allow_fragments = True ):
395410 """Parse a URL into 5 components:
396411 <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -420,6 +435,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
420435 url , fragment = url .split ('#' , 1 )
421436 if '?' in url :
422437 url , query = url .split ('?' , 1 )
438+ _checknetloc (netloc )
423439 v = SplitResult (scheme , netloc , url , query , fragment )
424440 _parse_cache [key ] = v
425441 return _coerce_result (v )
@@ -443,6 +459,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
443459 url , fragment = url .split ('#' , 1 )
444460 if '?' in url :
445461 url , query = url .split ('?' , 1 )
462+ _checknetloc (netloc )
446463 v = SplitResult (scheme , netloc , url , query , fragment )
447464 _parse_cache [key ] = v
448465 return _coerce_result (v )
0 commit comments