2727test_urlparse.py provides a good indicator of parsing behavior.
2828"""
2929
30+ import re
3031import sys
3132import collections
3233
@@ -470,6 +471,10 @@ def urldefrag(url):
470471 defrag = url
471472 return _coerce_result (DefragResult (defrag , frag ))
472473
474+ _hexdig = '0123456789ABCDEFabcdef'
475+ _hextobyte = {(a + b ).encode (): bytes ([int (a + b , 16 )])
476+ for a in _hexdig for b in _hexdig }
477+
473478def unquote_to_bytes (string ):
474479 """unquote_to_bytes('abc%20def') -> b'abc def'."""
475480 # Note: strings are encoded as UTF-8. This is only an issue if it contains
@@ -480,16 +485,21 @@ def unquote_to_bytes(string):
480485 return b''
481486 if isinstance (string , str ):
482487 string = string .encode ('utf-8' )
483- res = string .split (b'%' )
484- if len (res ) == 1 :
488+ bits = string .split (b'%' )
489+ if len (bits ) == 1 :
485490 return string
486- string = res [0 ]
487- for item in res [1 :]:
491+ res = [bits [0 ]]
492+ append = res .append
493+ for item in bits [1 :]:
488494 try :
489- string += bytes ([int (item [:2 ], 16 )]) + item [2 :]
490- except ValueError :
491- string += b'%' + item
492- return string
495+ append (_hextobyte [item [:2 ]])
496+ append (item [2 :])
497+ except KeyError :
498+ append (b'%' )
499+ append (item )
500+ return b'' .join (res )
501+
502+ _asciire = re .compile ('([\x00 -\x7f ]+)' )
493503
494504def unquote (string , encoding = 'utf-8' , errors = 'replace' ):
495505 """Replace %xx escapes by their single-character equivalent. The optional
@@ -501,39 +511,20 @@ def unquote(string, encoding='utf-8', errors='replace'):
501511
502512 unquote('abc%20def') -> 'abc def'.
503513 """
504- if string == '' :
505- return string
506- res = string .split ('%' )
507- if len (res ) == 1 :
514+ if '%' not in string :
515+ string .split
508516 return string
509517 if encoding is None :
510518 encoding = 'utf-8'
511519 if errors is None :
512520 errors = 'replace'
513- # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
514- pct_sequence = b''
515- string = res [0 ]
516- for item in res [1 :]:
517- try :
518- if not item :
519- raise ValueError
520- pct_sequence += bytes .fromhex (item [:2 ])
521- rest = item [2 :]
522- if not rest :
523- # This segment was just a single percent-encoded character.
524- # May be part of a sequence of code units, so delay decoding.
525- # (Stored in pct_sequence).
526- continue
527- except ValueError :
528- rest = '%' + item
529- # Encountered non-percent-encoded characters. Flush the current
530- # pct_sequence.
531- string += pct_sequence .decode (encoding , errors ) + rest
532- pct_sequence = b''
533- if pct_sequence :
534- # Flush the final pct_sequence
535- string += pct_sequence .decode (encoding , errors )
536- return string
521+ bits = _asciire .split (string )
522+ res = [bits [0 ]]
523+ append = res .append
524+ for i in range (1 , len (bits ), 2 ):
525+ append (unquote_to_bytes (bits [i ]).decode (encoding , errors ))
526+ append (bits [i + 1 ])
527+ return '' .join (res )
537528
538529def parse_qs (qs , keep_blank_values = False , strict_parsing = False ,
539530 encoding = 'utf-8' , errors = 'replace' ):
0 commit comments