Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 8ea4616

Browse files
Issue #1285086: Get rid of the refcounting hack and speed up
urllib.parse.unquote() and urllib.parse.unquote_to_bytes().
1 parent 3b220e1 commit 8ea4616

2 files changed

Lines changed: 30 additions & 36 deletions

File tree

Lib/urllib/parse.py

Lines changed: 27 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
test_urlparse.py provides a good indicator of parsing behavior.
2828
"""
2929

30+
import re
3031
import sys
3132
import collections
3233

@@ -470,6 +471,10 @@ def urldefrag(url):
470471
defrag = url
471472
return _coerce_result(DefragResult(defrag, frag))
472473

474+
_hexdig = '0123456789ABCDEFabcdef'
475+
_hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
476+
for a in _hexdig for b in _hexdig}
477+
473478
def unquote_to_bytes(string):
474479
"""unquote_to_bytes('abc%20def') -> b'abc def'."""
475480
# Note: strings are encoded as UTF-8. This is only an issue if it contains
@@ -480,16 +485,21 @@ def unquote_to_bytes(string):
480485
return b''
481486
if isinstance(string, str):
482487
string = string.encode('utf-8')
483-
res = string.split(b'%')
484-
if len(res) == 1:
488+
bits = string.split(b'%')
489+
if len(bits) == 1:
485490
return string
486-
string = res[0]
487-
for item in res[1:]:
491+
res = [bits[0]]
492+
append = res.append
493+
for item in bits[1:]:
488494
try:
489-
string += bytes([int(item[:2], 16)]) + item[2:]
490-
except ValueError:
491-
string += b'%' + item
492-
return string
495+
append(_hextobyte[item[:2]])
496+
append(item[2:])
497+
except KeyError:
498+
append(b'%')
499+
append(item)
500+
return b''.join(res)
501+
502+
_asciire = re.compile('([\x00-\x7f]+)')
493503

494504
def unquote(string, encoding='utf-8', errors='replace'):
495505
"""Replace %xx escapes by their single-character equivalent. The optional
@@ -501,39 +511,20 @@ def unquote(string, encoding='utf-8', errors='replace'):
501511
502512
unquote('abc%20def') -> 'abc def'.
503513
"""
504-
if string == '':
505-
return string
506-
res = string.split('%')
507-
if len(res) == 1:
514+
if '%' not in string:
515+
string.split
508516
return string
509517
if encoding is None:
510518
encoding = 'utf-8'
511519
if errors is None:
512520
errors = 'replace'
513-
# pct_sequence: contiguous sequence of percent-encoded bytes, decoded
514-
pct_sequence = b''
515-
string = res[0]
516-
for item in res[1:]:
517-
try:
518-
if not item:
519-
raise ValueError
520-
pct_sequence += bytes.fromhex(item[:2])
521-
rest = item[2:]
522-
if not rest:
523-
# This segment was just a single percent-encoded character.
524-
# May be part of a sequence of code units, so delay decoding.
525-
# (Stored in pct_sequence).
526-
continue
527-
except ValueError:
528-
rest = '%' + item
529-
# Encountered non-percent-encoded characters. Flush the current
530-
# pct_sequence.
531-
string += pct_sequence.decode(encoding, errors) + rest
532-
pct_sequence = b''
533-
if pct_sequence:
534-
# Flush the final pct_sequence
535-
string += pct_sequence.decode(encoding, errors)
536-
return string
521+
bits = _asciire.split(string)
522+
res = [bits[0]]
523+
append = res.append
524+
for i in range(1, len(bits), 2):
525+
append(unquote_to_bytes(bits[i]).decode(encoding, errors))
526+
append(bits[i + 1])
527+
return ''.join(res)
537528

538529
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
539530
encoding='utf-8', errors='replace'):

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,9 @@ Core and Builtins
233233
Library
234234
-------
235235

236+
- Issue #1285086: Get rid of the refcounting hack and speed up
237+
urllib.parse.unquote() and urllib.parse.unquote_to_bytes().
238+
236239
- Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
237240
a failure while decoding empty object literals when object_pairs_hook was
238241
specified.

0 commit comments

Comments (0)