bpo-44002: Switch to lru_cache in urllib.parse. (GH-25798)

gpshead · web-flow · commit d597fdc5fd0e · 2021-05-11T17:01:44.000-07:00
Switch to lru_cache in urllib.parse.

urllib.parse now uses functools.lru_cache for its internal URL splitting and
quoting caches instead of rolling its own like it's the '90s.

The undocumented internal Quoter class API is now deprecated
as it had no reason to be public and no existing OSS users were found.

The clear_cache() API remains undocumented but gets an explicit test as it
is used in a few projects' (twisted, gevent) tests as well as our own regrtest.
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
@@ -1044,16 +1044,24 @@ def test_telurl_params(self):
         self.assertEqual(p1.params, 'phone-context=+1-914-555')
 
     def test_Quoter_repr(self):
-        quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE)
+        quoter = urllib.parse._Quoter(urllib.parse._ALWAYS_SAFE)
         self.assertIn('Quoter', repr(quoter))
 
+    def test_clear_cache_for_code_coverage(self):
+        urllib.parse.clear_cache()
+
+    def test_urllib_parse_getattr_failure(self):
+        """Test that urllib.parse.__getattr__() fails correctly."""
+        with self.assertRaises(AttributeError):
+            unused = urllib.parse.this_does_not_exist
+
     def test_all(self):
         expected = []
         undocumented = {
             'splitattr', 'splithost', 'splitnport', 'splitpasswd',
             'splitport', 'splitquery', 'splittag', 'splittype', 'splituser',
             'splitvalue',
-            'Quoter', 'ResultBase', 'clear_cache', 'to_bytes', 'unwrap',
+            'ResultBase', 'clear_cache', 'to_bytes', 'unwrap',
         }
         for name in dir(urllib.parse):
             if name.startswith('_') or name in undocumented:
@@ -1245,6 +1253,12 @@ def test_unwrap(self):
 
 class DeprecationTest(unittest.TestCase):
 
+    def test_Quoter_deprecation(self):
+        with self.assertWarns(DeprecationWarning) as cm:
+            old_class = urllib.parse.Quoter
+            self.assertIs(old_class, urllib.parse._Quoter)
+        self.assertIn('Quoter will be removed', str(cm.warning))
+
     def test_splittype_deprecation(self):
         with self.assertWarns(DeprecationWarning) as cm:
             urllib.parse.splittype('')
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
@@ -27,10 +27,11 @@
 test_urlparse.py provides a good indicator of parsing behavior.
 """
 
+from collections import namedtuple
+import functools
 import re
 import sys
 import types
-import collections
 import warnings
 
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
@@ -81,15 +82,10 @@
 # Unsafe bytes to be removed per WHATWG spec
 _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
 
-# XXX: Consider replacing with functools.lru_cache
-MAX_CACHE_SIZE = 20
-_parse_cache = {}
-
 def clear_cache():
-    """Clear the parse cache and the quoters cache."""
-    _parse_cache.clear()
-    _safe_quoters.clear()
-
+    """Clear internal performance caches. Undocumented; some tests want it."""
+    urlsplit.cache_clear()
+    _byte_quoter_factory.cache_clear()
 
 # Helpers for bytes handling
 # For 3.2, we deliberately require applications that
@@ -243,8 +239,6 @@ def _hostinfo(self):
         return hostname, port
 
 
-from collections import namedtuple
-
 _DefragResultBase = namedtuple('DefragResult', 'url fragment')
 _SplitResultBase = namedtuple(
     'SplitResult', 'scheme netloc path query fragment')
@@ -434,6 +428,9 @@ def _checknetloc(netloc):
             raise ValueError("netloc '" + netloc + "' contains invalid " +
                              "characters under NFKC normalization")
 
+# typed=True avoids BytesWarnings being emitted during cache key
+# comparison since this API supports both bytes and str input.
+@functools.lru_cache(typed=True)
 def urlsplit(url, scheme='', allow_fragments=True):
     """Parse a URL into 5 components:
     <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -462,12 +459,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
         scheme = scheme.replace(b, "")
 
     allow_fragments = bool(allow_fragments)
-    key = url, scheme, allow_fragments, type(url), type(scheme)
-    cached = _parse_cache.get(key, None)
-    if cached:
-        return _coerce_result(cached)
-    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
-        clear_cache()
     netloc = query = fragment = ''
     i = url.find(':')
     if i > 0:
@@ -488,7 +479,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
         url, query = url.split('?', 1)
     _checknetloc(netloc)
     v = SplitResult(scheme, netloc, url, query, fragment)
-    _parse_cache[key] = v
     return _coerce_result(v)
 
 def urlunparse(components):
@@ -791,23 +781,30 @@ def unquote_plus(string, encoding='utf-8', errors='replace'):
                          b'0123456789'
                          b'_.-~')
 _ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
-_safe_quoters = {}
 
-class Quoter(collections.defaultdict):
-    """A mapping from bytes (in range(0,256)) to strings.
+def __getattr__(name):
+    if name == 'Quoter':
+        warnings.warn('Deprecated in 3.11. '
+                      'urllib.parse.Quoter will be removed in Python 3.14. '
+                      'It was not intended to be a public API.',
+                      DeprecationWarning, stacklevel=2)
+        return _Quoter
+    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
+
+class _Quoter(dict):
+    """A mapping from bytes numbers (in range(0,256)) to strings.
 
     String values are percent-encoded byte values, unless the key < 128, and
-    in the "safe" set (either the specified safe set, or default set).
+    in either of the specified safe set, or the always safe set.
     """
-    # Keeps a cache internally, using defaultdict, for efficiency (lookups
+    # Keeps a cache internally, via __missing__, for efficiency (lookups
     # of cached keys don't call Python code at all).
     def __init__(self, safe):
         """safe: bytes object."""
         self.safe = _ALWAYS_SAFE.union(safe)
 
     def __repr__(self):
-        # Without this, will just display as a defaultdict
-        return "<%s %r>" % (self.__class__.__name__, dict(self))
+        return f"<Quoter {dict(self)!r}>"
 
     def __missing__(self, b):
         # Handle a cache miss. Store quoted string in cache and return.
@@ -886,6 +883,11 @@ def quote_plus(string, safe='', encoding=None, errors=None):
     string = quote(string, safe + space, encoding, errors)
     return string.replace(' ', '+')
 
+# Expectation: A typical program is unlikely to create more than 5 of these.
+@functools.lru_cache
+def _byte_quoter_factory(safe):
+    return _Quoter(safe).__getitem__
+
 def quote_from_bytes(bs, safe='/'):
     """Like quote(), but accepts a bytes object rather than a str, and does
     not perform string-to-bytes encoding.  It always returns an ASCII string.
@@ -899,13 +901,11 @@ def quote_from_bytes(bs, safe='/'):
         # Normalize 'safe' by converting to bytes and removing non-ASCII chars
         safe = safe.encode('ascii', 'ignore')
     else:
+        # List comprehensions are faster than generator expressions.
         safe = bytes([c for c in safe if c < 128])
     if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
         return bs.decode()
-    try:
-        quoter = _safe_quoters[safe]
-    except KeyError:
-        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
+    quoter = _byte_quoter_factory(safe)
     return ''.join([quoter(char) for char in bs])
 
 def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
diff --git a/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst b/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst
@@ -0,0 +1,5 @@
+:mod:`urllib.parse` now uses :func:`functools.lru_cache` for its internal URL
+splitting and quoting caches instead of rolling its own like it's the '90s.
+
+The undocumented internal :mod:`urllib.parse` ``Quoter`` class API is now
+deprecated, for removal in 3.14.