Update fnmatch from CPython 3.10

CPython Developers · youknowone · commit 3d4fe934a1cd · 2022-08-16T10:08:03.000+09:00
diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py
@@ -9,16 +9,19 @@
 The function translate(PATTERN) returns a regular expression
 corresponding to PATTERN.  (It does not compile it.)
 """
-try:
-    import os
-except ImportError:
-    import _dummy_os as os
+import os
 import posixpath
 import re
 import functools
 
 __all__ = ["filter", "fnmatch", "fnmatchcase", "translate"]
 
+# Build a thread-safe incrementing counter to help create unique regexp group
+# names across calls.
+from itertools import count
+_nextgroupnum = count().__next__
+del count
+
 def fnmatch(name, pat):
     """Test whether FILENAME matches PATTERN.
 
@@ -49,7 +52,7 @@ def _compile_pattern(pat):
     return re.compile(res).match
 
 def filter(names, pat):
-    """Return the subset of the list NAMES that match PAT."""
+    """Construct a list from those elements of the iterable NAMES that match PAT."""
     result = []
     pat = os.path.normcase(pat)
     match = _compile_pattern(pat)
@@ -80,15 +83,19 @@ def translate(pat):
     There is no way to quote meta-characters.
     """
 
+    STAR = object()
+    res = []
+    add = res.append
     i, n = 0, len(pat)
-    res = ''
     while i < n:
         c = pat[i]
         i = i+1
         if c == '*':
-            res = res + '.*'
+            # compress consecutive `*` into one
+            if (not res) or res[-1] is not STAR:
+                add(STAR)
         elif c == '?':
-            res = res + '.'
+            add('.')
         elif c == '[':
             j = i
             if j < n and pat[j] == '!':
@@ -98,10 +105,10 @@ def translate(pat):
             while j < n and pat[j] != ']':
                 j = j+1
             if j >= n:
-                res = res + '\\['
+                add('\\[')
             else:
                 stuff = pat[i:j]
-                if '--' not in stuff:
+                if '-' not in stuff:
                     stuff = stuff.replace('\\', r'\\')
                 else:
                     chunks = []
@@ -113,19 +120,80 @@ def translate(pat):
                         chunks.append(pat[i:k])
                         i = k+1
                         k = k+3
-                    chunks.append(pat[i:j])
+                    chunk = pat[i:j]
+                    if chunk:
+                        chunks.append(chunk)
+                    else:
+                        chunks[-1] += '-'
+                    # Remove empty ranges -- invalid in RE.
+                    for k in range(len(chunks)-1, 0, -1):
+                        if chunks[k-1][-1] > chunks[k][0]:
+                            chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:]
+                            del chunks[k]
                     # Escape backslashes and hyphens for set difference (--).
                     # Hyphens that create ranges shouldn't be escaped.
                     stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
                                      for s in chunks)
                 # Escape set operations (&&, ~~ and ||).
                 stuff = re.sub(r'([&~|])', r'\\\1', stuff)
                 i = j+1
-                if stuff[0] == '!':
-                    stuff = '^' + stuff[1:]
-                elif stuff[0] in ('^', '['):
-                    stuff = '\\' + stuff
-                res = '%s[%s]' % (res, stuff)
+                if not stuff:
+                    # Empty range: never match.
+                    add('(?!)')
+                elif stuff == '!':
+                    # Negated empty range: match any character.
+                    add('.')
+                else:
+                    if stuff[0] == '!':
+                        stuff = '^' + stuff[1:]
+                    elif stuff[0] in ('^', '['):
+                        stuff = '\\' + stuff
+                    add(f'[{stuff}]')
+        else:
+            add(re.escape(c))
+    assert i == n
+
+    # Deal with STARs.
+    inp = res
+    res = []
+    add = res.append
+    i, n = 0, len(inp)
+    # Fixed pieces at the start?
+    while i < n and inp[i] is not STAR:
+        add(inp[i])
+        i += 1
+    # Now deal with STAR fixed STAR fixed ...
+    # For an interior `STAR fixed` pairing, we want to do a minimal
+    # .*? match followed by `fixed`, with no possibility of backtracking.
+    # We can't spell that directly, but can trick it into working by matching
+    #    .*?fixed
+    # in a lookahead assertion, save the matched part in a group, then
+    # consume that group via a backreference. If the overall match fails,
+    # the lookahead assertion won't try alternatives. So the translation is:
+    #     (?=(?P<name>.*?fixed))(?P=name)
+    # Group names are created as needed: g0, g1, g2, ...
+    # The numbers are obtained from _nextgroupnum() to ensure they're unique
+    # across calls and across threads. This is because people rely on the
+    # undocumented ability to join multiple translate() results together via
+    # "|" to build large regexps matching "one of many" shell patterns.
+    while i < n:
+        assert inp[i] is STAR
+        i += 1
+        if i == n:
+            add(".*")
+            break
+        assert inp[i] is not STAR
+        fixed = []
+        while i < n and inp[i] is not STAR:
+            fixed.append(inp[i])
+            i += 1
+        fixed = "".join(fixed)
+        if i == n:
+            add(".*")
+            add(fixed)
         else:
-            res = res + re.escape(c)
-    return r'(?s:%s)\Z' % res
+            groupnum = _nextgroupnum()
+            add(f"(?=(?P<g{groupnum}>.*?{fixed}))(?P=g{groupnum})")
+    assert i == n
+    res = "".join(res)
+    return fr'(?s:{res})\Z'
diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py
@@ -2,6 +2,7 @@
 
 import unittest
 import os
+import string
 import warnings
 
 from fnmatch import fnmatch, fnmatchcase, translate, filter
@@ -45,6 +46,13 @@ def test_fnmatch(self):
         check('\nfoo', 'foo*', False)
         check('\n', '*')
 
+    def test_slow_fnmatch(self):
+        check = self.check_match
+        check('a' * 50, '*a*a*a*a*a*a*a*a*a*a')
+        # The next "takes forever" if the regexp translation is
+        # straightforward.  See bpo-40480.
+        check('a' * 50 + 'b', '*a*a*a*a*a*a*a*a*a*a', False)
+
     def test_mix_bytes_str(self):
         self.assertRaises(TypeError, fnmatch, 'test', b'*')
         self.assertRaises(TypeError, fnmatch, b'test', '*')
@@ -89,6 +97,119 @@ def test_sep(self):
         check('usr/bin', 'usr\\bin', normsep)
         check('usr\\bin', 'usr\\bin')
 
+    def test_char_set(self):
+        ignorecase = os.path.normcase('ABC') == os.path.normcase('abc')
+        check = self.check_match
+        tescases = string.ascii_lowercase + string.digits + string.punctuation
+        for c in tescases:
+            check(c, '[az]', c in 'az')
+            check(c, '[!az]', c not in 'az')
+        # Case insensitive.
+        for c in tescases:
+            check(c, '[AZ]', (c in 'az') and ignorecase)
+            check(c, '[!AZ]', (c not in 'az') or not ignorecase)
+        for c in string.ascii_uppercase:
+            check(c, '[az]', (c in 'AZ') and ignorecase)
+            check(c, '[!az]', (c not in 'AZ') or not ignorecase)
+        # Repeated same character.
+        for c in tescases:
+            check(c, '[aa]', c == 'a')
+        # Special cases.
+        for c in tescases:
+            check(c, '[^az]', c in '^az')
+            check(c, '[[az]', c in '[az')
+            check(c, r'[!]]', c != ']')
+        check('[', '[')
+        check('[]', '[]')
+        check('[!', '[!')
+        check('[!]', '[!]')
+
+    def test_range(self):
+        ignorecase = os.path.normcase('ABC') == os.path.normcase('abc')
+        normsep = os.path.normcase('\\') == os.path.normcase('/')
+        check = self.check_match
+        tescases = string.ascii_lowercase + string.digits + string.punctuation
+        for c in tescases:
+            check(c, '[b-d]', c in 'bcd')
+            check(c, '[!b-d]', c not in 'bcd')
+            check(c, '[b-dx-z]', c in 'bcdxyz')
+            check(c, '[!b-dx-z]', c not in 'bcdxyz')
+        # Case insensitive.
+        for c in tescases:
+            check(c, '[B-D]', (c in 'bcd') and ignorecase)
+            check(c, '[!B-D]', (c not in 'bcd') or not ignorecase)
+        for c in string.ascii_uppercase:
+            check(c, '[b-d]', (c in 'BCD') and ignorecase)
+            check(c, '[!b-d]', (c not in 'BCD') or not ignorecase)
+        # Upper bound == lower bound.
+        for c in tescases:
+            check(c, '[b-b]', c == 'b')
+        # Special cases.
+        for c in tescases:
+            check(c, '[!-#]', c not in '-#')
+            check(c, '[!--.]', c not in '-.')
+            check(c, '[^-`]', c in '^_`')
+            if not (normsep and c == '/'):
+                check(c, '[[-^]', c in r'[\]^')
+                check(c, r'[\-^]', c in r'\]^')
+            check(c, '[b-]', c in '-b')
+            check(c, '[!b-]', c not in '-b')
+            check(c, '[-b]', c in '-b')
+            check(c, '[!-b]', c not in '-b')
+            check(c, '[-]', c in '-')
+            check(c, '[!-]', c not in '-')
+        # Upper bound is less than lower bound: error in RE.
+        for c in tescases:
+            check(c, '[d-b]', False)
+            check(c, '[!d-b]', True)
+            check(c, '[d-bx-z]', c in 'xyz')
+            check(c, '[!d-bx-z]', c not in 'xyz')
+            check(c, '[d-b^-`]', c in '^_`')
+            if not (normsep and c == '/'):
+                check(c, '[d-b[-^]', c in r'[\]^')
+
+    def test_sep_in_char_set(self):
+        normsep = os.path.normcase('\\') == os.path.normcase('/')
+        check = self.check_match
+        check('/', r'[/]')
+        check('\\', r'[\]')
+        check('/', r'[\]', normsep)
+        check('\\', r'[/]', normsep)
+        check('[/]', r'[/]', False)
+        check(r'[\\]', r'[/]', False)
+        check('\\', r'[\t]')
+        check('/', r'[\t]', normsep)
+        check('t', r'[\t]')
+        check('\t', r'[\t]', False)
+
+    def test_sep_in_range(self):
+        normsep = os.path.normcase('\\') == os.path.normcase('/')
+        check = self.check_match
+        check('a/b', 'a[.-0]b', not normsep)
+        check('a\\b', 'a[.-0]b', False)
+        check('a\\b', 'a[Z-^]b', not normsep)
+        check('a/b', 'a[Z-^]b', False)
+
+        check('a/b', 'a[/-0]b', not normsep)
+        check(r'a\b', 'a[/-0]b', False)
+        check('a[/-0]b', 'a[/-0]b', False)
+        check(r'a[\-0]b', 'a[/-0]b', False)
+
+        check('a/b', 'a[.-/]b')
+        check(r'a\b', 'a[.-/]b', normsep)
+        check('a[.-/]b', 'a[.-/]b', False)
+        check(r'a[.-\]b', 'a[.-/]b', False)
+
+        check(r'a\b', r'a[\-^]b')
+        check('a/b', r'a[\-^]b', normsep)
+        check(r'a[\-^]b', r'a[\-^]b', False)
+        check('a[/-^]b', r'a[\-^]b', False)
+
+        check(r'a\b', r'a[Z-\]b', not normsep)
+        check('a/b', r'a[Z-\]b', False)
+        check(r'a[Z-\]b', r'a[Z-\]b', False)
+        check('a[Z-/]b', r'a[Z-\]b', False)
+
     def test_warnings(self):
         with warnings.catch_warnings():
             warnings.simplefilter('error', Warning)
@@ -104,6 +225,7 @@ def test_warnings(self):
 class TranslateTestCase(unittest.TestCase):
 
     def test_translate(self):
+        import re
         self.assertEqual(translate('*'), r'(?s:.*)\Z')
         self.assertEqual(translate('?'), r'(?s:.)\Z')
         self.assertEqual(translate('a?b*'), r'(?s:a.b.*)\Z')
@@ -112,7 +234,34 @@ def test_translate(self):
         self.assertEqual(translate('[!x]'), r'(?s:[^x])\Z')
         self.assertEqual(translate('[^x]'), r'(?s:[\^x])\Z')
         self.assertEqual(translate('[x'), r'(?s:\[x)\Z')
-
+        # from the docs
+        self.assertEqual(translate('*.txt'), r'(?s:.*\.txt)\Z')
+        # squash consecutive stars
+        self.assertEqual(translate('*********'), r'(?s:.*)\Z')
+        self.assertEqual(translate('A*********'), r'(?s:A.*)\Z')
+        self.assertEqual(translate('*********A'), r'(?s:.*A)\Z')
+        self.assertEqual(translate('A*********?[?]?'), r'(?s:A.*.[?].)\Z')
+        # fancy translation to prevent exponential-time match failure
+        t = translate('**a*a****a')
+        digits = re.findall(r'\d+', t)
+        self.assertEqual(len(digits), 4)
+        self.assertEqual(digits[0], digits[1])
+        self.assertEqual(digits[2], digits[3])
+        g1 = f"g{digits[0]}"  # e.g., group name "g4"
+        g2 = f"g{digits[2]}"  # e.g., group name "g5"
+        self.assertEqual(t,
+         fr'(?s:(?=(?P<{g1}>.*?a))(?P={g1})(?=(?P<{g2}>.*?a))(?P={g2}).*a)\Z')
+        # and try pasting multiple translate results - it's an undocumented
+        # feature that this works; all the pain of generating unique group
+        # names across calls exists to support this
+        r1 = translate('**a**a**a*')
+        r2 = translate('**b**b**b*')
+        r3 = translate('*c*c*c*')
+        fatre = "|".join([r1, r2, r3])
+        self.assertTrue(re.match(fatre, 'abaccad'))
+        self.assertTrue(re.match(fatre, 'abxbcab'))
+        self.assertTrue(re.match(fatre, 'cbabcaxc'))
+        self.assertFalse(re.match(fatre, 'dabccbad'))
 
 class FilterTestCase(unittest.TestCase):