2929import collections
3030from io import TextIOWrapper
3131from itertools import chain
32+ import itertools as _itertools
3233import re
3334import sys
3435from token import *
@@ -131,7 +132,28 @@ def maybe(*choices): return group(*choices) + '?'
# Imaginary literals: bare digits with a j/J suffix, or any float form
# followed by j/J.
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
# Any numeric literal.  Imagnumber is listed first in the alternation so
# a trailing j/J is consumed as part of the number.
Number = group(Imagnumber, Floatnumber, Intnumber)
134- StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
135+ # Return the empty string, plus all of the valid string prefixes.
136+ def _all_string_prefixes ():
137+ # The valid string prefixes. Only contain the lower case versions,
138+ # and don't contain any permuations (include 'fr', but not
139+ # 'rf'). The various permutations will be generated.
140+ _valid_string_prefixes = ['b' , 'r' , 'u' , 'f' , 'br' , 'fr' ]
141+ # if we add binary f-strings, add: ['fb', 'fbr']
142+ result = set (['' ])
143+ for prefix in _valid_string_prefixes :
144+ for t in _itertools .permutations (prefix ):
145+ # create a list with upper and lower versions of each
146+ # character
147+ for u in _itertools .product (* [(c , c .upper ()) for c in t ]):
148+ result .add ('' .join (u ))
149+ return result
150+
151+ def _compile (expr ):
152+ return re .compile (expr , re .UNICODE )
153+
# _all_string_prefixes() already contains the empty string, so this
# alternation matches "no prefix" too and needs no trailing '?'.
StringPrefix = group(*_all_string_prefixes())
# Tail end of a '-quoted string: runs of characters that are neither a
# quote nor a backslash, with backslash-escaped characters allowed in
# between, ending at the closing quote.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -169,50 +191,25 @@ def maybe(*choices): return group(*choices) + '?'
# Things that can follow leading whitespace without being a real token:
# an escaped newline, end of input, a comment, or a triple-quote opener.
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# One pseudo-token: optional whitespace, then any recognizable lexeme.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# endpats maps a string prefix plus its opening quote(s) to the regex
# that matches the remainder of that string.  The empty prefix covers
# plain single- and triple-quoted strings.
endpats = {}
for _prefix in _all_string_prefixes():
    for _quote, _pat in (("'", Single), ('"', Double),
                         ("'''", Single3), ('"""', Double3)):
        endpats[_prefix + _quote] = _pat

# Every prefix combined with every opening quote, as two sets: one for
# single-quoted openers, one for triple-quoted openers.
single_quoted = {p + q for p in _all_string_prefixes() for q in ('"', "'")}
triple_quoted = {p + q for p in _all_string_prefixes() for q in ('"""', "'''")}
# Width of a hard tab stop, used when computing indentation columns.
tabsize = 8
@@ -626,6 +623,7 @@ def _tokenize(readline, encoding):
626623 yield stashed
627624 stashed = None
628625 yield TokenInfo (COMMENT , token , spos , epos , line )
626+
629627 elif token in triple_quoted :
630628 endprog = _compile (endpats [token ])
631629 endmatch = endprog .match (line , pos )
@@ -638,19 +636,37 @@ def _tokenize(readline, encoding):
638636 contstr = line [start :]
639637 contline = line
640638 break
641- elif initial in single_quoted or \
642- token [:2 ] in single_quoted or \
643- token [:3 ] in single_quoted :
639+
640+ # Check up to the first 3 chars of the token to see if
641+ # they're in the single_quoted set. If so, they start
642+ # a string.
643+ # We're using the first 3, because we're looking for
644+ # "rb'" (for example) at the start of the token. If
645+ # we switch to longer prefixes, this needs to be
646+ # adjusted.
647+ # Note that initial == token[:1].
648+ # Also note that single quote checking must come afer
649+ # triple quote checking (above).
650+ elif (initial in single_quoted or
651+ token [:2 ] in single_quoted or
652+ token [:3 ] in single_quoted ):
644653 if token [- 1 ] == '\n ' : # continued string
645654 strstart = (lnum , start )
646- endprog = _compile (endpats [initial ] or
647- endpats [token [1 ]] or
648- endpats [token [2 ]])
655+ # Again, using the first 3 chars of the
656+ # token. This is looking for the matching end
657+ # regex for the correct type of quote
658+ # character. So it's really looking for
659+ # endpats["'"] or endpats['"'], by trying to
660+ # skip string prefix characters, if any.
661+ endprog = _compile (endpats .get (initial ) or
662+ endpats .get (token [1 ]) or
663+ endpats .get (token [2 ]))
649664 contstr , needcont = line [start :], 1
650665 contline = line
651666 break
652667 else : # ordinary string
653668 yield TokenInfo (STRING , token , spos , epos , line )
669+
654670 elif initial .isidentifier (): # ordinary name
655671 if token in ('async' , 'await' ):
656672 if async_def :
0 commit comments