gh-115154: Fix untokenize handling of unicode named literals #115171

Merged
8 commits merged on Feb 19, 2024
40 changes: 37 additions & 3 deletions Lib/test/test_tokenize.py
@@ -1877,6 +1877,43 @@ def test_roundtrip(self):
" print('Can not import' # comment2\n)"
"else: print('Loaded')\n")

        self.check_roundtrip("f'\\N{EXCLAMATION MARK}'")
        self.check_roundtrip(r"f'\\N{SNAKE}'")
        self.check_roundtrip(r"f'\\N{{SNAKE}}'")
        self.check_roundtrip(r"f'\N{SNAKE}'")
        self.check_roundtrip(r"f'\\\N{SNAKE}'")
        self.check_roundtrip(r"f'\\\\\N{SNAKE}'")
        self.check_roundtrip(r"f'\\\\\\\N{SNAKE}'")

        self.check_roundtrip(r"f'\\N{1}'")
        self.check_roundtrip(r"f'\\\\N{2}'")
        self.check_roundtrip(r"f'\\\\\\N{3}'")
        self.check_roundtrip(r"f'\\\\\\\\N{4}'")

        self.check_roundtrip(r"f'\\N{{'")
        self.check_roundtrip(r"f'\\\\N{{'")
        self.check_roundtrip(r"f'\\\\\\N{{'")
        self.check_roundtrip(r"f'\\\\\\\\N{{'")
        cases = [
    """
if 1:
    "foo"
"bar"
""",
    """
if 1:
    ("foo"
"bar")
""",
    """
if 1:
    "foo"
        "bar"
"""]
        for case in cases:
            self.check_roundtrip(case)


    def test_continuation(self):
        # Balancing continuation
        self.check_roundtrip("a = (3,4, \n"
@@ -1911,9 +1948,6 @@ def test_random_files(self):
        tempdir = os.path.dirname(__file__) or os.curdir
        testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py"))

        # TODO: Remove this once we can untokenize PEP 701 syntax
        testfiles.remove(os.path.join(tempdir, "test_fstring.py"))

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

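For context, the check_roundtrip helper used by these tests asserts that tokenizing a source string, regenerating it with untokenize, and tokenizing the result again produces a matching token stream. A simplified stand-alone sketch of that property (assuming a tokenize module that includes this fix; the real helper also exercises the 2-tuple compat mode):

import io
import tokenize

def assert_roundtrip(source):
    # Tokenize, regenerate the source with untokenize, re-tokenize, compare.
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
    regenerated = tokenize.untokenize(tokens)
    new_tokens = list(tokenize.generate_tokens(io.StringIO(regenerated).readline))
    # Compare (type, string) pairs; exact positions may legitimately differ.
    assert [t[:2] for t in tokens] == [t[:2] for t in new_tokens]

assert_roundtrip(r"f'\N{SNAKE}'")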
53 changes: 46 additions & 7 deletions Lib/tokenize.py
@@ -168,6 +168,7 @@ def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.prev_type = None
        self.encoding = None

    def add_whitespace(self, start):
@@ -183,6 +184,29 @@ def add_whitespace(self, start):
        if col_offset:
            self.tokens.append(" " * col_offset)

    def escape_brackets(self, token):
        characters = []
        consume_until_next_bracket = False
        for character in token:
            if character == "}":
                if consume_until_next_bracket:
                    consume_until_next_bracket = False
                else:
                    characters.append(character)
            if character == "{":
                n_backslashes = sum(
                    1 for char in _itertools.takewhile(
                        "\\".__eq__,
                        characters[-2::-1]
                    )
                )
                if n_backslashes % 2 == 0:
                    characters.append(character)
                else:
                    consume_until_next_bracket = True
            characters.append(character)
        return "".join(characters)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
@@ -214,25 +238,29 @@ def untokenize(self, iterable):
                startline = False
            elif tok_type == FSTRING_MIDDLE:
                if '{' in token or '}' in token:
                    token = self.escape_brackets(token)
                    last_line = token.splitlines()[-1]
                    end_line, end_col = end
                    end = (end_line, end_col + token.count('{') + token.count('}'))
                    token = re.sub('{', '{{', token)
                    token = re.sub('}', '}}', token)

                    extra_chars = last_line.count("{{") + last_line.count("}}")
                    end = (end_line, end_col + extra_chars)
            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
                self.tokens.append(" ")

            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
            self.prev_type = tok_type
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False
        in_fstring = 0

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
@@ -251,6 +279,10 @@
            else:
                prevstring = False

            if toknum == FSTRING_START:
                in_fstring += 1
            elif toknum == FSTRING_END:
                in_fstring -= 1
            if toknum == INDENT:
                indents.append(tokval)
                continue
@@ -263,11 +295,18 @@
                toks_append(indents[-1])
                startline = False
            elif toknum == FSTRING_MIDDLE:
                if '{' in tokval or '}' in tokval:
                    tokval = re.sub('{', '{{', tokval)
                    tokval = re.sub('}', '}}', tokval)
                tokval = self.escape_brackets(tokval)

            # Insert a space between two consecutive brackets if we are in an f-string
            if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring:
                tokval = ' ' + tokval

            # Insert a space between two consecutive f-strings
            if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
                self.tokens.append(" ")

            toks_append(tokval)
            self.prev_type = toknum


def untokenize(iterable):
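The rule escape_brackets implements: when a "{" is reached, the characters[-2::-1] slice skips the character immediately before the brace (the "N" of a \N{...} escape) and counts the backslashes in front of that; an odd count means the brace opens a genuine named escape, so it and its closing "}" pass through untouched, while any other brace is a literal that must be doubled. Illustrative input/output pairs (calls on a fresh Untokenizer, mirroring the test cases above):

from tokenize import Untokenizer

u = Untokenizer()
u.escape_brackets(r"\N{SNAKE}")   # -> r"\N{SNAKE}"   (1 backslash: named escape, kept)
u.escape_brackets(r"\\N{4}")      # -> r"\\N{{4}}"    (2 backslashes: literal braces, doubled)
u.escape_brackets("{}")           # -> "{{}}"         (no backslashes: literal braces, doubled)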
@@ -0,0 +1,2 @@
Fix a bug that was causing the :func:`tokenize.untokenize` function to
handle unicode named literals incorrectly. Patch by Pablo Galindo
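A quick demonstration of the behavior this entry describes (an illustrative session, assuming an interpreter built with this patch):

import io
from tokenize import generate_tokens, untokenize

source = "x = f'\\N{EXCLAMATION MARK}'\n"
print(untokenize(generate_tokens(io.StringIO(source).readline)))
# Before the fix, untokenize doubled the braces of the named escape,
# producing x = f'\N{{EXCLAMATION MARK}}': the '!' named escape became
# literal text, so the regenerated source no longer matched the original.
# With the fix, the source round-trips unchanged.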