From d7e43791a5b6515bbf12f651a0649c80375906eb Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Wed, 1 Jan 2025 20:32:14 +0000 Subject: [PATCH 1/5] gh-124363: Treat debug expressions in f-string as raw strings --- Parser/action_helpers.c | 78 ++++++++++++++++++----------------------- Parser/lexer/lexer.c | 6 ++-- 2 files changed, 37 insertions(+), 47 deletions(-) diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 5ac1dd7813689c..d44fedaec53533 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -969,8 +969,6 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv) return result_token_with_metadata(p, conv, conv_token->metadata); } -static asdl_expr_seq * -unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions); ResultTokenWithMetadata * _PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, int lineno, int col_offset, int end_lineno, int end_col_offset, PyArena *arena) @@ -1251,7 +1249,6 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq static expr_ty _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) { assert(PyUnicode_CheckExact(constant->v.Constant.value)); - const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value); if (bstr == NULL) { return NULL; @@ -1279,9 +1276,9 @@ _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* tok p->arena); } -static asdl_expr_seq * -unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions) -{ +expr_ty +_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* expr, Token*b) { + /* The parser might put multiple f-string values into an individual * JoinedStr node at the top level due to stuff like f-string debugging * expressions. This function flattens those and promotes them to the @@ -1289,44 +1286,14 @@ unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions) * of the regular output, so this is not necessary if you are not going * to expose the output AST to Python level. */ - Py_ssize_t i, req_size, raw_size; - - req_size = raw_size = asdl_seq_LEN(raw_expressions); - expr_ty expr; - for (i = 0; i < raw_size; i++) { - expr = asdl_seq_GET(raw_expressions, i); - if (expr->kind == JoinedStr_kind) { - req_size += asdl_seq_LEN(expr->v.JoinedStr.values) - 1; - } - } - - asdl_expr_seq *expressions = _Py_asdl_expr_seq_new(req_size, p->arena); - if (expressions == NULL) { - return NULL; - } - - Py_ssize_t raw_index, req_index = 0; - for (raw_index = 0; raw_index < raw_size; raw_index++) { - expr = asdl_seq_GET(raw_expressions, raw_index); - if (expr->kind == JoinedStr_kind) { - asdl_expr_seq *values = expr->v.JoinedStr.values; - for (Py_ssize_t n = 0; n < asdl_seq_LEN(values); n++) { - asdl_seq_SET(expressions, req_index, asdl_seq_GET(values, n)); - req_index++; - } - } else { - asdl_seq_SET(expressions, req_index, expr); - req_index++; + Py_ssize_t n_items = asdl_seq_LEN(expr); + Py_ssize_t total_items = n_items; + for (Py_ssize_t i = 0; i < n_items; i++) { + expr_ty item = asdl_seq_GET(expr, i); + if (item->kind == JoinedStr_kind) { + total_items += asdl_seq_LEN(item->v.JoinedStr.values) - 1; } } - return expressions; -} - -expr_ty -_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) { - - asdl_expr_seq *expr = unpack_top_level_joined_strs(p, raw_expressions); - Py_ssize_t n_items = asdl_seq_LEN(expr); const char* quote_str = PyBytes_AsString(a->bytes); if (quote_str == NULL) { @@ -1334,7 +1301,7 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b } int is_raw = strpbrk(quote_str, "rR") != NULL; - asdl_expr_seq *seq = _Py_asdl_expr_seq_new(n_items, p->arena); + asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena); if (seq == NULL) { return NULL; } @@ -1342,6 +1309,29 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b Py_ssize_t index = 0; for (Py_ssize_t i = 0; i < n_items; i++) { expr_ty item = asdl_seq_GET(expr, i); + + // This should correspond to a JoinedStr node of two elements + // created _PyPegen_formatted_value + if (item->kind == JoinedStr_kind) { + asdl_expr_seq *values = item->v.JoinedStr.values; + if (asdl_seq_LEN(values) != 2) { + PyErr_Format(PyExc_SystemError, + "unexpected JoinedStr node without debug data in f-string at line %d", + item->lineno); + return NULL; + } + + expr_ty first = asdl_seq_GET(values, 0); + assert(first->kind == Constant_kind); + asdl_seq_SET(seq, index++, first); + + expr_ty second = asdl_seq_GET(values, 1); + assert(second->kind == FormattedValue_kind); + asdl_seq_SET(seq, index++, second); + + continue; + } + if (item->kind == Constant_kind) { item = _PyPegen_decode_fstring_part(p, is_raw, item, b); if (item == NULL) { @@ -1360,7 +1350,7 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b } asdl_expr_seq *resized_exprs; - if (index != n_items) { + if (index != total_items) { resized_exprs = _Py_asdl_expr_seq_new(index, p->arena); if (resized_exprs == NULL) { return NULL; diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 8c5ae37fa90860..bbbad8c71d55cf 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -153,13 +153,13 @@ set_fstring_expr(struct tok_state* tok, struct token *token, char c) { } result[j] = '\0'; // Null-terminate the result string - res = PyUnicode_DecodeUTF8(result, j, NULL); + res = PyUnicode_DecodeUTF8Stateful(result, j, NULL, NULL); PyMem_Free(result); } else { - res = PyUnicode_DecodeUTF8( + res = PyUnicode_DecodeUTF8Stateful( tok_mode->last_expr_buffer, tok_mode->last_expr_size - tok_mode->last_expr_end, - NULL + NULL, NULL ); } From e3d2f73c6963393ea6b694ac0279ed5ef3200154 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Tue, 21 Jan 2025 19:48:35 +0000 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2025-01-21-19-48-30.gh-issue-124363.vOFhHW.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-01-21-19-48-30.gh-issue-124363.vOFhHW.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-01-21-19-48-30.gh-issue-124363.vOFhHW.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-21-19-48-30.gh-issue-124363.vOFhHW.rst new file mode 100644 index 00000000000000..553aa5a4dd573e --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-21-19-48-30.gh-issue-124363.vOFhHW.rst @@ -0,0 +1 @@ +Treat debug expressions in f-string as raw strings. Patch by Pablo Galindo From 13f4e21578f635b8f34b23b5211a769783b656e2 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Wed, 22 Jan 2025 15:25:43 +0000 Subject: [PATCH 3/5] Update Parser/action_helpers.c --- Parser/action_helpers.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index d44fedaec53533..1bfe3cb28be6c4 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1311,7 +1311,9 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* expr, Token*b) { expr_ty item = asdl_seq_GET(expr, i); // This should correspond to a JoinedStr node of two elements - // created _PyPegen_formatted_value + // created _PyPegen_formatted_value. This situation can only be the result of + // a f-string debug expression where the first element is a constant with the text and the second + // a formatted value with the expression. if (item->kind == JoinedStr_kind) { asdl_expr_seq *values = item->v.JoinedStr.values; if (asdl_seq_LEN(values) != 2) { From 3f86aa3a22a528ab8d66edcc64e17413db5838b4 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Wed, 22 Jan 2025 15:52:25 +0000 Subject: [PATCH 4/5] simplify things and add test --- Lib/test/test_fstring.py | 8 ++++++++ Parser/action_helpers.c | 1 + Parser/lexer/lexer.c | 6 +++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 31d91215889460..1d96b7a2c2459b 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1649,6 +1649,14 @@ def __repr__(self): #self.assertEqual(f'X{x =}Y', 'Xx\t='+repr(x)+'Y') #self.assertEqual(f'X{x = }Y', 'Xx\t=\t'+repr(x)+'Y') + def test_debug_expressions_are_raw_strings(self): + + self.assertEqual(f'{b"\N{OX}"=}', 'b"\\N{OX}"=b\'\\\\N{OX}\'') + self.assertEqual(f'{r"\xff"=}', 'r"\\xff"=\'\\\\xff\'') + self.assertEqual(f'{r"\n"=}', 'r"\\n"=\'\\\\n\'') + self.assertEqual(f"{'\''=}", "'\\''=\"'\"") + self.assertEqual(f'{'\xc5'=}', r"'\xc5'='Å'") + def test_walrus(self): x = 20 # This isn't an assignment expression, it's 'x', with a format diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 1bfe3cb28be6c4..4d35be1699f25e 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1248,6 +1248,7 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq static expr_ty _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) { + assert(PyUnicode_CheckExact(constant->v.Constant.value)); const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value); if (bstr == NULL) { diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index bbbad8c71d55cf..8c5ae37fa90860 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -153,13 +153,13 @@ set_fstring_expr(struct tok_state* tok, struct token *token, char c) { } result[j] = '\0'; // Null-terminate the result string - res = PyUnicode_DecodeUTF8Stateful(result, j, NULL, NULL); + res = PyUnicode_DecodeUTF8(result, j, NULL); PyMem_Free(result); } else { - res = PyUnicode_DecodeUTF8Stateful( + res = PyUnicode_DecodeUTF8( tok_mode->last_expr_buffer, tok_mode->last_expr_size - tok_mode->last_expr_end, - NULL, NULL + NULL ); } From 87c732c2a16598601652508960c1b8457e7b9555 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Wed, 22 Jan 2025 15:58:58 +0000 Subject: [PATCH 5/5] fixup! simplify things and add test --- Parser/action_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 4d35be1699f25e..2fe8d11badcbac 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1248,8 +1248,8 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq static expr_ty _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) { - assert(PyUnicode_CheckExact(constant->v.Constant.value)); + const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value); if (bstr == NULL) { return NULL;