From bd3cb12c9f77bf651239cc1c0426d170d54c3026 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 18 Oct 2022 16:44:30 +0200 Subject: [PATCH] gh-98401: Reject invalid escape sequences in strings A backslash-character pair that is not a valid escape sequence now generates a SyntaxError. --- Doc/library/re.rst | 6 +-- Doc/reference/lexical_analysis.rst | 7 +-- Doc/whatsnew/3.12.rst | 4 ++ Lib/test/test_codecs.py | 20 +++---- Lib/test/test_codeop.py | 10 ++-- Lib/test/test_fstring.py | 6 +-- Lib/test/test_string_literals.py | 30 ++++------- ...2-10-18-17-38-22.gh-issue-98401.3kHNtJ.rst | 2 + Objects/bytesobject.c | 11 ++-- Objects/unicodeobject.c | 11 ++-- Parser/string_parser.c | 54 ++++++++++--------- 11 files changed, 77 insertions(+), 84 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-10-18-17-38-22.gh-issue-98401.3kHNtJ.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 5b304f717b07fa..de69990fd9d1ab 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -29,9 +29,9 @@ a literal backslash, one might have to write ``'\\\\'`` as the pattern string, because the regular expression must be ``\\``, and each backslash must be expressed as ``\\`` inside a regular Python string literal. Also, please note that any invalid escape sequences in Python's -usage of the backslash in string literals now generate a :exc:`DeprecationWarning` -and in the future this will become a :exc:`SyntaxError`. This behaviour -will happen even if it is a valid escape sequence for a regular expression. +usage of the backslash in string literals now generate a :exc:`SyntaxError`. +This behaviour will happen even if it is a valid escape sequence for a regular +expression. The solution is to use Python's raw string notation for regular expression patterns; backslashes are not handled in any special way in a string literal diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst index 4ab6e90a623449..c82dee8304e4dc 100644 --- a/Doc/reference/lexical_analysis.rst +++ b/Doc/reference/lexical_analysis.rst @@ -646,9 +646,10 @@ escape sequences only recognized in string literals fall into the category of unrecognized escapes for bytes literals. .. versionchanged:: 3.6 - Unrecognized escape sequences produce a :exc:`DeprecationWarning`. In - a future Python version they will be a :exc:`SyntaxWarning` and - eventually a :exc:`SyntaxError`. + Unrecognized escape sequences produce a :exc:`DeprecationWarning`. + + .. versionchanged:: 3.12 + Unrecognized escape sequences produce a :exc:`SyntaxError`. Even in a raw literal, quotes can be escaped with a backslash, but the backslash remains in the result; for example, ``r"\""`` is a valid string diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 525efc405c8520..9907c75cd0e6be 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -79,6 +79,10 @@ New Features Other Language Changes ====================== +* A backslash-character pair that is not a valid escape sequence now generates + a :exc:`SyntaxError`. + (Contributed by Victor Stinner in :gh:`98401`.) + * :class:`types.MappingProxyType` instances are now hashable if the underlying mapping is hashable. (Contributed by Serhiy Storchaka in :gh:`87995`.) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 57f3648eb7017c..b6ae7155d8c678 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1197,15 +1197,15 @@ def test_escape(self): for i in range(97, 123): b = bytes([i]) if b not in b'abfnrtvx': - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(b"\\" + b, b"\\" + b) - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(b"\\" + b.upper(), b"\\" + b.upper()) - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(br"\8", b"\\8") - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(br"\9", b"\\9") - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(b"\\\xfa", b"\\\xfa") for i in range(0o400, 0o1000): with self.assertWarns(DeprecationWarning): @@ -2425,16 +2425,16 @@ def test_escape_decode(self): for i in range(97, 123): b = bytes([i]) if b not in b'abfnrtuvx': - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(b"\\" + b, "\\" + chr(i)) if b.upper() not in b'UN': - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(b"\\" + b.upper(), "\\" + chr(i-32)) - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(br"\8", "\\8") - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(br"\9", "\\9") - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): check(b"\\\xfa", "\\\xfa") for i in range(0o400, 0o1000): with self.assertWarns(DeprecationWarning): diff --git a/Lib/test/test_codeop.py b/Lib/test/test_codeop.py index 133096d25a44bc..0b4ac87bc2cd77 100644 --- a/Lib/test/test_codeop.py +++ b/Lib/test/test_codeop.py @@ -313,7 +313,7 @@ def test_warning(self): (".*literal", SyntaxWarning), (".*invalid", DeprecationWarning), ) as w: - compile_command(r"'\e' is 0") + compile_command(r"'\777' is 0") self.assertEqual(len(w.warnings), 2) # bpo-41520: check SyntaxWarning treated as an SyntaxError @@ -324,21 +324,21 @@ def test_warning(self): # Check DeprecationWarning treated as an SyntaxError with warnings.catch_warnings(), self.assertRaises(SyntaxError): warnings.simplefilter('error', DeprecationWarning) - compile_command(r"'\e'", symbol='exec') + compile_command(r"'\777'", symbol='exec') def test_incomplete_warning(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') - self.assertIncomplete("'\\e' + (") + self.assertIncomplete("'\\777' + (") self.assertEqual(w, []) def test_invalid_warning(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') - self.assertInvalid("'\\e' 1") + self.assertInvalid("'\\777' 1") self.assertEqual(len(w), 1) self.assertEqual(w[0].category, DeprecationWarning) - self.assertRegex(str(w[0].message), 'invalid escape sequence') + self.assertRegex(str(w[0].message), 'invalid octal escape sequence') self.assertEqual(w[0].filename, '') diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index bf3a5b0bbccdfb..f60761106b2e08 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -776,9 +776,9 @@ def test_backslashes_in_string_part(self): self.assertEqual(f'2\x203', '2 3') self.assertEqual(f'\x203', ' 3') - with self.assertWarns(DeprecationWarning): # invalid escape sequence - value = eval(r"f'\{6*7}'") - self.assertEqual(value, '\\42') + with self.assertRaisesRegex(SyntaxError, 'invalid escape sequence'): + eval(r"f'\{6*7}'") + self.assertEqual(f'\\{6*7}', '\\42') self.assertEqual(fr'\{6*7}', '\\42') diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py index 7247b7e48bc2b6..0ed4e6703f81b5 100644 --- a/Lib/test/test_string_literals.py +++ b/Lib/test/test_string_literals.py @@ -109,23 +109,12 @@ def test_eval_str_invalid_escape(self): for b in range(1, 128): if b in b"""\n\r"'01234567NU\\abfnrtuvx""": continue - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): self.assertEqual(eval(r"'\%c'" % b), '\\' + chr(b)) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always', category=DeprecationWarning) + with self.assertRaises(SyntaxError) as cm: eval("'''\n\\z'''") - self.assertEqual(len(w), 1) - self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'") - self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('error', category=DeprecationWarning) - with self.assertRaises(SyntaxError) as cm: - eval("'''\n\\z'''") - exc = cm.exception - self.assertEqual(w, []) + exc = cm.exception self.assertEqual(exc.msg, r"invalid escape sequence '\z'") self.assertEqual(exc.filename, '') self.assertEqual(exc.lineno, 1) @@ -186,16 +175,15 @@ def test_eval_bytes_invalid_escape(self): for b in range(1, 128): if b in b"""\n\r"'01234567\\abfnrtvx""": continue - with self.assertWarns(DeprecationWarning): + with self.assertRaises(SyntaxError): self.assertEqual(eval(r"b'\%c'" % b), b'\\' + bytes([b])) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always', category=DeprecationWarning) + with self.assertRaises(SyntaxError) as cm: eval("b'''\n\\z'''") - self.assertEqual(len(w), 1) - self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'") - self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + exc = cm.exception + self.assertEqual(exc.msg, r"invalid escape sequence '\z'") + self.assertEqual(exc.filename, '') + self.assertEqual(exc.lineno, 1) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('error', category=DeprecationWarning) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-18-17-38-22.gh-issue-98401.3kHNtJ.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-18-17-38-22.gh-issue-98401.3kHNtJ.rst new file mode 100644 index 00000000000000..5b113ae50da62c --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-18-17-38-22.gh-issue-98401.3kHNtJ.rst @@ -0,0 +1,2 @@ +A backslash-character pair that is not a valid escape sequence now generates +a :exc:`SyntaxError`. Patch by Victor Stinner. diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 80660881920fb7..f783a684b5eb39 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1192,13 +1192,10 @@ PyObject *PyBytes_DecodeEscape(const char *s, } } else { - if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "invalid escape sequence '\\%c'", - c) < 0) - { - Py_DECREF(result); - return NULL; - } + PyErr_Format(PyExc_SyntaxError, + "invalid escape sequence '\\%c'", c); + Py_DECREF(result); + return NULL; } } return result; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d090915146f804..022f674a41e2b7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5967,13 +5967,10 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, } } else { - if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "invalid escape sequence '\\%c'", - c) < 0) - { - Py_DECREF(result); - return NULL; - } + PyErr_Format(PyExc_SyntaxError, + "invalid escape sequence '\\%c'", c); + Py_DECREF(result); + return NULL; } } return result; diff --git a/Parser/string_parser.c b/Parser/string_parser.c index 9bc3b082136be5..942bf823eea9b3 100644 --- a/Parser/string_parser.c +++ b/Parser/string_parser.c @@ -13,38 +13,42 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token { unsigned char c = *first_invalid_escape; int octal = ('4' <= c && c <= '7'); - PyObject *msg = - octal - ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'", - first_invalid_escape) - : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c); - if (msg == NULL) { - return -1; - } - if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename, - t->lineno, NULL, NULL) < 0) { - if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) { - /* Replace the DeprecationWarning exception with a SyntaxError - to get a more accurate error report */ - PyErr_Clear(); - - /* This is needed, in order for the SyntaxError to point to the token t, - since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the - error location, if p->known_err_token is not set. */ - p->known_err_token = t; - if (octal) { + + if (octal) { + PyObject *msg = PyUnicode_FromFormat( + "invalid octal escape sequence '\\%.3s'", + first_invalid_escape); + if (msg == NULL) { + return -1; + } + if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename, + t->lineno, NULL, NULL) < 0) { + if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) { + /* Replace the DeprecationWarning exception with a SyntaxError + to get a more accurate error report */ + PyErr_Clear(); + + /* This is needed, in order for the SyntaxError to point to the token t, + since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the + error location, if p->known_err_token is not set. */ + p->known_err_token = t; RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'", first_invalid_escape); } - else { - RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); - } + Py_DECREF(msg); + return -1; } Py_DECREF(msg); + return 0; + } + else { + /* This is needed, in order for the SyntaxError to point to the token t, + since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the + error location, if p->known_err_token is not set. */ + p->known_err_token = t; + RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); return -1; } - Py_DECREF(msg); - return 0; } static PyObject *