Fix issues due to breaking tokenize changes in 3.12

lysnikolaou · lysnikolaou · commit 1098b2c170ae · 2023-06-28T19:02:48.000+02:00
diff --git a/IPython/core/inputsplitter.py b/IPython/core/inputsplitter.py
@@ -44,6 +44,7 @@
                                            assign_from_system,
                                            assemble_python_lines,
                                            )
+from IPython.utils import tokenutil
 
 # These are available in this module for backwards compatibility.
 from IPython.core.inputtransformer import (ESC_SHELL, ESC_SH_CAP, ESC_HELP,
@@ -128,7 +129,7 @@ def partial_tokens(s):
     readline = io.StringIO(s).readline
     token = tokenize.TokenInfo(tokenize.NEWLINE, '', (1, 0), (1, 0), '')
     try:
-        for token in tokenize.generate_tokens(readline):
+        for token in tokenutil.generate_tokens_catch_errors(readline):
             yield token
     except tokenize.TokenError as e:
         # catch EOF error
@@ -150,9 +151,17 @@ def find_next_indent(code):
         tokens.pop()
     if not tokens:
         return 0
-    while (tokens[-1].type in {tokenize.DEDENT, tokenize.NEWLINE, tokenize.COMMENT}):
+
+    while (tokens[-1].type in {tokenize.DEDENT, tokenize.NEWLINE, tokenize.COMMENT, tokenize.ERRORTOKEN}):
         tokens.pop()
 
+    # Starting in Python 3.12, the tokenize module adds implicit newlines at the end
+    # of input. We need to remove those if we're in a multiline statement
+    if tokens[-1].type == IN_MULTILINE_STATEMENT:
+        while tokens[-2].type in {tokenize.NL}:
+            tokens.pop(-2)
+
+
     if tokens[-1].type == INCOMPLETE_STRING:
         # Inside a multiline string
         return 0
diff --git a/IPython/core/inputtransformer.py b/IPython/core/inputtransformer.py
@@ -9,10 +9,11 @@
 import functools
 import re
 import tokenize
-from tokenize import generate_tokens, untokenize, TokenError
+from tokenize import untokenize, TokenError
 from io import StringIO
 
 from IPython.core.splitinput import LineInfo
+from IPython.utils import tokenutil
 
 #-----------------------------------------------------------------------------
 # Globals
@@ -127,7 +128,7 @@ def __init__(self, func):
 
     def reset_tokenizer(self):
         it = iter(self.buf)
-        self.tokenizer = generate_tokens(it.__next__)
+        self.tokenizer = tokenutil.generate_tokens_catch_errors(it.__next__)
 
     def push(self, line):
         self.buf.append(line + '\n')
@@ -295,7 +296,7 @@ def _line_tokens(line):
     readline = StringIO(line).readline
     toktypes = set()
     try:
-        for t in generate_tokens(readline):
+        for t in tokenutil.generate_tokens_catch_errors(readline):
             toktypes.add(t[0])
     except TokenError as e:
         # There are only two cases where a TokenError is raised.
diff --git a/IPython/core/inputtransformer2.py b/IPython/core/inputtransformer2.py
@@ -13,10 +13,13 @@
 import ast
 from codeop import CommandCompiler, Compile
 import re
+import sys
 import tokenize
 from typing import List, Tuple, Optional, Any
 import warnings
 
+from IPython.utils import tokenutil
+
 _indent_re = re.compile(r'^[ \t]+')
 
 def leading_empty_lines(lines):
@@ -269,9 +272,7 @@ def transform(self, lines: List[str]):
 class SystemAssign(TokenTransformBase):
     """Transformer for assignments from system commands (a = !foo)"""
     @classmethod
-    def find(cls, tokens_by_line):
-        """Find the first system assignment (a = !foo) in the cell.
-        """
+    def find_pre_312(cls, tokens_by_line):
         for line in tokens_by_line:
             assign_ix = _find_assign_op(line)
             if (assign_ix is not None) \
@@ -287,6 +288,25 @@ def find(cls, tokens_by_line):
                         break
                     ix += 1
 
+    @classmethod
+    def find_post_312(cls, tokens_by_line):
+        """Return ``cls`` for the first line whose assignment op is followed by an OP ``!``."""
+        for toks in tokens_by_line:
+            eq_ix = _find_assign_op(toks)
+            if eq_ix is None or toks[eq_ix].line.strip().startswith('='):
+                continue
+            bang = toks[eq_ix + 1] if len(toks) >= eq_ix + 2 else None
+            if bang is not None and bang.type == tokenize.OP and bang.string == '!':
+                return cls(bang.start)
+
+    @classmethod
+    def find(cls, tokens_by_line):
+        """Find the first system assignment (a = !foo) in the cell.
+
+        Uses ``find_pre_312`` before Python 3.12 and ``find_post_312`` from 3.12 on."""
+        finder = cls.find_post_312 if sys.version_info >= (3, 12) else cls.find_pre_312
+        return finder(tokens_by_line)
+
     def transform(self, lines: List[str]):
         """Transform a system assignment found by the ``find()`` classmethod.
         """
@@ -511,7 +531,8 @@ def make_tokens_by_line(lines:List[str]):
         )
     parenlev = 0
     try:
-        for token in tokenize.generate_tokens(iter(lines).__next__):
+        for token in tokenutil.generate_tokens_catch_errors(iter(lines).__next__,
+                                                            extra_errors_to_catch=['expected EOF']):
             tokens_by_line[-1].append(token)
             if (token.type == NEWLINE) \
                     or ((token.type == NL) and (parenlev <= 0)):
@@ -677,9 +698,13 @@ def check_complete(self, cell: str):
         if not lines:
             return 'complete', None
 
-        if lines[-1].endswith('\\'):
-            # Explicit backslash continuation
-            return 'incomplete', find_last_indent(lines)
+        for line in reversed(lines):
+            if not line.strip():
+                continue
+            elif line.strip('\n').endswith('\\'):
+                return 'incomplete', find_last_indent(lines)
+            else:
+                break
 
         try:
             for transform in self.cleanup_transforms:
@@ -717,7 +742,8 @@ def check_complete(self, cell: str):
         if not tokens_by_line:
             return 'incomplete', find_last_indent(lines)
 
-        if tokens_by_line[-1][-1].type != tokenize.ENDMARKER:
+        if (tokens_by_line[-1][-1].type != tokenize.ENDMARKER
+                and tokens_by_line[-1][-1].type != tokenize.ERRORTOKEN):
             # We're in a multiline string or expression
             return 'incomplete', find_last_indent(lines)
 
diff --git a/IPython/core/tests/test_inputtransformer2.py b/IPython/core/tests/test_inputtransformer2.py
@@ -297,14 +297,18 @@ def __init__(self, s):
         _find_assign_op([Tk(s) for s in ("", "(", "a", "=", "b", ")", "=", "5")]) == 6
     )
 
-
+extra_closing_paren_param = (  # expected status for an unmatched ")" differs from 3.12 on
+    pytest.param("(\n))", "invalid", None)
+    if sys.version_info >= (3, 12)
+    else pytest.param("(\n))", "incomplete", 0)
+)
 examples = [
     pytest.param("a = 1", "complete", None),
     pytest.param("for a in range(5):", "incomplete", 4),
     pytest.param("for a in range(5):\n    if a > 0:", "incomplete", 8),
     pytest.param("raise = 2", "invalid", None),
     pytest.param("a = [1,\n2,", "incomplete", 0),
-    pytest.param("(\n))", "incomplete", 0),
+    extra_closing_paren_param,
     pytest.param("\\\r\n", "incomplete", 0),
     pytest.param("a = '''\n   hi", "incomplete", 3),
     pytest.param("def a():\n x=1\n global x", "invalid", None),
diff --git a/IPython/utils/tests/test_pycolorize.py b/IPython/utils/tests/test_pycolorize.py
@@ -18,6 +18,7 @@
 #-----------------------------------------------------------------------------
 
 # our own
+import sys
 from IPython.utils.PyColorize import Parser
 import io
 import pytest
@@ -40,7 +41,7 @@ def function(arg, *args, kwarg=True, **kwargs):
     False == None
 
     with io.open(ru'unicode', encoding='utf-8'):
-        raise ValueError("\n escape \r sequence")
+        raise ValueError("escape \r sequence")
 
     print("wěird ünicoðe")
 
@@ -64,6 +65,6 @@ def test_parse_sample(style):
 
 def test_parse_error(style):
     p = Parser(style=style)
-    f1 = p.format(")", "str")
+    f1 = p.format(r"\ " if sys.version_info >= (3, 12) else ")", "str")
     if style != "NoColor":
         assert "ERROR" in f1
diff --git a/IPython/utils/tokenutil.py b/IPython/utils/tokenutil.py
@@ -21,6 +21,31 @@ def generate_tokens(readline):
         # catch EOF error
         return
 
+def generate_tokens_catch_errors(readline, extra_errors_to_catch=None):
+    """Tokenize like ``tokenize.generate_tokens``, ending with an ERRORTOKEN on known errors.
+
+    If a ``TokenError`` message contains a default substring or one from
+    ``extra_errors_to_catch`` (list of str or None), a final empty ERRORTOKEN
+    is yielded instead; any other ``TokenError`` (e.g. EOF) propagates.
+    """
+    errors_to_catch = ['unterminated string literal', 'invalid non-printable character',
+                       'after line continuation character']
+    assert extra_errors_to_catch is None or isinstance(extra_errors_to_catch, list)
+    errors_to_catch += extra_errors_to_catch or []
+    last = None  # only the most recent token is needed to place the ERRORTOKEN
+    try:
+        for last in tokenize.generate_tokens(readline):
+            yield last
+    except tokenize.TokenError as exc:
+        if not any(error in exc.args[0] for error in errors_to_catch):
+            raise  # not a recoverable error: let the caller handle it
+        if last is None:
+            start, line = (1, 0), ''
+        else:
+            # NOTE(review): second item is the end *row*, not a column -- confirm intent
+            start, line = (last.start[0], last.end[0]), last.line
+        yield tokenize.TokenInfo(tokenize.ERRORTOKEN, '', start, start, line)
+
 def line_at_cursor(cell, cursor_pos=0):
     """Return the line in a cell at a given cursor position
 
@@ -123,5 +148,3 @@ def token_at_cursor(cell, cursor_pos=0):
         return names[-1]
     else:
         return ''
-    
-