diff --git a/.github/workflows/pegen.yml b/.github/workflows/pegen.yml deleted file mode 100644 index f61c44c3fa3af8..00000000000000 --- a/.github/workflows/pegen.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Pegen Tests - -on: - pull_request: - branches: [ pegen ] - -jobs: - pytest: - name: pytest - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Install dependencies - working-directory: ./Tools/peg_generator - run: | - python -m pip install --upgrade pip - python -m pip install -r requirements-test.pip - - name: Test with pytest - working-directory: ./Tools/peg_generator - run: | - python -m pytest - - simple_test: - name: make test - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Install dependencies - working-directory: ./Tools/peg_generator - run: | - python -m pip install --upgrade pip - python -m pip install -r requirements-test.pip - - name: Run make test - working-directory: ./Tools/peg_generator - run: | - make test diff --git a/Lib/test/test_peg_generator/__init__.py b/Lib/test/test_peg_generator/__init__.py new file mode 100644 index 00000000000000..fa855f2104c586 --- /dev/null +++ b/Lib/test/test_peg_generator/__init__.py @@ -0,0 +1,7 @@ +import os + +from test.support import load_package_tests + +# Load all tests in package +def load_tests(*args): + return load_package_tests(os.path.dirname(__file__), *args) diff --git a/Lib/test/test_peg_generator/__main__.py b/Lib/test/test_peg_generator/__main__.py new file mode 100644 index 00000000000000..1fab1fddb57445 --- /dev/null +++ b/Lib/test/test_peg_generator/__main__.py @@ -0,0 +1,4 @@ +import unittest +from . import load_tests + +unittest.main() diff --git a/Lib/test/test_peg_generator/ast_dump.py b/Lib/test/test_peg_generator/ast_dump.py new file mode 100644 index 00000000000000..22d2dde7755971 --- /dev/null +++ b/Lib/test/test_peg_generator/ast_dump.py @@ -0,0 +1,62 @@ +""" +Copy-parse of ast.dump, removing the `isinstance` checks. This is needed, +because testing pegen requires generating a C extension module, which contains +a copy of the symbols defined in Python-ast.c. Thus, the isinstance check would +always fail. We rely on string comparison of the base classes instead. +TODO: Remove the above-described hack. +""" + +def ast_dump(node, annotate_fields=True, include_attributes=False, *, indent=None): + def _format(node, level=0): + if indent is not None: + level += 1 + prefix = '\n' + indent * level + sep = ',\n' + indent * level + else: + prefix = '' + sep = ', ' + if any(cls.__name__ == 'AST' for cls in node.__class__.__mro__): + cls = type(node) + args = [] + allsimple = True + keywords = annotate_fields + for name in node._fields: + try: + value = getattr(node, name) + except AttributeError: + keywords = True + continue + if value is None and getattr(cls, name, ...) is None: + keywords = True + continue + value, simple = _format(value, level) + allsimple = allsimple and simple + if keywords: + args.append('%s=%s' % (name, value)) + else: + args.append(value) + if include_attributes and node._attributes: + for name in node._attributes: + try: + value = getattr(node, name) + except AttributeError: + continue + if value is None and getattr(cls, name, ...) 
is None: + continue + value, simple = _format(value, level) + allsimple = allsimple and simple + args.append('%s=%s' % (name, value)) + if allsimple and len(args) <= 3: + return '%s(%s)' % (node.__class__.__name__, ', '.join(args)), not args + return '%s(%s%s)' % (node.__class__.__name__, prefix, sep.join(args)), False + elif isinstance(node, list): + if not node: + return '[]', True + return '[%s%s]' % (prefix, sep.join(_format(x, level)[0] for x in node)), False + return repr(node), True + + if all(cls.__name__ != 'AST' for cls in node.__class__.__mro__): + raise TypeError('expected AST, got %r' % node.__class__.__name__) + if indent is not None and not isinstance(indent, str): + indent = ' ' * indent + return _format(node)[0] diff --git a/Lib/test/test_peg_generator/test_c_parser.py b/Lib/test/test_peg_generator/test_c_parser.py new file mode 100644 index 00000000000000..5f2d8cfaf6a570 --- /dev/null +++ b/Lib/test/test_peg_generator/test_c_parser.py @@ -0,0 +1,330 @@ +import ast +import traceback +import tempfile +import shutil +import unittest + +from test import test_tools +from test.test_peg_generator.ast_dump import ast_dump +from pathlib import PurePath, Path +from typing import Sequence + +test_tools.skip_if_missing('peg_generator') +with test_tools.imports_under_tool('peg_generator'): + from pegen.grammar_parser import GeneratedParser as GrammarParser + from pegen.testutil import ( + parse_string, + generate_parser_c_extension, + generate_c_parser_source, + ) + + +class TestCParser(unittest.TestCase): + def setUp(self): + self.tmp_path = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.tmp_path) + + def check_input_strings_for_grammar( + self, + source: str, + tmp_path: PurePath, + valid_cases: Sequence[str] = (), + invalid_cases: Sequence[str] = (), + ) -> None: + grammar = parse_string(source, GrammarParser) + extension = generate_parser_c_extension(grammar, Path(tmp_path)) + + if valid_cases: + for case in valid_cases: + extension.parse_string(case, mode=0) + + if invalid_cases: + for case in invalid_cases: + with self.assertRaises(SyntaxError): + extension.parse_string(case, mode=0) + + def verify_ast_generation(self, source: str, stmt: str, tmp_path: PurePath) -> None: + grammar = parse_string(source, GrammarParser) + extension = generate_parser_c_extension(grammar, Path(tmp_path)) + + expected_ast = ast.parse(stmt) + actual_ast = extension.parse_string(stmt, mode=1) + self.assertEqual(ast_dump(expected_ast), ast_dump(actual_ast)) + + def test_c_parser(self) -> None: + grammar_source = """ + start[mod_ty]: a=stmt* $ { Module(a, NULL, p->arena) } + stmt[stmt_ty]: a=expr_stmt { a } + expr_stmt[stmt_ty]: a=expression NEWLINE { _Py_Expr(a, EXTRA) } + expression[expr_ty]: ( l=expression '+' r=term { _Py_BinOp(l, Add, r, EXTRA) } + | l=expression '-' r=term { _Py_BinOp(l, Sub, r, EXTRA) } + | t=term { t } + ) + term[expr_ty]: ( l=term '*' r=factor { _Py_BinOp(l, Mult, r, EXTRA) } + | l=term '/' r=factor { _Py_BinOp(l, Div, r, EXTRA) } + | f=factor { f } + ) + factor[expr_ty]: ('(' e=expression ')' { e } + | a=atom { a } + ) + atom[expr_ty]: ( n=NAME { n } + | n=NUMBER { n } + | s=STRING { s } + ) + """ + grammar = parse_string(grammar_source, GrammarParser) + extension = generate_parser_c_extension(grammar, Path(self.tmp_path)) + + expressions = [ + "4+5", + "4-5", + "4*5", + "1+4*5", + "1+4/5", + "(1+1) + (1+1)", + "(1+1) - (1+1)", + "(1+1) * (1+1)", + "(1+1) / (1+1)", + ] + + for expr in expressions: + the_ast = extension.parse_string(expr, mode=1) + expected_ast 
= ast.parse(expr) + self.assertEqual(ast_dump(the_ast), ast_dump(expected_ast)) + + def test_lookahead(self) -> None: + grammar = """ + start: NAME &NAME expr NEWLINE? ENDMARKER + expr: NAME | NUMBER + """ + valid_cases = ["foo bar"] + invalid_cases = ["foo 34"] + self.check_input_strings_for_grammar(grammar, self.tmp_path, valid_cases, invalid_cases) + + def test_negative_lookahead(self) -> None: + grammar = """ + start: NAME !NAME expr NEWLINE? ENDMARKER + expr: NAME | NUMBER + """ + valid_cases = ["foo 34"] + invalid_cases = ["foo bar"] + self.check_input_strings_for_grammar(grammar, self.tmp_path, valid_cases, invalid_cases) + + def test_cut(self) -> None: + grammar = """ + start: X ~ Y Z | X Q S + X: 'x' + Y: 'y' + Z: 'z' + Q: 'q' + S: 's' + """ + valid_cases = ["x y z"] + invalid_cases = ["x q s"] + self.check_input_strings_for_grammar(grammar, self.tmp_path, valid_cases, invalid_cases) + + def test_gather(self) -> None: + grammar = """ + start: ';'.pass_stmt+ NEWLINE + pass_stmt: 'pass' + """ + valid_cases = ["pass", "pass; pass"] + invalid_cases = ["pass;", "pass; pass;"] + self.check_input_strings_for_grammar(grammar, self.tmp_path, valid_cases, invalid_cases) + + def test_left_recursion(self) -> None: + grammar = """ + start: expr NEWLINE + expr: ('-' term | expr '+' term | term) + term: NUMBER + """ + valid_cases = ["-34", "34", "34 + 12", "1 + 1 + 2 + 3"] + self.check_input_strings_for_grammar(grammar, self.tmp_path, valid_cases) + + def test_advanced_left_recursive(self) -> None: + grammar = """ + start: NUMBER | sign start + sign: ['-'] + """ + valid_cases = ["23", "-34"] + self.check_input_strings_for_grammar(grammar, self.tmp_path, valid_cases) + + def test_mutually_left_recursive(self) -> None: + grammar = """ + start: foo 'E' + foo: bar 'A' | 'B' + bar: foo 'C' | 'D' + """ + valid_cases = ["B E", "D A C A E"] + self.check_input_strings_for_grammar(grammar, self.tmp_path, valid_cases) + + def test_nasty_mutually_left_recursive(self) -> None: + grammar = """ + start: target '=' + target: maybe '+' | NAME + maybe: maybe '-' | target + """ + valid_cases = ["x ="] + invalid_cases = ["x - + ="] + self.check_input_strings_for_grammar(grammar, self.tmp_path, valid_cases, invalid_cases) + + def test_return_stmt_noexpr_action(self) -> None: + grammar = """ + start[mod_ty]: a=[statements] ENDMARKER { Module(a, NULL, p->arena) } + statements[asdl_seq*]: a=statement+ { a } + statement[stmt_ty]: simple_stmt + simple_stmt[stmt_ty]: small_stmt + small_stmt[stmt_ty]: return_stmt + return_stmt[stmt_ty]: a='return' NEWLINE { _Py_Return(NULL, EXTRA) } + """ + stmt = "return" + self.verify_ast_generation(grammar, stmt, self.tmp_path) + + def test_gather_action_ast(self) -> None: + grammar = """ + start[mod_ty]: a=';'.pass_stmt+ NEWLINE ENDMARKER { Module(a, NULL, p->arena) } + pass_stmt[stmt_ty]: a='pass' { _Py_Pass(EXTRA)} + """ + stmt = "pass; pass" + self.verify_ast_generation(grammar, stmt, self.tmp_path) + + def test_pass_stmt_action(self) -> None: + grammar = """ + start[mod_ty]: a=[statements] ENDMARKER { Module(a, NULL, p->arena) } + statements[asdl_seq*]: a=statement+ { a } + statement[stmt_ty]: simple_stmt + simple_stmt[stmt_ty]: small_stmt + small_stmt[stmt_ty]: pass_stmt + pass_stmt[stmt_ty]: a='pass' NEWLINE { _Py_Pass(EXTRA) } + """ + stmt = "pass" + self.verify_ast_generation(grammar, stmt, self.tmp_path) + + def test_if_stmt_action(self) -> None: + grammar = """ + start[mod_ty]: a=[statements] ENDMARKER { Module(a, NULL, p->arena) } + statements[asdl_seq*]: a=statement+ { 
seq_flatten(p, a) } + statement[asdl_seq*]: a=compound_stmt { singleton_seq(p, a) } | simple_stmt + + simple_stmt[asdl_seq*]: a=small_stmt b=further_small_stmt* [';'] NEWLINE { seq_insert_in_front(p, a, b) } + further_small_stmt[stmt_ty]: ';' a=small_stmt { a } + + block: simple_stmt | NEWLINE INDENT a=statements DEDENT { a } + + compound_stmt: if_stmt + + if_stmt: 'if' a=full_expression ':' b=block { _Py_If(a, b, NULL, EXTRA) } + + small_stmt[stmt_ty]: pass_stmt + + pass_stmt[stmt_ty]: a='pass' { _Py_Pass(EXTRA) } + + full_expression: NAME + """ + stmt = "pass" + self.verify_ast_generation(grammar, stmt, self.tmp_path) + + def test_same_name_different_types(self) -> None: + source = """ + start[mod_ty]: a=import_from+ NEWLINE ENDMARKER { Module(a, NULL, p->arena)} + import_from[stmt_ty]: ( a='from' !'import' c=simple_name 'import' d=import_as_names_from { + _Py_ImportFrom(c->v.Name.id, d, 0, EXTRA) } + | a='from' '.' 'import' c=import_as_names_from { + _Py_ImportFrom(NULL, c, 1, EXTRA) } + ) + simple_name[expr_ty]: NAME + import_as_names_from[asdl_seq*]: a=','.import_as_name_from+ { a } + import_as_name_from[alias_ty]: a=NAME 'as' b=NAME { _Py_alias(((expr_ty) a)->v.Name.id, ((expr_ty) b)->v.Name.id, p->arena) } + """ + grammar = parse_string(source, GrammarParser) + extension = generate_parser_c_extension(grammar, Path(self.tmp_path)) + + for stmt in ("from a import b as c", "from . import a as b"): + expected_ast = ast.parse(stmt) + actual_ast = extension.parse_string(stmt, mode=1) + self.assertEqual(ast_dump(expected_ast), ast_dump(actual_ast)) + + def test_with_stmt_with_paren(self) -> None: + grammar_source = """ + start[mod_ty]: a=[statements] ENDMARKER { Module(a, NULL, p->arena) } + statements[asdl_seq*]: a=statement+ { seq_flatten(p, a) } + statement[asdl_seq*]: a=compound_stmt { singleton_seq(p, a) } + compound_stmt[stmt_ty]: with_stmt + with_stmt[stmt_ty]: ( + a='with' '(' b=','.with_item+ ')' ':' c=block { + _Py_With(b, singleton_seq(p, c), NULL, EXTRA) } + ) + with_item[withitem_ty]: ( + e=NAME o=['as' t=NAME { t }] { _Py_withitem(e, set_expr_context(p, o, Store), p->arena) } + ) + block[stmt_ty]: a=pass_stmt NEWLINE { a } | NEWLINE INDENT a=pass_stmt DEDENT { a } + pass_stmt[stmt_ty]: a='pass' { _Py_Pass(EXTRA) } + """ + stmt = "with (\n a as b,\n c as d\n): pass" + grammar = parse_string(grammar_source, GrammarParser) + extension = generate_parser_c_extension(grammar, Path(self.tmp_path)) + the_ast = extension.parse_string(stmt, mode=1) + self.assertTrue(ast_dump(the_ast).startswith( + "Module(body=[With(items=[withitem(context_expr=Name(id='a', ctx=Load()), optional_vars=Name(id='b', ctx=Store())), " + "withitem(context_expr=Name(id='c', ctx=Load()), optional_vars=Name(id='d', ctx=Store()))]" + )) + + def test_ternary_operator(self) -> None: + grammar_source = """ + start[mod_ty]: a=expr ENDMARKER { Module(a, NULL, p->arena) } + expr[asdl_seq*]: a=listcomp NEWLINE { singleton_seq(p, _Py_Expr(a, EXTRA)) } + listcomp[expr_ty]: ( + a='[' b=NAME c=for_if_clauses d=']' { _Py_ListComp(b, c, EXTRA) } + ) + for_if_clauses[asdl_seq*]: ( + a=(y=[ASYNC] 'for' a=NAME 'in' b=NAME c=('if' z=NAME { z })* + { _Py_comprehension(_Py_Name(((expr_ty) a)->v.Name.id, Store, EXTRA), b, c, (y == NULL) ? 0 : 1, p->arena) })+ { a } + ) + """ + stmt = "[i for i in a if b]" + self.verify_ast_generation(grammar_source, stmt, self.tmp_path) + + def test_syntax_error_for_string(self) -> None: + grammar_source = """ + start: expr+ NEWLINE? 
ENDMARKER + expr: NAME + """ + grammar = parse_string(grammar_source, GrammarParser) + print(list(Path(self.tmp_path).iterdir())) + extension = generate_parser_c_extension(grammar, Path(self.tmp_path)) + for text in ("a b 42 b a", "名 名 42 名 名"): + try: + extension.parse_string(text, mode=0) + except SyntaxError as e: + tb = traceback.format_exc() + self.assertTrue('File "<string>", line 1' in tb) + self.assertTrue(f"{text}\n ^" in tb) + + def test_headers_and_trailer(self) -> None: + grammar_source = """ + @header 'SOME HEADER' + @subheader 'SOME SUBHEADER' + @trailer 'SOME TRAILER' + start: expr+ NEWLINE? ENDMARKER + expr: x=NAME + """ + grammar = parse_string(grammar_source, GrammarParser) + parser_source = generate_c_parser_source(grammar) + + self.assertTrue("SOME HEADER" in parser_source) + self.assertTrue("SOME SUBHEADER" in parser_source) + self.assertTrue("SOME TRAILER" in parser_source) + + + def test_error_in_rules(self) -> None: + grammar_source = """ + start: expr+ NEWLINE? ENDMARKER + expr: NAME {PyTuple_New(-1)} + """ + grammar = parse_string(grammar_source, GrammarParser) + extension = generate_parser_c_extension(grammar, Path(self.tmp_path)) + # PyTuple_New raises SystemError if an invalid argument was passed. + with self.assertRaises(SystemError): + extension.parse_string("a", mode=0) diff --git a/Lib/test/test_peg_generator/test_first_sets.py b/Lib/test/test_peg_generator/test_first_sets.py new file mode 100644 index 00000000000000..bb7c9deae5906d --- /dev/null +++ b/Lib/test/test_peg_generator/test_first_sets.py @@ -0,0 +1,225 @@ +import unittest + +from test import test_tools +from typing import Dict, Set + +test_tools.skip_if_missing('peg_generator') +with test_tools.imports_under_tool('peg_generator'): + from pegen.grammar_parser import GeneratedParser as GrammarParser + from pegen.testutil import parse_string + from pegen.first_sets import FirstSetCalculator + from pegen.grammar import Grammar + + +class TestFirstSets(unittest.TestCase): + def calculate_first_sets(self, grammar_source: str) -> Dict[str, Set[str]]: + grammar: Grammar = parse_string(grammar_source, GrammarParser) + return FirstSetCalculator(grammar.rules).calculate() + + def test_alternatives(self) -> None: + grammar = """ + start: expr NEWLINE? ENDMARKER + expr: A | B + A: 'a' | '-' + B: 'b' | '+' + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "A": {"'a'", "'-'"}, + "B": {"'+'", "'b'"}, + "expr": {"'+'", "'a'", "'b'", "'-'"}, + "start": {"'+'", "'a'", "'b'", "'-'"}, + }) + + def test_optionals(self) -> None: + grammar = """ + start: expr NEWLINE + expr: ['a'] ['b'] 'c' + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "expr": {"'c'", "'a'", "'b'"}, + "start": {"'c'", "'a'", "'b'"}, + }) + + def test_repeat_with_separator(self) -> None: + grammar = """ + start: ','.thing+ NEWLINE + thing: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), {"thing": {"NUMBER"}, "start": {"NUMBER"}}) + + def test_optional_operator(self) -> None: + grammar = """ + start: sum NEWLINE + sum: (term)? 'b' + term: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "term": {"NUMBER"}, + "sum": {"NUMBER", "'b'"}, + "start": {"'b'", "NUMBER"}, + }) + + def test_optional_literal(self) -> None: + grammar = """ + start: sum NEWLINE + sum: '+' ?
term + term: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "term": {"NUMBER"}, + "sum": {"'+'", "NUMBER"}, + "start": {"'+'", "NUMBER"}, + }) + + def test_optional_after(self) -> None: + grammar = """ + start: term NEWLINE + term: NUMBER ['+'] + """ + self.assertEqual(self.calculate_first_sets(grammar), {"term": {"NUMBER"}, "start": {"NUMBER"}}) + + def test_optional_before(self) -> None: + grammar = """ + start: term NEWLINE + term: ['+'] NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), {"term": {"NUMBER", "'+'"}, "start": {"NUMBER", "'+'"}}) + + def test_repeat_0(self) -> None: + grammar = """ + start: thing* "+" NEWLINE + thing: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), {"thing": {"NUMBER"}, "start": {'"+"', "NUMBER"}}) + + def test_repeat_0_with_group(self) -> None: + grammar = """ + start: ('+' '-')* term NEWLINE + term: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), {"term": {"NUMBER"}, "start": {"'+'", "NUMBER"}}) + + def test_repeat_1(self) -> None: + grammar = """ + start: thing+ '-' NEWLINE + thing: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), {"thing": {"NUMBER"}, "start": {"NUMBER"}}) + + def test_repeat_1_with_group(self) -> None: + grammar = """ + start: ('+' term)+ term NEWLINE + term: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), {"term": {"NUMBER"}, "start": {"'+'"}}) + + def test_gather(self) -> None: + grammar = """ + start: ','.thing+ NEWLINE + thing: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), {"thing": {"NUMBER"}, "start": {"NUMBER"}}) + + def test_positive_lookahead(self) -> None: + grammar = """ + start: expr NEWLINE + expr: &'a' opt + opt: 'a' | 'b' | 'c' + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "expr": {"'a'"}, + "start": {"'a'"}, + "opt": {"'b'", "'c'", "'a'"}, + }) + + def test_negative_lookahead(self) -> None: + grammar = """ + start: expr NEWLINE + expr: !'a' opt + opt: 'a' | 'b' | 'c' + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "opt": {"'b'", "'a'", "'c'"}, + "expr": {"'b'", "'c'"}, + "start": {"'b'", "'c'"}, + }) + + def test_left_recursion(self) -> None: + grammar = """ + start: expr NEWLINE + expr: ('-' term | expr '+' term | term) + term: NUMBER + foo: 'foo' + bar: 'bar' + baz: 'baz' + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "expr": {"NUMBER", "'-'"}, + "term": {"NUMBER"}, + "start": {"NUMBER", "'-'"}, + "foo": {"'foo'"}, + "bar": {"'bar'"}, + "baz": {"'baz'"}, + }) + + def test_advance_left_recursion(self) -> None: + grammar = """ + start: NUMBER | sign start + sign: ['-'] + """ + self.assertEqual(self.calculate_first_sets(grammar), {"sign": {"'-'", ""}, "start": {"'-'", "NUMBER"}}) + + def test_mutual_left_recursion(self) -> None: + grammar = """ + start: foo 'E' + foo: bar 'A' | 'B' + bar: foo 'C' | 'D' + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "foo": {"'D'", "'B'"}, + "bar": {"'D'"}, + "start": {"'D'", "'B'"}, + }) + + def test_nasty_left_recursion(self) -> None: + # TODO: Validate this + grammar = """ + start: target '=' + target: maybe '+' | NAME + maybe: maybe '-' | target + """ + self.assertEqual(self.calculate_first_sets(grammar), {"maybe": set(), "target": {"NAME"}, "start": {"NAME"}}) + + def test_nullable_rule(self) -> None: + grammar = """ + start: sign thing $ + sign: ['-'] + thing: NUMBER + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "sign": {"", "'-'"}, + "thing": 
{"NUMBER"}, + "start": {"NUMBER", "'-'"}, + }) + + def test_epsilon_production_in_start_rule(self) -> None: + grammar = """ + start: ['-'] $ + """ + self.assertEqual(self.calculate_first_sets(grammar), {"start": {"ENDMARKER", "'-'"}}) + + def test_multiple_nullable_rules(self) -> None: + grammar = """ + start: sign thing other another $ + sign: ['-'] + thing: ['+'] + other: '*' + another: '/' + """ + self.assertEqual(self.calculate_first_sets(grammar), { + "sign": {"", "'-'"}, + "thing": {"'+'", ""}, + "start": {"'+'", "'-'", "'*'"}, + "other": {"'*'"}, + "another": {"'/'"}, + }) diff --git a/Lib/test/test_peg_generator/test_pegen.py b/Lib/test/test_peg_generator/test_pegen.py new file mode 100644 index 00000000000000..582743ca173a14 --- /dev/null +++ b/Lib/test/test_peg_generator/test_pegen.py @@ -0,0 +1,729 @@ +import io +import textwrap +import unittest + +from test import test_tools +from typing import Dict, Any +from tokenize import TokenInfo, NAME, NEWLINE, NUMBER, OP + +test_tools.skip_if_missing('peg_generator') +with test_tools.imports_under_tool('peg_generator'): + from pegen.grammar_parser import GeneratedParser as GrammarParser + from pegen.testutil import ( + parse_string, + generate_parser, + make_parser + ) + from pegen.grammar import GrammarVisitor, GrammarError, Grammar + from pegen.grammar_visualizer import ASTGrammarPrinter + from pegen.parser import Parser + from pegen.python_generator import PythonParserGenerator + + +class TestPegen(unittest.TestCase): + def test_parse_grammar(self) -> None: + grammar_source = """ + start: sum NEWLINE + sum: t1=term '+' t2=term { action } | term + term: NUMBER + """ + expected = """ + start: sum NEWLINE + sum: term '+' term | term + term: NUMBER + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + rules = grammar.rules + self.assertEqual(str(grammar), textwrap.dedent(expected).strip()) + # Check the str() and repr() of a few rules; AST nodes don't support ==. + self.assertEqual(str(rules["start"]), "start: sum NEWLINE") + self.assertEqual(str(rules["sum"]), "sum: term '+' term | term") + expected_repr = "Rule('term', None, Rhs([Alt([NamedItem(None, NameLeaf('NUMBER'))])]))" + self.assertEqual(repr(rules["term"]), expected_repr) + + def test_long_rule_str(self) -> None: + grammar_source = """ + start: zero | one | one zero | one one | one zero zero | one zero one | one one zero | one one one + """ + expected = """ + start: + | zero + | one + | one zero + | one one + | one zero zero + | one zero one + | one one zero + | one one one + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + self.assertEqual(str(grammar.rules["start"]), textwrap.dedent(expected).strip()) + + def test_typed_rules(self) -> None: + grammar = """ + start[int]: sum NEWLINE + sum[int]: t1=term '+' t2=term { action } | term + term[int]: NUMBER + """ + rules = parse_string(grammar, GrammarParser).rules + # Check the str() and repr() of a few rules; AST nodes don't support ==. 
+ self.assertEqual(str(rules["start"]), "start: sum NEWLINE") + self.assertEqual(str(rules["sum"]), "sum: term '+' term | term") + self.assertEqual( + repr(rules["term"]), + "Rule('term', 'int', Rhs([Alt([NamedItem(None, NameLeaf('NUMBER'))])]))" + ) + + def test_repeat_with_separator_rules(self) -> None: + grammar = """ + start: ','.thing+ NEWLINE + thing: NUMBER + """ + rules = parse_string(grammar, GrammarParser).rules + self.assertEqual(str(rules["start"]), "start: ','.thing+ NEWLINE") + print(repr(rules["start"])) + self.assertTrue(repr(rules["start"]).startswith( + "Rule('start', None, Rhs([Alt([NamedItem(None, Gather(StringLeaf(\"','\"), NameLeaf('thing'" + )) + self.assertEqual(str(rules["thing"]), "thing: NUMBER") + + def test_expr_grammar(self) -> None: + grammar = """ + start: sum NEWLINE + sum: term '+' term | term + term: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("42\n", parser_class) + self.assertEqual(node, [ + [[TokenInfo(NUMBER, string="42", start=(1, 0), end=(1, 2), line="42\n")]], + TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line="42\n"), + ]) + + def test_optional_operator(self) -> None: + grammar = """ + start: sum NEWLINE + sum: term ('+' term)? + term: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("1+2\n", parser_class) + self.assertEqual(node, [ + [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1+2\n")], + [ + TokenInfo(OP, string="+", start=(1, 1), end=(1, 2), line="1+2\n"), + [TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line="1+2\n")], + ], + ], + TokenInfo(NEWLINE, string="\n", start=(1, 3), end=(1, 4), line="1+2\n"), + ]) + node = parse_string("1\n", parser_class) + self.assertEqual(node, [ + [[TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1\n")], None], + TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line="1\n"), + ]) + + def test_optional_literal(self) -> None: + grammar = """ + start: sum NEWLINE + sum: term '+' ? 
+ term: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("1+\n", parser_class) + self.assertEqual(node, [ + [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1+\n")], + TokenInfo(OP, string="+", start=(1, 1), end=(1, 2), line="1+\n"), + ], + TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line="1+\n"), + ]) + node = parse_string("1\n", parser_class) + self.assertEqual(node, [ + [[TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1\n")], None], + TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line="1\n"), + ]) + + def test_alt_optional_operator(self) -> None: + grammar = """ + start: sum NEWLINE + sum: term ['+' term] + term: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("1 + 2\n", parser_class) + self.assertEqual(node, [ + [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 + 2\n")], + [ + TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line="1 + 2\n"), + [TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line="1 + 2\n")], + ], + ], + TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line="1 + 2\n"), + ]) + node = parse_string("1\n", parser_class) + self.assertEqual(node, [ + [[TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1\n")], None], + TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line="1\n"), + ]) + + def test_repeat_0_simple(self) -> None: + grammar = """ + start: thing thing* NEWLINE + thing: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("1 2 3\n", parser_class) + self.assertEqual(node, [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 2 3\n")], + [ + [[TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line="1 2 3\n")]], + [[TokenInfo(NUMBER, string="3", start=(1, 4), end=(1, 5), line="1 2 3\n")]], + ], + TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line="1 2 3\n"), + ]) + node = parse_string("1\n", parser_class) + self.assertEqual(node, [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1\n")], + [], + TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line="1\n"), + ]) + + def test_repeat_0_complex(self) -> None: + grammar = """ + start: term ('+' term)* NEWLINE + term: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("1 + 2 + 3\n", parser_class) + self.assertEqual(node, [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 + 2 + 3\n")], + [ + [ + [ + TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line="1 + 2 + 3\n"), + [TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line="1 + 2 + 3\n")], + ] + ], + [ + [ + TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line="1 + 2 + 3\n"), + [TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line="1 + 2 + 3\n")], + ] + ], + ], + TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line="1 + 2 + 3\n"), + ]) + + def test_repeat_1_simple(self) -> None: + grammar = """ + start: thing thing+ NEWLINE + thing: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("1 2 3\n", parser_class) + self.assertEqual(node, [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 2 3\n")], + [ + [[TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line="1 2 3\n")]], + [[TokenInfo(NUMBER, string="3", start=(1, 4), end=(1, 5), line="1 2 3\n")]], + ], + TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line="1 2 3\n"), + ]) + with self.assertRaises(SyntaxError): + parse_string("1\n", parser_class) + + def 
test_repeat_1_complex(self) -> None: + grammar = """ + start: term ('+' term)+ NEWLINE + term: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("1 + 2 + 3\n", parser_class) + self.assertEqual(node, [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 + 2 + 3\n")], + [ + [ + [ + TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line="1 + 2 + 3\n"), + [TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line="1 + 2 + 3\n")], + ] + ], + [ + [ + TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line="1 + 2 + 3\n"), + [TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line="1 + 2 + 3\n")], + ] + ], + ], + TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line="1 + 2 + 3\n"), + ]) + with self.assertRaises(SyntaxError): + parse_string("1\n", parser_class) + + def test_repeat_with_sep_simple(self) -> None: + grammar = """ + start: ','.thing+ NEWLINE + thing: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("1, 2, 3\n", parser_class) + self.assertEqual(node, [ + [ + [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1, 2, 3\n")], + [TokenInfo(NUMBER, string="2", start=(1, 3), end=(1, 4), line="1, 2, 3\n")], + [TokenInfo(NUMBER, string="3", start=(1, 6), end=(1, 7), line="1, 2, 3\n")], + ], + TokenInfo(NEWLINE, string="\n", start=(1, 7), end=(1, 8), line="1, 2, 3\n"), + ]) + + def test_left_recursive(self) -> None: + grammar_source = """ + start: expr NEWLINE + expr: ('-' term | expr '+' term | term) + term: NUMBER + foo: NAME+ + bar: NAME* + baz: NAME? + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + parser_class = generate_parser(grammar) + rules = grammar.rules + self.assertFalse(rules["start"].left_recursive) + self.assertTrue(rules["expr"].left_recursive) + self.assertFalse(rules["term"].left_recursive) + self.assertFalse(rules["foo"].left_recursive) + self.assertFalse(rules["bar"].left_recursive) + self.assertFalse(rules["baz"].left_recursive) + node = parse_string("1 + 2 + 3\n", parser_class) + self.assertEqual(node, [ + [ + [ + [[TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 + 2 + 3\n")]], + TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line="1 + 2 + 3\n"), + [TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line="1 + 2 + 3\n")], + ], + TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line="1 + 2 + 3\n"), + [TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line="1 + 2 + 3\n")], + ], + TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line="1 + 2 + 3\n"), + ]) + + def test_python_expr(self) -> None: + grammar = """ + start: expr NEWLINE? 
$ { ast.Expression(expr, lineno=1, col_offset=0) } + expr: ( expr '+' term { ast.BinOp(expr, ast.Add(), term, lineno=expr.lineno, col_offset=expr.col_offset, end_lineno=term.end_lineno, end_col_offset=term.end_col_offset) } + | expr '-' term { ast.BinOp(expr, ast.Sub(), term, lineno=expr.lineno, col_offset=expr.col_offset, end_lineno=term.end_lineno, end_col_offset=term.end_col_offset) } + | term { term } + ) + term: ( l=term '*' r=factor { ast.BinOp(l, ast.Mult(), r, lineno=l.lineno, col_offset=l.col_offset, end_lineno=r.end_lineno, end_col_offset=r.end_col_offset) } + | l=term '/' r=factor { ast.BinOp(l, ast.Div(), r, lineno=l.lineno, col_offset=l.col_offset, end_lineno=r.end_lineno, end_col_offset=r.end_col_offset) } + | factor { factor } + ) + factor: ( '(' expr ')' { expr } + | atom { atom } + ) + atom: ( n=NAME { ast.Name(id=n.string, ctx=ast.Load(), lineno=n.start[0], col_offset=n.start[1], end_lineno=n.end[0], end_col_offset=n.end[1]) } + | n=NUMBER { ast.Constant(value=ast.literal_eval(n.string), lineno=n.start[0], col_offset=n.start[1], end_lineno=n.end[0], end_col_offset=n.end[1]) } + ) + """ + parser_class = make_parser(grammar) + node = parse_string("(1 + 2*3 + 5)/(6 - 2)\n", parser_class) + code = compile(node, "", "eval") + val = eval(code) + self.assertEqual(val, 3.0) + + def test_nullable(self) -> None: + grammar_source = """ + start: sign NUMBER + sign: ['-' | '+'] + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + out = io.StringIO() + genr = PythonParserGenerator(grammar, out) + rules = grammar.rules + self.assertFalse(rules["start"].nullable) # Not None! + self.assertTrue(rules["sign"].nullable) + + def test_advanced_left_recursive(self) -> None: + grammar_source = """ + start: NUMBER | sign start + sign: ['-'] + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + out = io.StringIO() + genr = PythonParserGenerator(grammar, out) + rules = grammar.rules + self.assertFalse(rules["start"].nullable) # Not None! 
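Note the pattern in this test and the ones that follow: genr is constructed and then never called before the rule flags are read, which suggests that PythonParserGenerator computes per-rule metadata (nullable, left_recursive) as a side effect of construction. A short sketch of that pattern, under the same assumption that pegen is importable:

    # Sketch: rule metadata is readable right after constructing the
    # generator (an inference from these tests, not a documented API).
    import io
    from pegen.grammar_parser import GeneratedParser as GrammarParser
    from pegen.python_generator import PythonParserGenerator
    from pegen.testutil import parse_string

    grammar = parse_string("start: sign NUMBER\nsign: ['-' | '+']\n", GrammarParser)
    PythonParserGenerator(grammar, io.StringIO())
    print(grammar.rules["sign"].nullable)   # True: sign can match the empty string
    print(grammar.rules["start"].nullable)  # False: NUMBER is required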
+ self.assertTrue(rules["sign"].nullable) + self.assertTrue(rules["start"].left_recursive) + self.assertFalse(rules["sign"].left_recursive) + + def test_mutually_left_recursive(self) -> None: + grammar_source = """ + start: foo 'E' + foo: bar 'A' | 'B' + bar: foo 'C' | 'D' + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + out = io.StringIO() + genr = PythonParserGenerator(grammar, out) + rules = grammar.rules + self.assertFalse(rules["start"].left_recursive) + self.assertTrue(rules["foo"].left_recursive) + self.assertTrue(rules["bar"].left_recursive) + genr.generate("") + ns: Dict[str, Any] = {} + exec(out.getvalue(), ns) + parser_class: Type[Parser] = ns["GeneratedParser"] + node = parse_string("D A C A E", parser_class) + self.assertEqual(node, [ + [ + [ + [ + [TokenInfo(type=NAME, string="D", start=(1, 0), end=(1, 1), line="D A C A E")], + TokenInfo(type=NAME, string="A", start=(1, 2), end=(1, 3), line="D A C A E"), + ], + TokenInfo(type=NAME, string="C", start=(1, 4), end=(1, 5), line="D A C A E"), + ], + TokenInfo(type=NAME, string="A", start=(1, 6), end=(1, 7), line="D A C A E"), + ], + TokenInfo(type=NAME, string="E", start=(1, 8), end=(1, 9), line="D A C A E"), + ]) + node = parse_string("B C A E", parser_class) + self.assertIsNotNone(node) + self.assertEqual(node, [ + [ + [ + [TokenInfo(type=NAME, string="B", start=(1, 0), end=(1, 1), line="B C A E")], + TokenInfo(type=NAME, string="C", start=(1, 2), end=(1, 3), line="B C A E"), + ], + TokenInfo(type=NAME, string="A", start=(1, 4), end=(1, 5), line="B C A E"), + ], + TokenInfo(type=NAME, string="E", start=(1, 6), end=(1, 7), line="B C A E"), + ]) + + def test_nasty_mutually_left_recursive(self) -> None: + # This grammar does not recognize 'x - + =', much to my chagrin. + # But that's the way PEG works. + # [Breathlessly] + # The problem is that the toplevel target call + # recurses into maybe, which recognizes 'x - +', + # and then the toplevel target looks for another '+', + # which fails, so it retreats to NAME, + # which succeeds, so we end up just recognizing 'x', + # and then start fails because there's no '=' after that. + grammar_source = """ + start: target '=' + target: maybe '+' | NAME + maybe: maybe '-' | target + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + out = io.StringIO() + genr = PythonParserGenerator(grammar, out) + genr.generate("") + ns: Dict[str, Any] = {} + exec(out.getvalue(), ns) + parser_class = ns["GeneratedParser"] + with self.assertRaises(SyntaxError): + parse_string("x - + =", parser_class) + + def test_lookahead(self) -> None: + grammar = """ + start: (expr_stmt | assign_stmt) &'.' + expr_stmt: !(target '=') expr + assign_stmt: target '=' expr + expr: term ('+' term)* + target: NAME + term: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("foo = 12 + 12 .", parser_class) + self.assertEqual(node, [ + [ + [ + [TokenInfo(NAME, string="foo", start=(1, 0), end=(1, 3), line="foo = 12 + 12 .")], + TokenInfo(OP, string="=", start=(1, 4), end=(1, 5), line="foo = 12 + 12 ."), + [ + [ + TokenInfo( + NUMBER, string="12", start=(1, 6), end=(1, 8), line="foo = 12 + 12 ." 
+ ) + ], + [ + [ + [ + TokenInfo( + OP, + string="+", + start=(1, 9), + end=(1, 10), + line="foo = 12 + 12 .", + ), + [ + TokenInfo( + NUMBER, + string="12", + start=(1, 11), + end=(1, 13), + line="foo = 12 + 12 .", + ) + ], + ] + ] + ], + ], + ] + ] + ]) + + def test_named_lookahead_error(self) -> None: + grammar = """ + start: foo=!'x' NAME + """ + with self.assertRaises(SyntaxError): + make_parser(grammar) + + def test_start_leader(self) -> None: + grammar = """ + start: attr | NAME + attr: start '.' NAME + """ + # Would assert False without a special case in compute_left_recursives(). + make_parser(grammar) + + def test_left_recursion_too_complex(self) -> None: + grammar = """ + start: foo + foo: bar '+' | baz '+' | '+' + bar: baz '-' | foo '-' | '-' + baz: foo '*' | bar '*' | '*' + """ + with self.assertRaises(ValueError) as errinfo: + make_parser(grammar) + self.assertTrue("no leader" in str(errinfo.exception.value)) + + def test_cut(self) -> None: + grammar = """ + start: '(' ~ expr ')' + expr: NUMBER + """ + parser_class = make_parser(grammar) + node = parse_string("(1)", parser_class, verbose=True) + self.assertEqual(node, [ + TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line="(1)"), + [TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line="(1)")], + TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line="(1)"), + ]) + + def test_dangling_reference(self) -> None: + grammar = """ + start: foo ENDMARKER + foo: bar NAME + """ + with self.assertRaises(GrammarError): + parser_class = make_parser(grammar) + + def test_bad_token_reference(self) -> None: + grammar = """ + start: foo + foo: NAMEE + """ + with self.assertRaises(GrammarError): + parser_class = make_parser(grammar) + + def test_missing_start(self) -> None: + grammar = """ + foo: NAME + """ + with self.assertRaises(GrammarError): + parser_class = make_parser(grammar) + + +class TestGrammarVisitor: + class Visitor(GrammarVisitor): + def __init__(self) -> None: + self.n_nodes = 0 + + def visit(self, node: Any, *args: Any, **kwargs: Any) -> None: + self.n_nodes += 1 + super().visit(node, *args, **kwargs) + + def test_parse_trivial_grammar(self) -> None: + grammar = """ + start: 'a' + """ + rules = parse_string(grammar, GrammarParser) + visitor = self.Visitor() + + visitor.visit(rules) + + self.assertEqual(visitor.n_nodes, 6) + + def test_parse_or_grammar(self) -> None: + grammar = """ + start: rule + rule: 'a' | 'b' + """ + rules = parse_string(grammar, GrammarParser) + visitor = self.Visitor() + + visitor.visit(rules) + + # Grammar/Rule/Rhs/Alt/NamedItem/NameLeaf -> 6 + # Rule/Rhs/ -> 2 + # Alt/NamedItem/StringLeaf -> 3 + # Alt/NamedItem/StringLeaf -> 3 + + self.assertEqual(visitor.n_nodes, 14) + + def test_parse_repeat1_grammar(self) -> None: + grammar = """ + start: 'a'+ + """ + rules = parse_string(grammar, GrammarParser) + visitor = self.Visitor() + + visitor.visit(rules) + + # Grammar/Rule/Rhs/Alt/NamedItem/Repeat1/StringLeaf -> 6 + self.assertEqual(visitor.n_nodes, 7) + + def test_parse_repeat0_grammar(self) -> None: + grammar = """ + start: 'a'* + """ + rules = parse_string(grammar, GrammarParser) + visitor = self.Visitor() + + visitor.visit(rules) + + # Grammar/Rule/Rhs/Alt/NamedItem/Repeat0/StringLeaf -> 6 + + self.assertEqual(visitor.n_nodes, 7) + + def test_parse_optional_grammar(self) -> None: + grammar = """ + start: 'a' ['b'] + """ + rules = parse_string(grammar, GrammarParser) + visitor = self.Visitor() + + visitor.visit(rules) + + # Grammar/Rule/Rhs/Alt/NamedItem/StringLeaf -> 6 + # 
NamedItem/Opt/Rhs/Alt/NamedItem/Stringleaf -> 6 + + self.assertEqual(visitor.n_nodes, 12) + + +class TestGrammarVisualizer(unittest.TestCase): + def test_simple_rule(self) -> None: + grammar = """ + start: 'a' 'b' + """ + rules = parse_string(grammar, GrammarParser) + + printer = ASTGrammarPrinter() + lines: List[str] = [] + printer.print_grammar_ast(rules, printer=lines.append) + + output = "\n".join(lines) + expected_output = textwrap.dedent( + """\ + └──Rule + └──Rhs + └──Alt + ├──NamedItem + │ └──StringLeaf("'a'") + └──NamedItem + └──StringLeaf("'b'") + """ + ) + + self.assertEqual(output, expected_output) + + def test_multiple_rules(self) -> None: + grammar = """ + start: a b + a: 'a' + b: 'b' + """ + rules = parse_string(grammar, GrammarParser) + + printer = ASTGrammarPrinter() + lines: List[str] = [] + printer.print_grammar_ast(rules, printer=lines.append) + + output = "\n".join(lines) + expected_output = textwrap.dedent( + """\ + └──Rule + └──Rhs + └──Alt + ├──NamedItem + │ └──NameLeaf('a') + └──NamedItem + └──NameLeaf('b') + + └──Rule + └──Rhs + └──Alt + └──NamedItem + └──StringLeaf("'a'") + + └──Rule + └──Rhs + └──Alt + └──NamedItem + └──StringLeaf("'b'") + """ + ) + + self.assertEqual(output, expected_output) + + def test_deep_nested_rule(self) -> None: + grammar = """ + start: 'a' ['b'['c'['d']]] + """ + rules = parse_string(grammar, GrammarParser) + + printer = ASTGrammarPrinter() + lines: List[str] = [] + printer.print_grammar_ast(rules, printer=lines.append) + + output = "\n".join(lines) + print() + print(output) + expected_output = textwrap.dedent( + """\ + └──Rule + └──Rhs + └──Alt + ├──NamedItem + │ └──StringLeaf("'a'") + └──NamedItem + └──Opt + └──Rhs + └──Alt + ├──NamedItem + │ └──StringLeaf("'b'") + └──NamedItem + └──Opt + └──Rhs + └──Alt + ├──NamedItem + │ └──StringLeaf("'c'") + └──NamedItem + └──Opt + └──Rhs + └──Alt + └──NamedItem + └──StringLeaf("'d'") + """ + ) + + self.assertEqual(output, expected_output) + diff --git a/Lib/test/test_peg_parser.py b/Lib/test/test_peg_parser.py index f2d376119eb847..eae41aebd0fcb6 100644 --- a/Lib/test/test_peg_parser.py +++ b/Lib/test/test_peg_parser.py @@ -579,6 +579,33 @@ def f(): """), ] +FSTRINGS_TRACEBACKS = { + 'multiline_fstrings_same_line_with_brace': ( + """ + f''' + {a$b} + ''' + """, + '(a$b)', + ), + 'multiline_fstring_brace_on_next_line': ( + """ + f''' + {a$b + }''' + """, + '(a$b', + ), + 'multiline_fstring_brace_on_previous_line': ( + """ + f''' + { + a$b}''' + """, + 'a$b)', + ), +} + def cleanup_source(source: Any) -> str: if isinstance(source, str): @@ -648,3 +675,9 @@ def test_correct_ast_generation_without_pos_info(self) -> None: ast.dump(expected_ast), f"Wrong AST generation for source: {source}", ) + + def test_fstring_parse_error_tracebacks(self) -> None: + for source, error_text in FSTRINGS_TRACEBACKS.values(): + with self.assertRaises(SyntaxError) as se: + peg_parser.parse_string(dedent(source)) + self.assertEqual(error_text, se.exception.text) diff --git a/Tools/peg_generator/Makefile b/Tools/peg_generator/Makefile index 8861fd4ccec5a2..7132c57644e874 100644 --- a/Tools/peg_generator/Makefile +++ b/Tools/peg_generator/Makefile @@ -1,4 +1,11 @@ -PYTHON ?= python3.8 +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + PYTHON ?= ../../python +endif +ifeq ($(UNAME_S),Darwin) + PYTHON ?= ../../python.exe +endif + CPYTHON ?= ../.. MYPY ?= mypy @@ -8,22 +15,22 @@ TIMEFILE = data/xxl.py TESTDIR = . 
TESTFLAGS = --short -build: peg_parser/parse.c +build: peg_extension/parse.c -peg_parser/parse.c: $(GRAMMAR) pegen/*.py peg_parser/pegen.c peg_parser/parse_string.c peg_parser/*.h pegen/grammar_parser.py - $(PYTHON) -m pegen -q -c $(GRAMMAR) -o peg_parser/parse.c --compile-extension +peg_extension/parse.c: $(GRAMMAR) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen/pegen.c ../../Parser/pegen/parse_string.c ../../Parser/pegen/*.h pegen/grammar_parser.py + $(PYTHON) -m pegen -q -c $(GRAMMAR) -o peg_extension/parse.c --compile-extension clean: - -rm -f peg_parser/*.o peg_parser/*.so peg_parser/parse.c + -rm -f peg_extension/*.o peg_extension/*.so peg_extension/parse.c -dump: peg_parser/parse.c +dump: peg_extension/parse.c cat -n $(TESTFILE) - $(PYTHON) -c "from peg_parser import parse; import ast; t = parse.parse_file('$(TESTFILE)', mode=1); print(ast.dump(t))" + $(PYTHON) -c "from peg_extension import parse; import ast; t = parse.parse_file('$(TESTFILE)', mode=1); print(ast.dump(t))" regen-metaparser: pegen/metagrammar.gram pegen/*.py $(PYTHON) -m pegen -q -c pegen/metagrammar.gram -o pegen/grammar_parser.py -# Note: These targets really depend on the generated shared object in peg_parser/parse.*.so but +# Note: These targets really depend on the generated shared object in peg_extension/parse.*.so but # this has different names in different systems so we are abusing the implicit dependency on # parse.c by the use of --compile-extension. @@ -31,28 +38,28 @@ regen-metaparser: pegen/metagrammar.gram pegen/*.py test: run -run: peg_parser/parse.c - $(PYTHON) -c "from peg_parser import parse; t = parse.parse_file('$(TESTFILE)'); exec(t)" +run: peg_extension/parse.c + $(PYTHON) -c "from peg_extension import parse; t = parse.parse_file('$(TESTFILE)'); exec(t)" -compile: peg_parser/parse.c - $(PYTHON) -c "from peg_parser import parse; t = parse.parse_file('$(TESTFILE)', mode=2)" +compile: peg_extension/parse.c + $(PYTHON) -c "from peg_extension import parse; t = parse.parse_file('$(TESTFILE)', mode=2)" -parse: peg_parser/parse.c - $(PYTHON) -c "from peg_parser import parse; t = parse.parse_file('$(TESTFILE)', mode=1)" +parse: peg_extension/parse.c + $(PYTHON) -c "from peg_extension import parse; t = parse.parse_file('$(TESTFILE)', mode=1)" -check: peg_parser/parse.c - $(PYTHON) -c "from peg_parser import parse; t = parse.parse_file('$(TESTFILE)', mode=0)" +check: peg_extension/parse.c + $(PYTHON) -c "from peg_extension import parse; t = parse.parse_file('$(TESTFILE)', mode=0)" time: time_compile -time_compile: peg_parser/parse.c - /usr/bin/time -l $(PYTHON) -c "from peg_parser import parse; parse.parse_file('$(TIMEFILE)', mode=2)" +time_compile: peg_extension/parse.c + /usr/bin/time -l $(PYTHON) -c "from peg_extension import parse; parse.parse_file('$(TIMEFILE)', mode=2)" -time_parse: peg_parser/parse.c - /usr/bin/time -l $(PYTHON) -c "from peg_parser import parse; parse.parse_file('$(TIMEFILE)', mode=1)" +time_parse: peg_extension/parse.c + /usr/bin/time -l $(PYTHON) -c "from peg_extension import parse; parse.parse_file('$(TIMEFILE)', mode=1)" -time_check: peg_parser/parse.c - /usr/bin/time -l $(PYTHON) -c "from peg_parser import parse; parse.parse_file('$(TIMEFILE)', mode=0)" +time_check: peg_extension/parse.c + /usr/bin/time -l $(PYTHON) -c "from peg_extension import parse; parse.parse_file('$(TIMEFILE)', mode=0)" time_stdlib: time_stdlib_compile @@ -92,26 +99,7 @@ bench: cpython $(MAKE) -s test_global 2>/dev/null $(MAKE) -s test_global 2>/dev/null -# To install clang-format: -# on mac: 
"brew install clang-format" -# on ubuntu: "apt-get install clang-format" -# on arch: "pacman -S clang" -format-c: - clang-format peg_parser/pegen.c -i - -# To install clang-tidy: -# on mac: -# "brew install llvm" -# Then, create symlinks to the binaries. For example: -# ln -s "$(brew --prefix llvm)/bin/clang-format" "/usr/local/bin/clang-format" -# ln -s "$(brew --prefix llvm)/bin/clang-tidy" "/usr/local/bin/clang-tidy" -# on ubuntu: "apt-get install clang-tidy" -# on arch: "pacman -S clang" -clang-tidy: - $(eval COMPILE_OPTIONS = $(shell python-config --cflags)) - clang-tidy peg_parser/pegen.c -fix-errors -fix -checks="readability-braces-around-statements" -- $(COMPILE_OPTIONS) 1>/dev/null - -format: format-python format-c +format: format-python find_max_nesting: $(PYTHON) scripts/find_max_nesting.py diff --git a/Tools/peg_generator/data/python.gram b/Tools/peg_generator/data/python.gram index f0cf23264e18b0..1c794c516a06a6 100644 --- a/Tools/peg_generator/data/python.gram +++ b/Tools/peg_generator/data/python.gram @@ -338,21 +338,15 @@ primary[expr_ty]: (b) ? ((expr_ty) b)->v.Call.args : NULL, (b) ? ((expr_ty) b)->v.Call.keywords : NULL, EXTRA) } - | a=primary b=slicing { _Py_Subscript(a, b, Load, EXTRA) } + | a=primary '[' b=slices ']' { _Py_Subscript(a, b, Load, EXTRA) } | atom -slicing[slice_ty]: - | '[' b=expression ']' { _Py_Index(b, p->arena) } - | '[' b=slice_expressions ']' { b } - | '[' b=slices ']' { b } -slice_expressions[slice_ty]: - | a=','.expression+ [','] { _Py_Index(_Py_Tuple(a, Load, EXTRA), p->arena) } -slices[slice_ty]: +slices[expr_ty]: | a=slice !',' { a } - | a=','.slice+ [','] { _Py_ExtSlice(a, p->arena) } -slice[slice_ty]: - | a=[expression] ':' b=[expression] c=[':' d=[expression] { d }] { _Py_Slice(a, b, c, p->arena) } - | a=expression { _Py_Index(a, p->arena) } + | a=','.slice+ [','] { _Py_Tuple(a, Load, EXTRA) } +slice[expr_ty]: + | a=[expression] ':' b=[expression] c=[':' d=[expression] { d }] { _Py_Slice(a, b, c, EXTRA) } + | a=expression { a } atom[expr_ty]: | NAME | 'True' { _Py_Constant(Py_True, NULL, EXTRA) } @@ -432,7 +426,7 @@ star_targets_seq[asdl_seq*]: a=','.star_target+ [','] { a } star_target[expr_ty]: | '*' a=bitwise_or { _Py_Starred(CHECK(set_expr_context(p, a, Store)), Store, EXTRA) } | a=t_primary '.' b=NAME !t_lookahead { _Py_Attribute(a, b->v.Name.id, Store, EXTRA) } - | a=t_primary b=slicing !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } + | a=t_primary '[' b=slices ']' !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } | star_atom star_atom[expr_ty]: | a=NAME { set_expr_context(p, a, Store) } @@ -447,12 +441,12 @@ inside_paren_ann_assign_target[expr_ty]: ann_assign_subscript_attribute_target[expr_ty]: | a=t_primary '.' b=NAME !t_lookahead { _Py_Attribute(a, b->v.Name.id, Store, EXTRA) } - | a=t_primary b=slicing !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } + | a=t_primary '[' b=slices ']' !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } del_targets[asdl_seq*]: a=','.del_target+ [','] { a } del_target[expr_ty]: | a=t_primary '.' b=NAME !t_lookahead { _Py_Attribute(a, b->v.Name.id, Del, EXTRA) } - | a=t_primary b=slicing !t_lookahead { _Py_Subscript(a, b, Del, EXTRA) } + | a=t_primary '[' b=slices ']' !t_lookahead { _Py_Subscript(a, b, Del, EXTRA) } | del_t_atom del_t_atom[expr_ty]: | a=NAME { set_expr_context(p, a, Del) } @@ -463,11 +457,11 @@ del_t_atom[expr_ty]: targets[asdl_seq*]: a=','.target+ [','] { a } target[expr_ty]: | a=t_primary '.' 
b=NAME !t_lookahead { _Py_Attribute(a, b->v.Name.id, Store, EXTRA) } - | a=t_primary b=slicing !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } + | a=t_primary '[' b=slices ']' !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } | t_atom t_primary[expr_ty]: | a=t_primary '.' b=NAME &t_lookahead { _Py_Attribute(a, b->v.Name.id, Load, EXTRA) } - | a=t_primary b=slicing &t_lookahead { _Py_Subscript(a, b, Load, EXTRA) } + | a=t_primary '[' b=slices ']' &t_lookahead { _Py_Subscript(a, b, Load, EXTRA) } | a=t_primary b=genexp &t_lookahead { _Py_Call(a, CHECK(singleton_seq(p, b)), NULL, EXTRA) } | a=t_primary '(' b=[arguments] ')' &t_lookahead { _Py_Call(a, diff --git a/Tools/peg_generator/peg_extension/peg_extension.c b/Tools/peg_generator/peg_extension/peg_extension.c new file mode 100644 index 00000000000000..ff805e0d62e28b --- /dev/null +++ b/Tools/peg_generator/peg_extension/peg_extension.c @@ -0,0 +1,113 @@ +#include "pegen.h" + +PyObject * +_build_return_object(mod_ty module, int mode, PyObject *filename_ob, PyArena *arena) +{ + PyObject *result = NULL; + + if (mode == 2) { + result = (PyObject *)PyAST_CompileObject(module, filename_ob, NULL, -1, arena); + } else if (mode == 1) { + result = PyAST_mod2obj(module); + } else { + result = Py_None; + Py_INCREF(result); + + } + + return result; +} + +static PyObject * +parse_file(PyObject *self, PyObject *args, PyObject *kwds) +{ + static char *keywords[] = {"file", "mode", NULL}; + const char *filename; + int mode = 2; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|i", keywords, &filename, &mode)) { + return NULL; + } + if (mode < 0 || mode > 2) { + return PyErr_Format(PyExc_ValueError, "Bad mode, must be 0 <= mode <= 2"); + } + + PyArena *arena = PyArena_New(); + if (arena == NULL) { + return NULL; + } + + PyObject *result = NULL; + + PyObject *filename_ob = PyUnicode_FromString(filename); + if (filename_ob == NULL) { + goto error; + } + + mod_ty res = run_parser_from_file(filename, START, filename_ob, arena); + if (res == NULL) { + goto error; + } + + result = _build_return_object(res, mode, filename_ob, arena); + +error: + Py_XDECREF(filename_ob); + PyArena_Free(arena); + return result; +} + +static PyObject * +parse_string(PyObject *self, PyObject *args, PyObject *kwds) +{ + static char *keywords[] = {"str", "mode", NULL}; + const char *the_string; + int mode = 2; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|i", keywords, &the_string, &mode)) { + return NULL; + } + if (mode < 0 || mode > 2) { + return PyErr_Format(PyExc_ValueError, "Bad mode, must be 0 <= mode <= 2"); + } + + PyArena *arena = PyArena_New(); + if (arena == NULL) { + return NULL; + } + + PyObject *result = NULL; + + PyObject *filename_ob = PyUnicode_FromString("<string>"); + if (filename_ob == NULL) { + goto error; + } + + mod_ty res = run_parser_from_string(the_string, START, filename_ob, arena); + if (res == NULL) { + goto error; + } + result = _build_return_object(res, mode, filename_ob, arena); + +error: + Py_XDECREF(filename_ob); + PyArena_Free(arena); + return result; +} + +static PyMethodDef ParseMethods[] = { + {"parse_file", (PyCFunction)(void(*)(void))parse_file, METH_VARARGS|METH_KEYWORDS, "Parse a file."}, + {"parse_string", (PyCFunction)(void(*)(void))parse_string, METH_VARARGS|METH_KEYWORDS, "Parse a string."}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + +static struct PyModuleDef parsemodule = { + PyModuleDef_HEAD_INIT, + .m_name = "parse", + .m_doc = "A parser.", + .m_methods = ParseMethods, +}; + +PyMODINIT_FUNC +PyInit_parse(void) +{ + return
PyModule_Create(&parsemodule); +} diff --git a/Tools/peg_generator/peg_parser/parse_string.c b/Tools/peg_generator/peg_parser/parse_string.c deleted file mode 100644 index 4eaca41c1a3530..00000000000000 --- a/Tools/peg_generator/peg_parser/parse_string.c +++ /dev/null @@ -1,1427 +0,0 @@ -#include "parse_string.h" -#include "pegen.h" -#include "v38tokenizer.h" - -//// STRING HANDLING FUNCTIONS //// - -// These functions are ported directly from Python/ast.c with some modifications -// to account for the use of "Parser *p", the fact that don't have parser nodes -// to pass around and the usage of some specialized APIs present only in this -// file (like "raise_syntax_error"). - -static int -warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char) -{ - PyObject *msg = - PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char); - if (msg == NULL) { - return -1; - } - if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename, - p->tok->lineno, NULL, NULL) < 0) { - if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) { - /* Replace the DeprecationWarning exception with a SyntaxError - to get a more accurate error report */ - PyErr_Clear(); - raise_syntax_error(p, "invalid escape sequence \\%c", first_invalid_escape_char); - } - Py_DECREF(msg); - return -1; - } - Py_DECREF(msg); - return 0; -} - -static PyObject * -decode_utf8(const char **sPtr, const char *end) -{ - const char *s, *t; - t = s = *sPtr; - while (s < end && (*s & 0x80)) { - s++; - } - *sPtr = s; - return PyUnicode_DecodeUTF8(t, s - t, NULL); -} - -static PyObject * -decode_unicode_with_escapes(Parser *parser, const char *s, size_t len) -{ - PyObject *v, *u; - char *buf; - char *p; - const char *end; - - /* check for integer overflow */ - if (len > SIZE_MAX / 6) { - return NULL; - } - /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 - "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ - u = PyBytes_FromStringAndSize((char *)NULL, len * 6); - if (u == NULL) { - return NULL; - } - p = buf = PyBytes_AsString(u); - end = s + len; - while (s < end) { - if (*s == '\\') { - *p++ = *s++; - if (s >= end || *s & 0x80) { - strcpy(p, "u005c"); - p += 5; - if (s >= end) { - break; - } - } - } - if (*s & 0x80) { - PyObject *w; - int kind; - void *data; - Py_ssize_t len, i; - w = decode_utf8(&s, end); - if (w == NULL) { - Py_DECREF(u); - return NULL; - } - kind = PyUnicode_KIND(w); - data = PyUnicode_DATA(w); - len = PyUnicode_GET_LENGTH(w); - for (i = 0; i < len; i++) { - Py_UCS4 chr = PyUnicode_READ(kind, data, i); - sprintf(p, "\\U%08x", chr); - p += 10; - } - /* Should be impossible to overflow */ - assert(p - buf <= PyBytes_GET_SIZE(u)); - Py_DECREF(w); - } - else { - *p++ = *s++; - } - } - len = p - buf; - s = buf; - - const char *first_invalid_escape; - v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape); - - if (v != NULL && first_invalid_escape != NULL) { - if (warn_invalid_escape_sequence(parser, *first_invalid_escape) < 0) { - /* We have not decref u before because first_invalid_escape points - inside u. 
*/ - Py_XDECREF(u); - Py_DECREF(v); - return NULL; - } - } - Py_XDECREF(u); - return v; -} - -static PyObject * -decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len) -{ - const char *first_invalid_escape; - PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL, &first_invalid_escape); - if (result == NULL) { - return NULL; - } - - if (first_invalid_escape != NULL) { - if (warn_invalid_escape_sequence(p, *first_invalid_escape) < 0) { - Py_DECREF(result); - return NULL; - } - } - return result; -} - -/* s must include the bracketing quote characters, and r, b, u, - &/or f prefixes (if any), and embedded escape sequences (if any). - parsestr parses it, and sets *result to decoded Python string object. - If the string is an f-string, set *fstr and *fstrlen to the unparsed - string object. Return 0 if no errors occurred. */ -int -parsestr(Parser *p, const char *s, int *bytesmode, int *rawmode, PyObject **result, - const char **fstr, Py_ssize_t *fstrlen) -{ - size_t len; - int quote = Py_CHARMASK(*s); - int fmode = 0; - *bytesmode = 0; - *rawmode = 0; - *result = NULL; - *fstr = NULL; - if (Py_ISALPHA(quote)) { - while (!*bytesmode || !*rawmode) { - if (quote == 'b' || quote == 'B') { - quote = *++s; - *bytesmode = 1; - } - else if (quote == 'u' || quote == 'U') { - quote = *++s; - } - else if (quote == 'r' || quote == 'R') { - quote = *++s; - *rawmode = 1; - } - else if (quote == 'f' || quote == 'F') { - quote = *++s; - fmode = 1; - } - else { - break; - } - } - } - - if (fmode && *bytesmode) { - PyErr_BadInternalCall(); - return -1; - } - if (quote != '\'' && quote != '\"') { - PyErr_BadInternalCall(); - return -1; - } - /* Skip the leading quote char. */ - s++; - len = strlen(s); - if (len > INT_MAX) { - PyErr_SetString(PyExc_OverflowError, "string to parse is too long"); - return -1; - } - if (s[--len] != quote) { - /* Last quote char must match the first. */ - PyErr_BadInternalCall(); - return -1; - } - if (len >= 4 && s[0] == quote && s[1] == quote) { - /* A triple quoted string. We've already skipped one quote at - the start and one at the end of the string. Now skip the - two at the start. */ - s += 2; - len -= 2; - /* And check that the last two match. */ - if (s[--len] != quote || s[--len] != quote) { - PyErr_BadInternalCall(); - return -1; - } - } - - if (fmode) { - /* Just return the bytes. The caller will parse the resulting - string. */ - *fstr = s; - *fstrlen = len; - return 0; - } - - /* Not an f-string. */ - /* Avoid invoking escape decoding routines if possible. */ - *rawmode = *rawmode || strchr(s, '\\') == NULL; - if (*bytesmode) { - /* Disallow non-ASCII characters. */ - const char *ch; - for (ch = s; *ch; ch++) { - if (Py_CHARMASK(*ch) >= 0x80) { - raise_syntax_error(p, - "bytes can only contain ASCII " - "literal characters."); - return -1; - } - } - if (*rawmode) { - *result = PyBytes_FromStringAndSize(s, len); - } - else { - *result = decode_bytes_with_escapes(p, s, len); - } - } - else { - if (*rawmode) { - *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); - } - else { - *result = decode_unicode_with_escapes(p, s, len); - } - } - return *result == NULL ? 
-1 : 0; -} - - - -// FSTRING STUFF - -static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset); -static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset); - - -static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) { - if (parent->lineno < n->lineno) { - col = 0; - } - fstring_shift_expr_locations(n, line, col); -} - -static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) { - if (parent->lineno < n->lineno) { - col = 0; - } - fstring_shift_argument(parent, n, line, col); -} - -static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) { - for (int i = 0, l = asdl_seq_LEN(seq); i < l; i++) { - expr_ty expr = asdl_seq_GET(seq, i); - if (expr == NULL){ - continue; - } - shift_expr(parent, expr, lineno, col_offset); - } -} - -static void fstring_shift_slice_locations(expr_ty parent, slice_ty slice, int lineno, int col_offset) { - switch (slice->kind) { - case Slice_kind: - if (slice->v.Slice.lower) { - shift_expr(parent, slice->v.Slice.lower, lineno, col_offset); - } - if (slice->v.Slice.upper) { - shift_expr(parent, slice->v.Slice.upper, lineno, col_offset); - } - if (slice->v.Slice.step) { - shift_expr(parent, slice->v.Slice.step, lineno, col_offset); - } - break; - case ExtSlice_kind: - for (int i = 0, l = asdl_seq_LEN(slice->v.ExtSlice.dims); i < l; i++) { - slice_ty s = asdl_seq_GET(slice->v.ExtSlice.dims, i); - fstring_shift_slice_locations(parent, s, lineno, col_offset); - } - break; - case Index_kind: - shift_expr(parent, slice->v.Index.value, lineno, col_offset); - } -} - -static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) { - shift_expr(parent, comp->target, lineno, col_offset); - shift_expr(parent, comp->iter, lineno, col_offset); - fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset); -} - -static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) { - if (arg->annotation != NULL){ - shift_expr(parent, arg->annotation, lineno, col_offset); - } - arg->col_offset = arg->col_offset + col_offset; - arg->end_col_offset = arg->end_col_offset + col_offset; - arg->lineno = arg->lineno + lineno; - arg->end_lineno = arg->end_lineno + lineno; -} - -static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) { - for (int i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) { - arg_ty arg = asdl_seq_GET(args->posonlyargs, i); - shift_arg(parent, arg, lineno, col_offset); - } - - for (int i = 0, l = asdl_seq_LEN(args->args); i < l; i++) { - arg_ty arg = asdl_seq_GET(args->args, i); - shift_arg(parent, arg, lineno, col_offset); - } - - if (args->vararg != NULL) { - shift_arg(parent, args->vararg, lineno, col_offset); - } - - for (int i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) { - arg_ty arg = asdl_seq_GET(args->kwonlyargs, i); - shift_arg(parent, arg, lineno, col_offset); - } - - fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset); - - if (args->kwarg != NULL) { - shift_arg(parent, args->kwarg, lineno, col_offset); - } - - fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset); -} - -static void fstring_shift_children_locations(expr_ty n, int lineno, int col_offset) { - switch (n->kind) { - case BoolOp_kind: - fstring_shift_seq_locations(n, n->v.BoolOp.values, lineno, col_offset); - break; - case NamedExpr_kind: - shift_expr(n, n->v.NamedExpr.target, lineno, 
col_offset); - shift_expr(n, n->v.NamedExpr.value, lineno, col_offset); - break; - case BinOp_kind: - shift_expr(n, n->v.BinOp.left, lineno, col_offset); - shift_expr(n, n->v.BinOp.right, lineno, col_offset); - break; - case UnaryOp_kind: - shift_expr(n, n->v.UnaryOp.operand, lineno, col_offset); - break; - case Lambda_kind: - fstring_shift_arguments(n, n->v.Lambda.args, lineno, col_offset); - shift_expr(n, n->v.Lambda.body, lineno, col_offset); - break; - case IfExp_kind: - shift_expr(n, n->v.IfExp.test, lineno, col_offset); - shift_expr(n, n->v.IfExp.body, lineno, col_offset); - shift_expr(n, n->v.IfExp.orelse, lineno, col_offset); - break; - case Dict_kind: - fstring_shift_seq_locations(n, n->v.Dict.keys, lineno, col_offset); - fstring_shift_seq_locations(n, n->v.Dict.values, lineno, col_offset); - break; - case Set_kind: - fstring_shift_seq_locations(n, n->v.Set.elts, lineno, col_offset); - break; - case ListComp_kind: - shift_expr(n, n->v.ListComp.elt, lineno, col_offset); - for (int i = 0, l = asdl_seq_LEN(n->v.ListComp.generators); i < l; i++) { - comprehension_ty comp = asdl_seq_GET(n->v.ListComp.generators, i); - fstring_shift_comprehension(n, comp, lineno, col_offset); - } - break; - case SetComp_kind: - shift_expr(n, n->v.SetComp.elt, lineno, col_offset); - for (int i = 0, l = asdl_seq_LEN(n->v.SetComp.generators); i < l; i++) { - comprehension_ty comp = asdl_seq_GET(n->v.SetComp.generators, i); - fstring_shift_comprehension(n, comp, lineno, col_offset); - } - break; - case DictComp_kind: - shift_expr(n, n->v.DictComp.key, lineno, col_offset); - shift_expr(n, n->v.DictComp.value, lineno, col_offset); - for (int i = 0, l = asdl_seq_LEN(n->v.DictComp.generators); i < l; i++) { - comprehension_ty comp = asdl_seq_GET(n->v.DictComp.generators, i); - fstring_shift_comprehension(n, comp, lineno, col_offset); - } - break; - case GeneratorExp_kind: - shift_expr(n, n->v.GeneratorExp.elt, lineno, col_offset); - for (int i = 0, l = asdl_seq_LEN(n->v.GeneratorExp.generators); i < l; i++) { - comprehension_ty comp = asdl_seq_GET(n->v.GeneratorExp.generators, i); - fstring_shift_comprehension(n, comp, lineno, col_offset); - } - break; - case Await_kind: - shift_expr(n, n->v.Await.value, lineno, col_offset); - break; - case Yield_kind: - shift_expr(n, n->v.Yield.value, lineno, col_offset); - break; - case YieldFrom_kind: - shift_expr(n, n->v.YieldFrom.value, lineno, col_offset); - break; - case Compare_kind: - shift_expr(n, n->v.Compare.left, lineno, col_offset); - fstring_shift_seq_locations(n, n->v.Compare.comparators, lineno, col_offset); - break; - case Call_kind: - shift_expr(n, n->v.Call.func, lineno, col_offset); - fstring_shift_seq_locations(n, n->v.Call.args, lineno, col_offset); - for (int i = 0, l = asdl_seq_LEN(n->v.Call.keywords); i < l; i++) { - keyword_ty keyword = asdl_seq_GET(n->v.Call.keywords, i); - shift_expr(n, keyword->value, lineno, col_offset); - } - break; - case Attribute_kind: - shift_expr(n, n->v.Attribute.value, lineno, col_offset); - break; - case Subscript_kind: - shift_expr(n, n->v.Subscript.value, lineno, col_offset); - fstring_shift_slice_locations(n, n->v.Subscript.slice, lineno, col_offset); - break; - case Starred_kind: - shift_expr(n, n->v.Starred.value, lineno, col_offset); - break; - case List_kind: - fstring_shift_seq_locations(n, n->v.List.elts, lineno, col_offset); - break; - case Tuple_kind: - fstring_shift_seq_locations(n, n->v.Tuple.elts, lineno, col_offset); - break; - default: - return; - } -} - -/* Shift locations for the given node and all its 
children by adding `lineno` - and `col_offset` to existing locations. Note that n is the already parsed - expression. */ -static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset) -{ - n->col_offset = n->col_offset + col_offset; - - // The following is needed, in order for nodes spanning across multiple lines - // to be shifted correctly. An example of such a node is a Call node, the closing - // parenthesis of which is not on the same line as its name. - if (n->lineno == n->end_lineno) { - n->end_col_offset = n->end_col_offset + col_offset; - } - - fstring_shift_children_locations(n, lineno, col_offset); - n->lineno = n->lineno + lineno; - n->end_lineno = n->end_lineno + lineno; -} - -/* Fix locations for the given node and its children. - - `parent` is the enclosing node. - `n` is the node which locations are going to be fixed relative to parent. - `expr_str` is the child node's string representation, including braces. -*/ -static void -fstring_fix_expr_location(Token *parent, expr_ty n, char *expr_str) -{ - char *substr = NULL; - char *start; - int lines = parent->lineno - 1; - int cols = parent->col_offset; - - if (parent && parent->bytes) { - char *parent_str = PyBytes_AsString(parent->bytes); - if (!parent_str) { - return; - } - substr = strstr(parent_str, expr_str); - if (substr) { - // The following is needed, in order to correctly shift the column - // offset, in the case that (disregarding any whitespace) a newline - // immediately follows the opening curly brace of the fstring expression. - int newline_after_brace = 1; - start = substr + 1; - while (start && *start != '}' && *start != '\n') { - if (*start != ' ' && *start != '\t' && *start != '\f') { - newline_after_brace = 0; - break; - } - start++; - } - - // Account for the characters from the last newline character to our - // left until the beginning of substr. - if (!newline_after_brace) { - start = substr; - while (start > parent_str && start[0] != '\n') { - start--; - } - cols += (int)(substr - start); - } - /* adjust the start based on the number of newlines encountered - before the f-string expression */ - for (char* p = parent_str; p < substr; p++) { - if (*p == '\n') { - lines++; - } - } - } - } - fstring_shift_expr_locations(n, lines, cols); -} - - -/* Compile this expression in to an expr_ty. Add parens around the - expression, in order to allow leading spaces in the expression. */ -static expr_ty -fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end, - Token *t) -{ - mod_ty mod = NULL; - char *str; - Py_ssize_t len; - const char *s; - expr_ty result = NULL; - - assert(expr_end >= expr_start); - assert(*(expr_start-1) == '{'); - assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' || - *expr_end == '='); - - /* If the substring is all whitespace, it's an error. We need to catch this - here, and not when we call PyParser_SimpleParseStringFlagsFilename, - because turning the expression '' in to '()' would go from being invalid - to valid. */ - for (s = expr_start; s != expr_end; s++) { - char c = *s; - /* The Python parser ignores only the following whitespace - characters (\r already is converted to \n). */ - if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) { - break; - } - } - if (s == expr_end) { - raise_syntax_error(p, "f-string: empty expression not allowed"); - return NULL; - } - - len = expr_end - expr_start; - /* Allocate 3 extra bytes: open paren, close paren, null byte. 
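[Editor's note] As the surrounding comments say, `fstring_compile_expr()` wraps the expression text in parentheses so that leading whitespace inside the braces still parses, and the `len + 3` bytes allocated just below cover exactly that. A minimal standalone sketch of the buffer layout (`wrap_in_parens` is a hypothetical name; the real code uses `PyMem_RawMalloc`):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build "(<expr>)\0" from the raw expression text, mirroring the
 * '(' + expr + ')' + NUL layout the parser re-tokenizes. */
static char *wrap_in_parens(const char *expr_start, const char *expr_end) {
    size_t len = (size_t)(expr_end - expr_start);
    char *str = malloc(len + 3);           /* open paren, close paren, NUL */
    if (str == NULL)
        return NULL;
    str[0] = '(';
    memcpy(str + 1, expr_start, len);
    str[len + 1] = ')';
    str[len + 2] = '\0';
    return str;
}

int main(void) {
    const char *src = "  x + 1";           /* as in f'{  x + 1}' */
    char *wrapped = wrap_in_parens(src, src + strlen(src));
    if (wrapped != NULL) {
        printf("%s\n", wrapped);           /* prints: (  x + 1) */
        free(wrapped);
    }
    return 0;
}
```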
*/ - str = PyMem_RawMalloc(len + 3); - if (str == NULL) { - PyErr_NoMemory(); - return NULL; - } - - str[0] = '('; - memcpy(str+1, expr_start, len); - str[len+1] = ')'; - str[len+2] = 0; - - struct tok_state* tok = PyTokenizer_FromString(str, 1); - if (tok == NULL) { - return NULL; - } - tok->filename = PyUnicode_FromString(""); - if (!tok->filename) { - PyTokenizer_Free(tok); - return NULL; - } - mod_ty (*the_start_rule)(Parser*) = p->start_rule_func; - - Parser *p2 = PyMem_Malloc(sizeof(Parser)); - if (p2 == NULL) { - PyErr_Format(PyExc_MemoryError, "Out of memory for Parser"); - goto exit; - } - p2->tok = tok; - p2->input_mode = STRING_INPUT; - p2->keywords = p->keywords; - p2->n_keyword_lists = p->n_keyword_lists; - p2->tokens = PyMem_Malloc(sizeof(Token *)); - if (!p2->tokens) { - PyErr_Format(PyExc_MemoryError, "Out of memory for tokens"); - goto exit; - } - p2->tokens[0] = PyMem_Malloc(sizeof(Token)); - memset(p2->tokens[0], '\0', sizeof(Token)); - p2->mark = 0; - p2->fill = 0; - p2->size = 1; - p2->arena = p->arena; - p2->start_rule_func = the_start_rule; - if (fill_token(p2) < 0) { - goto exit; - } - PyErr_Clear(); - mod = the_start_rule(p2); - - if (mod == NULL){ - raise_syntax_error(p2, "invalid syntax"); - goto exit; - } - - if (asdl_seq_LEN(mod->v.Module.body) == 0) { - raise_syntax_error(p, "f-string: empty expression not allowed"); - goto exit; - } - - stmt_ty expr = asdl_seq_GET(mod->v.Module.body, 0); - if (asdl_seq_LEN(mod->v.Module.body) != 1 || expr->kind != Expr_kind) { - raise_syntax_error(p, "f-string: invalid expression"); - goto exit; - } - - /* Reuse str to find the correct column offset. */ - str[0] = '{'; - str[len+1] = '}'; - fstring_fix_expr_location(t, expr->v.Expr.value, str); - - result = expr->v.Expr.value; - -exit: - PyTokenizer_Free(tok); - for (int i = 0; i < p2->size; i++) { - PyMem_Free(p2->tokens[i]); - } - PyMem_Free(p2->tokens); - PyMem_Free(p2); - if (mod == NULL) { - return NULL; - } - return result; -} - -/* Return -1 on error. - - Return 0 if we reached the end of the literal. - - Return 1 if we haven't reached the end of the literal, but we want - the caller to process the literal up to this point. Used for - doubled braces. -*/ -static int -fstring_find_literal(Parser *p, const char **str, const char *end, int raw, - PyObject **literal, int recurse_lvl) -{ - /* Get any literal string. It ends when we hit an un-doubled left - brace (which isn't part of a unicode name escape such as - "\N{EULER CONSTANT}"), or the end of the string. */ - - const char *s = *str; - const char *literal_start = s; - int result = 0; - - assert(*literal == NULL); - while (s < end) { - char ch = *s++; - if (!raw && ch == '\\' && s < end) { - ch = *s++; - if (ch == 'N') { - if (s < end && *s++ == '{') { - while (s < end && *s++ != '}') { - } - continue; - } - break; - } - if (ch == '{' && warn_invalid_escape_sequence(p, ch) < 0) { - return -1; - } - } - if (ch == '{' || ch == '}') { - /* Check for doubled braces, but only at the top level. If - we checked at every level, then f'{0:{3}}' would fail - with the two closing braces. */ - if (recurse_lvl == 0) { - if (s < end && *s == ch) { - /* We're going to tell the caller that the literal ends - here, but that they should continue scanning. But also - skip over the second brace when we resume scanning. */ - *str = s + 1; - result = 1; - goto done; - } - - /* Where a single '{' is the start of a new expression, a - single '}' is not allowed. 
*/ - if (ch == '}') { - *str = s - 1; - raise_syntax_error(p, "f-string: single '}' is not allowed"); - return -1; - } - } - /* We're either at a '{', which means we're starting another - expression; or a '}', which means we're at the end of this - f-string (for a nested format_spec). */ - s--; - break; - } - } - *str = s; - assert(s <= end); - assert(s == end || *s == '{' || *s == '}'); -done: - if (literal_start != s) { - if (raw) - *literal = PyUnicode_DecodeUTF8Stateful(literal_start, - s - literal_start, - NULL, NULL); - else - *literal = decode_unicode_with_escapes(p, literal_start, - s - literal_start); - if (!*literal) - return -1; - } - return result; -} - -/* Forward declaration because parsing is recursive. */ -static expr_ty -fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl, - Token *first_token, Token* t, Token *last_token); - -/* Parse the f-string at *str, ending at end. We know *str starts an - expression (so it must be a '{'). Returns the FormattedValue node, which - includes the expression, conversion character, format_spec expression, and - optionally the text of the expression (if = is used). - - Note that I don't do a perfect job here: I don't make sure that a - closing brace doesn't match an opening paren, for example. It - doesn't need to error on all invalid expressions, just correctly - find the end of all valid ones. Any errors inside the expression - will be caught when we parse it later. - - *expression is set to the expression. For an '=' "debug" expression, - *expr_text is set to the debug text (the original text of the expression, - including the '=' and any whitespace around it, as a string object). If - not a debug expression, *expr_text set to NULL. */ -static int -fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl, - PyObject **expr_text, expr_ty *expression, Token *first_token, - Token *t, Token *last_token) -{ - /* Return -1 on error, else 0. */ - - const char *expr_start; - const char *expr_end; - expr_ty simple_expression; - expr_ty format_spec = NULL; /* Optional format specifier. */ - int conversion = -1; /* The conversion char. Use default if not - specified, or !r if using = and no format - spec. */ - - /* 0 if we're not in a string, else the quote char we're trying to - match (single or double quote). */ - char quote_char = 0; - - /* If we're inside a string, 1=normal, 3=triple-quoted. */ - int string_type = 0; - - /* Keep track of nesting level for braces/parens/brackets in - expressions. */ - Py_ssize_t nested_depth = 0; - char parenstack[MAXLEVEL]; - - *expr_text = NULL; - - /* Can only nest one level deep. */ - if (recurse_lvl >= 2) { - raise_syntax_error(p, "f-string: expressions nested too deeply"); - goto error; - } - - /* The first char must be a left brace, or we wouldn't have gotten - here. Skip over it. */ - assert(**str == '{'); - *str += 1; - - expr_start = *str; - for (; *str < end; (*str)++) { - char ch; - - /* Loop invariants. */ - assert(nested_depth >= 0); - assert(*str >= expr_start && *str < end); - if (quote_char) - assert(string_type == 1 || string_type == 3); - else - assert(string_type == 0); - - ch = **str; - /* Nowhere inside an expression is a backslash allowed. */ - if (ch == '\\') { - /* Error: can't include a backslash character, inside - parens or strings or not. */ - raise_syntax_error(p, - "f-string expression part " - "cannot include a backslash"); - goto error; - } - if (quote_char) { - /* We're inside a string. See if we're at the end. 
*/ - /* This code needs to implement the same non-error logic - as tok_get from tokenizer.c, at the letter_quote - label. To actually share that code would be a - nightmare. But, it's unlikely to change and is small, - so duplicate it here. Note we don't need to catch all - of the errors, since they'll be caught when parsing the - expression. We just need to match the non-error - cases. Thus we can ignore \n in single-quoted strings, - for example. Or non-terminated strings. */ - if (ch == quote_char) { - /* Does this match the string_type (single or triple - quoted)? */ - if (string_type == 3) { - if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) { - /* We're at the end of a triple quoted string. */ - *str += 2; - string_type = 0; - quote_char = 0; - continue; - } - } else { - /* We're at the end of a normal string. */ - quote_char = 0; - string_type = 0; - continue; - } - } - } else if (ch == '\'' || ch == '"') { - /* Is this a triple quoted string? */ - if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) { - string_type = 3; - *str += 2; - } else { - /* Start of a normal string. */ - string_type = 1; - } - /* Start looking for the end of the string. */ - quote_char = ch; - } else if (ch == '[' || ch == '{' || ch == '(') { - if (nested_depth >= MAXLEVEL) { - raise_syntax_error(p, "f-string: too many nested parenthesis"); - goto error; - } - parenstack[nested_depth] = ch; - nested_depth++; - } else if (ch == '#') { - /* Error: can't include a comment character, inside parens - or not. */ - raise_syntax_error(p, "f-string expression part cannot include '#'"); - goto error; - } else if (nested_depth == 0 && - (ch == '!' || ch == ':' || ch == '}' || - ch == '=' || ch == '>' || ch == '<')) { - /* See if there's a next character. */ - if (*str+1 < end) { - char next = *(*str+1); - - /* For "!=". since '=' is not an allowed conversion character, - nothing is lost in this test. */ - if ((ch == '!' && next == '=') || /* != */ - (ch == '=' && next == '=') || /* == */ - (ch == '<' && next == '=') || /* <= */ - (ch == '>' && next == '=') /* >= */ - ) { - *str += 1; - continue; - } - /* Don't get out of the loop for these, if they're single - chars (not part of 2-char tokens). If by themselves, they - don't end an expression (unlike say '!'). */ - if (ch == '>' || ch == '<') { - continue; - } - } - - /* Normal way out of this loop. */ - break; - } else if (ch == ']' || ch == '}' || ch == ')') { - if (!nested_depth) { - raise_syntax_error(p, "f-string: unmatched '%c'", ch); - goto error; - } - nested_depth--; - int opening = parenstack[nested_depth]; - if (!((opening == '(' && ch == ')') || - (opening == '[' && ch == ']') || - (opening == '{' && ch == '}'))) - { - raise_syntax_error(p, - "f-string: closing parenthesis '%c' " - "does not match opening parenthesis '%c'", - ch, opening); - goto error; - } - } else { - /* Just consume this char and loop around. */ - } - } - expr_end = *str; - /* If we leave this loop in a string or with mismatched parens, we - don't care. We'll get a syntax error when compiling the - expression. 
But, we can produce a better error message, so - let's just do that.*/ - if (quote_char) { - raise_syntax_error(p, "f-string: unterminated string"); - goto error; - } - if (nested_depth) { - int opening = parenstack[nested_depth - 1]; - raise_syntax_error(p, "f-string: unmatched '%c'", opening); - goto error; - } - - if (*str >= end) - goto unexpected_end_of_string; - - /* Compile the expression as soon as possible, so we show errors - related to the expression before errors related to the - conversion or format_spec. */ - simple_expression = fstring_compile_expr(p, expr_start, expr_end, t); - if (!simple_expression) - goto error; - - /* Check for =, which puts the text value of the expression in - expr_text. */ - if (**str == '=') { - *str += 1; - - /* Skip over ASCII whitespace. No need to test for end of string - here, since we know there's at least a trailing quote somewhere - ahead. */ - while (Py_ISSPACE(**str)) { - *str += 1; - } - - /* Set *expr_text to the text of the expression. */ - *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start); - if (!*expr_text) { - goto error; - } - } - - /* Check for a conversion char, if present. */ - if (**str == '!') { - *str += 1; - if (*str >= end) - goto unexpected_end_of_string; - - conversion = **str; - *str += 1; - - /* Validate the conversion. */ - if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) { - raise_syntax_error(p, - "f-string: invalid conversion character: " - "expected 's', 'r', or 'a'"); - goto error; - } - - } - - /* Check for the format spec, if present. */ - if (*str >= end) - goto unexpected_end_of_string; - if (**str == ':') { - *str += 1; - if (*str >= end) - goto unexpected_end_of_string; - - /* Parse the format spec. */ - format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1, - first_token, t, last_token); - if (!format_spec) - goto error; - } - - if (*str >= end || **str != '}') - goto unexpected_end_of_string; - - /* We're at a right brace. Consume it. */ - assert(*str < end); - assert(**str == '}'); - *str += 1; - - /* If we're in = mode (detected by non-NULL expr_text), and have no format - spec and no explicit conversion, set the conversion to 'r'. */ - if (*expr_text && format_spec == NULL && conversion == -1) { - conversion = 'r'; - } - - /* And now create the FormattedValue node that represents this - entire expression with the conversion and format spec. */ - //TODO: Fix this - *expression = FormattedValue(simple_expression, conversion, - format_spec, first_token->lineno, - first_token->col_offset, last_token->end_lineno, - last_token->end_col_offset, p->arena); - if (!*expression) - goto error; - - return 0; - -unexpected_end_of_string: - raise_syntax_error(p, "f-string: expecting '}'"); - /* Falls through to error. */ - -error: - Py_XDECREF(*expr_text); - return -1; - -} - -/* Return -1 on error. - - Return 0 if we have a literal (possible zero length) and an - expression (zero length if at the end of the string. - - Return 1 if we have a literal, but no expression, and we want the - caller to call us again. This is used to deal with doubled - braces. - - When called multiple times on the string 'a{{b{0}c', this function - will return: - - 1. the literal 'a{' with no expression, and a return value - of 1. Despite the fact that there's no expression, the return - value of 1 means we're not finished yet. - - 2. the literal 'b' and the expression '0', with a return value of - 0. The fact that there's an expression means we're not finished. - - 3. 
literal 'c' with no expression and a return value of 0. The - combination of the return value of 0 with no expression means - we're finished. -*/ -static int -fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw, - int recurse_lvl, PyObject **literal, - PyObject **expr_text, expr_ty *expression, - Token *first_token, Token *t, Token *last_token) -{ - int result; - - assert(*literal == NULL && *expression == NULL); - - /* Get any literal string. */ - result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl); - if (result < 0) - goto error; - - assert(result == 0 || result == 1); - - if (result == 1) - /* We have a literal, but don't look at the expression. */ - return 1; - - if (*str >= end || **str == '}') - /* We're at the end of the string or the end of a nested - f-string: no expression. The top-level error case where we - expect to be at the end of the string but we're at a '}' is - handled later. */ - return 0; - - /* We must now be the start of an expression, on a '{'. */ - assert(**str == '{'); - - if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text, - expression, first_token, t, last_token) < 0) - goto error; - - return 0; - -error: - Py_CLEAR(*literal); - return -1; -} - -#ifdef NDEBUG -#define ExprList_check_invariants(l) -#else -static void -ExprList_check_invariants(ExprList *l) -{ - /* Check our invariants. Make sure this object is "live", and - hasn't been deallocated. */ - assert(l->size >= 0); - assert(l->p != NULL); - if (l->size <= EXPRLIST_N_CACHED) - assert(l->data == l->p); -} -#endif - -static void -ExprList_Init(ExprList *l) -{ - l->allocated = EXPRLIST_N_CACHED; - l->size = 0; - - /* Until we start allocating dynamically, p points to data. */ - l->p = l->data; - - ExprList_check_invariants(l); -} - -static int -ExprList_Append(ExprList *l, expr_ty exp) -{ - ExprList_check_invariants(l); - if (l->size >= l->allocated) { - /* We need to alloc (or realloc) the memory. */ - Py_ssize_t new_size = l->allocated * 2; - - /* See if we've ever allocated anything dynamically. */ - if (l->p == l->data) { - Py_ssize_t i; - /* We're still using the cached data. Switch to - alloc-ing. */ - l->p = PyMem_RawMalloc(sizeof(expr_ty) * new_size); - if (!l->p) - return -1; - /* Copy the cached data into the new buffer. */ - for (i = 0; i < l->size; i++) - l->p[i] = l->data[i]; - } else { - /* Just realloc. */ - expr_ty *tmp = PyMem_RawRealloc(l->p, sizeof(expr_ty) * new_size); - if (!tmp) { - PyMem_RawFree(l->p); - l->p = NULL; - return -1; - } - l->p = tmp; - } - - l->allocated = new_size; - assert(l->allocated == 2 * l->size); - } - - l->p[l->size++] = exp; - - ExprList_check_invariants(l); - return 0; -} - -static void -ExprList_Dealloc(ExprList *l) -{ - ExprList_check_invariants(l); - - /* If there's been an error, or we've never dynamically allocated, - do nothing. */ - if (!l->p || l->p == l->data) { - /* Do nothing. */ - } else { - /* We have dynamically allocated. Free the memory. */ - PyMem_RawFree(l->p); - } - l->p = NULL; - l->size = -1; -} - -static asdl_seq * -ExprList_Finish(ExprList *l, PyArena *arena) -{ - asdl_seq *seq; - - ExprList_check_invariants(l); - - /* Allocate the asdl_seq and copy the expressions in to it. 
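[Editor's note] `ExprList_Append()` above pairs a small inline cache with doubling growth: short f-strings never touch the heap, and long ones get amortized constant-time appends. The same strategy in miniature (standalone C; the cache is shrunk to 4 entries so the demo actually overflows, whereas `EXPRLIST_N_CACHED` is 64 in parse_string.h):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define N_CACHED 4

typedef struct {
    long data[N_CACHED];      /* inline cache: no allocation for small lists */
    long *p;                  /* points at data until the cache overflows */
    size_t size, allocated;
} List;

static int append(List *l, long v) {
    if (l->size >= l->allocated) {
        size_t new_size = l->allocated * 2;    /* double on every overflow */
        if (l->p == l->data) {                 /* first overflow: move to heap */
            l->p = malloc(new_size * sizeof(long));
            if (l->p == NULL)
                return -1;
            memcpy(l->p, l->data, l->size * sizeof(long));
        }
        else {                                 /* already on the heap: realloc */
            long *tmp = realloc(l->p, new_size * sizeof(long));
            if (tmp == NULL)
                return -1;
            l->p = tmp;
        }
        l->allocated = new_size;
    }
    l->p[l->size++] = v;
    return 0;
}

int main(void) {
    List l = { {0}, NULL, 0, N_CACHED };
    l.p = l.data;
    for (long i = 0; i < 10; i++)
        append(&l, i);
    printf("size=%zu allocated=%zu on_heap=%d\n",
           l.size, l.allocated, l.p != l.data);  /* size=10 allocated=16 on_heap=1 */
    if (l.p != l.data)
        free(l.p);
    return 0;
}
```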
*/ - seq = _Py_asdl_seq_new(l->size, arena); - if (seq) { - Py_ssize_t i; - for (i = 0; i < l->size; i++) - asdl_seq_SET(seq, i, l->p[i]); - } - ExprList_Dealloc(l); - return seq; -} - -#ifdef NDEBUG -#define FstringParser_check_invariants(state) -#else -static void -FstringParser_check_invariants(FstringParser *state) -{ - if (state->last_str) - assert(PyUnicode_CheckExact(state->last_str)); - ExprList_check_invariants(&state->expr_list); -} -#endif - -void -FstringParser_Init(FstringParser *state) -{ - state->last_str = NULL; - state->fmode = 0; - ExprList_Init(&state->expr_list); - FstringParser_check_invariants(state); -} - -void -FstringParser_Dealloc(FstringParser *state) -{ - FstringParser_check_invariants(state); - - Py_XDECREF(state->last_str); - ExprList_Dealloc(&state->expr_list); -} - -/* Make a Constant node, but decref the PyUnicode object being added. */ -static expr_ty -make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token) -{ - PyObject *s = *str; - PyObject *kind = NULL; - *str = NULL; - assert(PyUnicode_CheckExact(s)); - if (PyArena_AddPyObject(p->arena, s) < 0) { - Py_DECREF(s); - return NULL; - } - const char* the_str = PyBytes_AsString(first_token->bytes); - if (the_str && the_str[0] == 'u') { - kind = new_identifier(p, "u"); - } - - if (kind == NULL && PyErr_Occurred()) { - return NULL; - } - - return Constant(s, kind, first_token->lineno, first_token->col_offset, - last_token->end_lineno, last_token->end_col_offset, p->arena); - -} - - -/* Add a non-f-string (that is, a regular literal string). str is - decref'd. */ -int -FstringParser_ConcatAndDel(FstringParser *state, PyObject *str) -{ - FstringParser_check_invariants(state); - - assert(PyUnicode_CheckExact(str)); - - if (PyUnicode_GET_LENGTH(str) == 0) { - Py_DECREF(str); - return 0; - } - - if (!state->last_str) { - /* We didn't have a string before, so just remember this one. */ - state->last_str = str; - } else { - /* Concatenate this with the previous string. */ - PyUnicode_AppendAndDel(&state->last_str, str); - if (!state->last_str) - return -1; - } - FstringParser_check_invariants(state); - return 0; -} - -/* Parse an f-string. The f-string is in *str to end, with no - 'f' or quotes. */ -int -FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str, - const char *end, int raw, int recurse_lvl, - Token *first_token, Token* t, Token *last_token) -{ - FstringParser_check_invariants(state); - state->fmode = 1; - - /* Parse the f-string. */ - while (1) { - PyObject *literal = NULL; - PyObject *expr_text = NULL; - expr_ty expression = NULL; - - /* If there's a zero length literal in front of the - expression, literal will be NULL. If we're at the end of - the f-string, expression will be NULL (unless result == 1, - see below). */ - int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl, - &literal, &expr_text, - &expression, first_token, t, last_token); - if (result < 0) - return -1; - - /* Add the literal, if any. */ - if (literal && FstringParser_ConcatAndDel(state, literal) < 0) { - Py_XDECREF(expr_text); - return -1; - } - /* Add the expr_text, if any. */ - if (expr_text && FstringParser_ConcatAndDel(state, expr_text) < 0) { - return -1; - } - - /* We've dealt with the literal and expr_text, their ownership has - been transferred to the state object. Don't look at them again. */ - - /* See if we should just loop around to get the next literal - and expression, while ignoring the expression this - time. 
This is used for un-doubling braces, as an - optimization. */ - if (result == 1) - continue; - - if (!expression) - /* We're done with this f-string. */ - break; - - /* We know we have an expression. Convert any existing string - to a Constant node. */ - if (!state->last_str) { - /* Do nothing. No previous literal. */ - } else { - /* Convert the existing last_str literal to a Constant node. */ - expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token); - if (!str || ExprList_Append(&state->expr_list, str) < 0) - return -1; - } - - if (ExprList_Append(&state->expr_list, expression) < 0) - return -1; - } - - /* If recurse_lvl is zero, then we must be at the end of the - string. Otherwise, we must be at a right brace. */ - - if (recurse_lvl == 0 && *str < end-1) { - raise_syntax_error(p, "f-string: unexpected end of string"); - return -1; - } - if (recurse_lvl != 0 && **str != '}') { - raise_syntax_error(p, "f-string: expecting '}'"); - return -1; - } - - FstringParser_check_invariants(state); - return 0; -} - -/* Convert the partial state reflected in last_str and expr_list to an - expr_ty. The expr_ty can be a Constant, or a JoinedStr. */ -expr_ty -FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token, - Token *last_token) -{ - asdl_seq *seq; - - FstringParser_check_invariants(state); - - /* If we're just a constant string with no expressions, return - that. */ - if (!state->fmode) { - assert(!state->expr_list.size); - if (!state->last_str) { - /* Create a zero length string. */ - state->last_str = PyUnicode_FromStringAndSize(NULL, 0); - if (!state->last_str) - goto error; - } - return make_str_node_and_del(p, &state->last_str, first_token, last_token); - } - - /* Create a Constant node out of last_str, if needed. It will be the - last node in our expression list. */ - if (state->last_str) { - expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token); - if (!str || ExprList_Append(&state->expr_list, str) < 0) - goto error; - } - /* This has already been freed. */ - assert(state->last_str == NULL); - - seq = ExprList_Finish(&state->expr_list, p->arena); - if (!seq) - goto error; - - return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset, - last_token->end_lineno, last_token->end_col_offset, p->arena); - -error: - FstringParser_Dealloc(state); - return NULL; -} - -/* Given an f-string (with no 'f' or quotes) that's in *str and ends - at end, parse it into an expr_ty. Return NULL on error. Adjust - str to point past the parsed portion. */ -static expr_ty -fstring_parse(Parser *p, const char **str, const char *end, int raw, - int recurse_lvl, Token *first_token, Token* t, Token *last_token) -{ - FstringParser state; - - FstringParser_Init(&state); - if (FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl, - first_token, t, last_token) < 0) { - FstringParser_Dealloc(&state); - return NULL; - } - - return FstringParser_Finish(p, &state, t, t); -} diff --git a/Tools/peg_generator/peg_parser/parse_string.h b/Tools/peg_generator/peg_parser/parse_string.h deleted file mode 100644 index e769bd1dde9259..00000000000000 --- a/Tools/peg_generator/peg_parser/parse_string.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef STRINGS_H -#define STRINGS_H - -#include -#include -#include "pegen.h" - -#define EXPRLIST_N_CACHED 64 - -typedef struct { - /* Incrementally build an array of expr_ty, so be used in an - asdl_seq. 
Cache some small but reasonably sized number of - expr_ty's, and then after that start dynamically allocating, - doubling the number allocated each time. Note that the f-string - f'{0}a{1}' contains 3 expr_ty's: 2 FormattedValue's, and one - Constant for the literal 'a'. So you add expr_ty's about twice as - fast as you add expressions in an f-string. */ - - Py_ssize_t allocated; /* Number we've allocated. */ - Py_ssize_t size; /* Number we've used. */ - expr_ty *p; /* Pointer to the memory we're actually - using. Will point to 'data' until we - start dynamically allocating. */ - expr_ty data[EXPRLIST_N_CACHED]; -} ExprList; - -/* The FstringParser is designed to add a mix of strings and - f-strings, and concat them together as needed. Ultimately, it - generates an expr_ty. */ -typedef struct { - PyObject *last_str; - ExprList expr_list; - int fmode; -} FstringParser; - -void FstringParser_Init(FstringParser *); -int parsestr(Parser *, const char *, int *, int *, PyObject **, - const char **, Py_ssize_t *); -int FstringParser_ConcatFstring(Parser *, FstringParser *, const char **, - const char *, int, int, Token *, Token *, - Token *); -int FstringParser_ConcatAndDel(FstringParser *, PyObject *); -expr_ty FstringParser_Finish(Parser *, FstringParser *, Token *, Token *); -void FstringParser_Dealloc(FstringParser *); - -#endif diff --git a/Tools/peg_generator/peg_parser/pegen.c b/Tools/peg_generator/peg_parser/pegen.c deleted file mode 100644 index c3a3329a96f3a8..00000000000000 --- a/Tools/peg_generator/peg_parser/pegen.c +++ /dev/null @@ -1,1450 +0,0 @@ -#include -#include "pegen.h" -#include "v38tokenizer.h" -#include "parse_string.h" - -PyObject * -new_identifier(Parser *p, char *identifier) -{ - PyObject *id = PyUnicode_FromString(identifier); - if (id == NULL) { - return NULL; - } - if (PyArena_AddPyObject(p->arena, id) < 0) { - Py_DECREF(id); - return NULL; - } - return id; -} - -static PyObject * -_create_dummy_identifier(Parser *p) -{ - return new_identifier(p, ""); -} - -static inline Py_ssize_t -byte_offset_to_character_offset(PyObject *line, int col_offset) -{ - const char *str = PyUnicode_AsUTF8(line); - PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL); - if (!text) { - return 0; - } - Py_ssize_t size = PyUnicode_GET_LENGTH(text); - Py_DECREF(text); - return size; -} - -static inline PyObject * -get_error_line(char *buffer) -{ - char *newline = strchr(buffer, '\n'); - if (newline) { - return PyUnicode_FromStringAndSize(buffer, newline - buffer); - } - else { - return PyUnicode_FromString(buffer); - } -} - -int -raise_syntax_error(Parser *p, const char *errmsg, ...) -{ - PyObject *value = NULL; - PyObject *errstr = NULL; - PyObject *loc = NULL; - PyObject *tmp = NULL; - Token *t = p->tokens[p->fill - 1]; - va_list va; - - va_start(va, errmsg); - errstr = PyUnicode_FromFormatV(errmsg, va); - va_end(va); - if (!errstr) { - goto error; - } - if (p->input_mode == FILE_INPUT) { - loc = PyErr_ProgramTextObject(p->tok->filename, t->lineno); - if (!loc) { - Py_INCREF(Py_None); - loc = Py_None; - } - } - else { - assert(p->input_mode == STRING_INPUT); - loc = get_error_line(p->tok->buf); - if (!loc) { - goto error; - } - } - // We may receive tokens with the col_offset not initialized (-1) since - // emitted by fill_token(). For instance, this can happen in some error - // situations involving invalid indentation. - int col_offset = t->col_offset == -1 ? 
0 : t->col_offset; - Py_ssize_t col_number = byte_offset_to_character_offset(loc, col_offset) + 1; - tmp = Py_BuildValue("(OiiN)", p->tok->filename, t->lineno, col_number, loc); - if (!tmp) { - goto error; - } - value = PyTuple_Pack(2, errstr, tmp); - Py_DECREF(tmp); - if (!value) { - goto error; - } - PyErr_SetObject(PyExc_SyntaxError, value); - Py_DECREF(errstr); - Py_DECREF(value); - return 0; - -error: - Py_XDECREF(errstr); - Py_XDECREF(loc); - return -1; -} - -// Enable this if you uncomment any of the comments calling token_name(). -#if 0 -static const char * -token_name(int type) -{ - if (0 <= type && type <= N_TOKENS) { - return _PyParser_TokenNames[type]; - } - return ""; -} -#endif - -// Here, mark is the start of the node, while p->mark is the end. -// If node==NULL, they should be the same. -int -insert_memo(Parser *p, int mark, int type, void *node) -{ - // Insert in front - Memo *m = PyArena_Malloc(p->arena, sizeof(Memo)); - if (m == NULL) { - return -1; - } - m->type = type; - m->node = node; - m->mark = p->mark; - m->next = p->tokens[mark]->memo; - p->tokens[mark]->memo = m; - return 0; -} - -// Like insert_memo(), but updates an existing node if found. -int -update_memo(Parser *p, int mark, int type, void *node) -{ - for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) { - if (m->type == type) { - // Update existing node. - m->node = node; - m->mark = p->mark; - return 0; - } - } - // Insert new node. - return insert_memo(p, mark, type, node); -} - -// Return dummy NAME. -void * -CONSTRUCTOR(Parser *p, ...) -{ - static void *cache = NULL; - - if (cache != NULL) { - return cache; - } - - PyObject *id = _create_dummy_identifier(p); - if (!id) { - return NULL; - } - cache = Name(id, Load, 1, 0, 1, 0, p->arena); - return cache; -} - -static int -_get_keyword_or_name_type(Parser *p, char *name, int name_len) -{ - if (name_len >= p->n_keyword_lists || p->keywords[name_len] == NULL) { - return NAME; - } - for (KeywordToken *k = p->keywords[name_len]; k->type != -1; k++) { - if (strncmp(k->str, name, name_len) == 0) { - return k->type; - } - } - return NAME; -} - -int -fill_token(Parser *p) -{ - char *start, *end; - int type = PyTokenizer_Get(p->tok, &start, &end); - if (type == ERRORTOKEN) { - if (!PyErr_Occurred()) { - PyErr_Format(PyExc_SyntaxError, "Tokenizer returned error token"); - // There is no reliable column information for this error - PyErr_SyntaxLocationObject(p->tok->filename, p->tok->lineno, 0); - } - return -1; - } - - if (p->fill == p->size) { - int newsize = p->size * 2; - p->tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *)); - if (p->tokens == NULL) { - PyErr_Format(PyExc_MemoryError, "Realloc tokens failed"); - return -1; - } - for (int i = p->size; i < newsize; i++) { - p->tokens[i] = PyMem_Malloc(sizeof(Token)); - memset(p->tokens[i], '\0', sizeof(Token)); - } - p->size = newsize; - } - - Token *t = p->tokens[p->fill]; - t->type = (type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : type; - t->bytes = PyBytes_FromStringAndSize(start, end - start); - if (t->bytes == NULL) { - return -1; - } - PyArena_AddPyObject(p->arena, t->bytes); - - int lineno = type == STRING ? p->tok->first_lineno : p->tok->lineno; - const char *line_start = type == STRING ? 
p->tok->multi_line_start : p->tok->line_start; - int end_lineno = p->tok->lineno; - int col_offset = -1, end_col_offset = -1; - if (start != NULL && start >= line_start) { - col_offset = start - line_start; - } - if (end != NULL && end >= p->tok->line_start) { - end_col_offset = end - p->tok->line_start; - } - - t->lineno = lineno; - t->col_offset = col_offset; - t->end_lineno = end_lineno; - t->end_col_offset = end_col_offset; - - // if (p->fill % 100 == 0) fprintf(stderr, "Filled at %d: %s \"%s\"\n", p->fill, - // token_name(type), PyBytes_AsString(t->bytes)); - p->fill += 1; - return 0; -} - -int // bool -is_memoized(Parser *p, int type, void *pres) -{ - if (p->mark == p->fill) { - if (fill_token(p) < 0) { - return -1; - } - } - - Token *t = p->tokens[p->mark]; - - for (Memo *m = t->memo; m != NULL; m = m->next) { - if (m->type == type) { - p->mark = m->mark; - *(void **)(pres) = m->node; - // fprintf(stderr, "%d < %d: memoized!\n", p->mark, p->fill); - return 1; - } - } - // fprintf(stderr, "%d < %d: not memoized\n", p->mark, p->fill); - return 0; -} - -int -lookahead_with_string(int positive, void *(func)(Parser *, const char *), Parser *p, - const char *arg) -{ - int mark = p->mark; - void *res = func(p, arg); - p->mark = mark; - return (res != NULL) == positive; -} - -int -lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg) -{ - int mark = p->mark; - void *res = func(p, arg); - p->mark = mark; - return (res != NULL) == positive; -} - -int -lookahead(int positive, void *(func)(Parser *), Parser *p) -{ - int mark = p->mark; - void *res = func(p); - p->mark = mark; - return (res != NULL) == positive; -} - -Token * -expect_token(Parser *p, int type) -{ - if (p->mark == p->fill) { - if (fill_token(p) < 0) { - return NULL; - } - } - Token *t = p->tokens[p->mark]; - if (t->type != type) { - // fprintf(stderr, "No %s at %d\n", token_name(type), p->mark); - return NULL; - } - p->mark += 1; - // fprintf(stderr, "Got %s at %d: %s\n", token_name(type), p->mark, - // PyBytes_AsString(t->bytes)); - - return t; -} - -Token * -get_last_nonnwhitespace_token(Parser *p) -{ - assert(p->mark >= 0); - Token *token = NULL; - for (int m = p->mark - 1; m >= 0; m--) { - token = p->tokens[m]; - if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) { - break; - } - } - return token; -} - -void * -async_token(Parser *p) -{ - return expect_token(p, ASYNC); -} - -void * -await_token(Parser *p) -{ - return expect_token(p, AWAIT); -} - -void * -endmarker_token(Parser *p) -{ - return expect_token(p, ENDMARKER); -} - -expr_ty -name_token(Parser *p) -{ - Token *t = expect_token(p, NAME); - if (t == NULL) { - return NULL; - } - char *s; - Py_ssize_t n; - if (PyBytes_AsStringAndSize(t->bytes, &s, &n) < 0) { - return NULL; - } - PyObject *id = PyUnicode_DecodeUTF8(s, n, NULL); - if (id == NULL) { - return NULL; - } - if (PyArena_AddPyObject(p->arena, id) < 0) { - Py_DECREF(id); - return NULL; - } - // TODO: What new_identifier() does. 
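[Editor's note] `fill_token()` above classifies NAME tokens through `_get_keyword_or_name_type()`, whose table is indexed by keyword length, so a lookup scans only same-length candidates. A self-contained model of that lookup (the token-type numbers and the tiny keyword set are invented for the demo):

```c
#include <stdio.h>
#include <string.h>

typedef struct { const char *str; int type; } KeywordToken;

/* Keywords grouped by length; a -1 type terminates each list. */
static KeywordToken len2[] = { {"if", 510}, {"in", 511}, {"is", 512}, {NULL, -1} };
static KeywordToken len3[] = { {"for", 520}, {"not", 521}, {NULL, -1} };
static KeywordToken *keywords[] = { NULL, NULL, len2, len3 };
static const int n_keyword_lists = 4;

enum { NAME = 1 };

static int keyword_or_name(const char *name, int len) {
    if (len >= n_keyword_lists || keywords[len] == NULL)
        return NAME;                       /* no keyword has this length */
    for (KeywordToken *k = keywords[len]; k->type != -1; k++)
        if (strncmp(k->str, name, len) == 0)
            return k->type;                /* exact keyword match */
    return NAME;
}

int main(void) {
    printf("%d %d %d\n",
           keyword_or_name("in", 2),       /* 511: a keyword */
           keyword_or_name("it", 2),       /* 1: same length, not a keyword */
           keyword_or_name("spam", 4));    /* 1: no length-4 list at all */
    return 0;
}
```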
- return Name(id, Load, t->lineno, t->col_offset, t->end_lineno, t->end_col_offset, - p->arena); -} - -void * -string_token(Parser *p) -{ - return expect_token(p, STRING); -} - -void * -newline_token(Parser *p) -{ - return expect_token(p, NEWLINE); -} - -void * -indent_token(Parser *p) -{ - return expect_token(p, INDENT); -} - -void * -dedent_token(Parser *p) -{ - return expect_token(p, DEDENT); -} - -static PyObject * -parsenumber_raw(const char *s) -{ - const char *end; - long x; - double dx; - Py_complex compl; - int imflag; - - assert(s != NULL); - errno = 0; - end = s + strlen(s) - 1; - imflag = *end == 'j' || *end == 'J'; - if (s[0] == '0') { - x = (long)PyOS_strtoul(s, (char **)&end, 0); - if (x < 0 && errno == 0) { - return PyLong_FromString(s, (char **)0, 0); - } - } - else - x = PyOS_strtol(s, (char **)&end, 0); - if (*end == '\0') { - if (errno != 0) - return PyLong_FromString(s, (char **)0, 0); - return PyLong_FromLong(x); - } - /* XXX Huge floats may silently fail */ - if (imflag) { - compl.real = 0.; - compl.imag = PyOS_string_to_double(s, (char **)&end, NULL); - if (compl.imag == -1.0 && PyErr_Occurred()) - return NULL; - return PyComplex_FromCComplex(compl); - } - else { - dx = PyOS_string_to_double(s, NULL, NULL); - if (dx == -1.0 && PyErr_Occurred()) - return NULL; - return PyFloat_FromDouble(dx); - } -} - -static PyObject * -parsenumber(const char *s) -{ - char *dup, *end; - PyObject *res = NULL; - - assert(s != NULL); - - if (strchr(s, '_') == NULL) { - return parsenumber_raw(s); - } - /* Create a duplicate without underscores. */ - dup = PyMem_Malloc(strlen(s) + 1); - if (dup == NULL) { - return PyErr_NoMemory(); - } - end = dup; - for (; *s; s++) { - if (*s != '_') { - *end++ = *s; - } - } - *end = '\0'; - res = parsenumber_raw(dup); - PyMem_Free(dup); - return res; -} - -expr_ty -number_token(Parser *p) -{ - Token *t = expect_token(p, NUMBER); - if (t == NULL) { - return NULL; - } - - char *num_raw = PyBytes_AsString(t->bytes); - - if (num_raw == NULL) { - return NULL; - } - - PyObject *c = parsenumber(num_raw); - - if (c == NULL) { - return NULL; - } - - if (PyArena_AddPyObject(p->arena, c) < 0) { - Py_DECREF(c); - return NULL; - } - - return Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno, t->end_col_offset, - p->arena); -} - -void -Parser_Free(Parser *p) -{ - for (int i = 0; i < p->size; i++) { - PyMem_Free(p->tokens[i]); - } - PyMem_Free(p->tokens); - PyMem_Free(p); -} - -Parser * -Parser_New(struct tok_state *tok, mod_ty (*parse_func)(Parser *), int input_mode, - PyArena *arena) -{ - Parser *p = PyMem_Malloc(sizeof(Parser)); - if (p == NULL) { - PyErr_Format(PyExc_MemoryError, "Out of memory for Parser"); - return NULL; - } - assert(tok != NULL); - p->tok = tok; - p->input_mode = input_mode; - p->keywords = reserved_keywords; - p->n_keyword_lists = n_keyword_lists; - p->tokens = PyMem_Malloc(sizeof(Token *)); - if (!p->tokens) { - PyErr_Format(PyExc_MemoryError, "Out of memory for tokens"); - return NULL; - } - p->tokens[0] = PyMem_Malloc(sizeof(Token)); - memset(p->tokens[0], '\0', sizeof(Token)); - p->mark = 0; - p->fill = 0; - p->size = 1; - - p->arena = arena; - - if (fill_token(p) < 0) { - return NULL; - } - - p->start_rule_func = parse_func; - - return p; -} - -mod_ty -run_parser(Parser *p) -{ - int error = setjmp(p->error_env); - if (error) { - return NULL; - } - - mod_ty (*parse_func)(Parser *) = p->start_rule_func; - mod_ty res = (*parse_func)(p); - if (res == NULL) { - if (PyErr_Occurred()) { - return NULL; - } - if (p->fill == 0) { - 
raise_syntax_error(p, "error at start before reading any input"); - } - else { - raise_syntax_error(p, "invalid syntax"); - } - return NULL; - } - - return res; -} - -mod_ty -run_parser_from_file(const char *filename, mod_ty (*parse_func)(Parser *), - PyObject *filename_ob, PyArena *arena) -{ - FILE *fp = fopen(filename, "rb"); - if (fp == NULL) { - PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename); - return NULL; - } - - // From here on we need to clean up even if there's an error - mod_ty result = NULL; - - struct tok_state *tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL); - if (tok == NULL) { - goto error; - } - // This transfers the ownership to the tokenizer - tok->filename = filename_ob; - Py_INCREF(filename_ob); - - Parser *p = Parser_New(tok, parse_func, FILE_INPUT, arena); - if (p == NULL) { - goto after_tok_error; - } - - result = run_parser(p); - Parser_Free(p); - -after_tok_error: - PyTokenizer_Free(tok); -error: - fclose(fp); - return result; -} - -mod_ty -run_parser_from_string(const char *str, mod_ty (*parse_func)(Parser *), PyObject *filename_ob, - PyArena *arena) -{ - struct tok_state *tok = PyTokenizer_FromString(str, 1); - if (tok == NULL) { - return NULL; - } - // This transfers the ownership to the tokenizer - tok->filename = filename_ob; - Py_INCREF(filename_ob); - - // We need to clear up from here on - mod_ty result = NULL; - - Parser *p = Parser_New(tok, parse_func, STRING_INPUT, arena); - if (p == NULL) { - goto error; - } - - result = run_parser(p); - Parser_Free(p); - -error: - PyTokenizer_Free(tok); - return result; -} - -/* Creates a single-element asdl_seq* that contains a */ -asdl_seq * -singleton_seq(Parser *p, void *a) -{ - assert(a != NULL); - asdl_seq *seq = _Py_asdl_seq_new(1, p->arena); - if (!seq) { - return NULL; - } - asdl_seq_SET(seq, 0, a); - return seq; -} - -/* Creates a copy of seq and prepends a to it */ -asdl_seq * -seq_insert_in_front(Parser *p, void *a, asdl_seq *seq) -{ - assert(a != NULL); - if (!seq) { - return singleton_seq(p, a); - } - - asdl_seq *new_seq = _Py_asdl_seq_new(asdl_seq_LEN(seq) + 1, p->arena); - if (!new_seq) { - return NULL; - } - - asdl_seq_SET(new_seq, 0, a); - for (int i = 1, l = asdl_seq_LEN(new_seq); i < l; i++) { - asdl_seq_SET(new_seq, i, asdl_seq_GET(seq, i - 1)); - } - return new_seq; -} - -static int -_get_flattened_seq_size(asdl_seq *seqs) -{ - int size = 0; - for (int i = 0, l = asdl_seq_LEN(seqs); i < l; i++) { - asdl_seq *inner_seq = asdl_seq_GET(seqs, i); - size += asdl_seq_LEN(inner_seq); - } - return size; -} - -/* Flattens an asdl_seq* of asdl_seq*s */ -asdl_seq * -seq_flatten(Parser *p, asdl_seq *seqs) -{ - int flattened_seq_size = _get_flattened_seq_size(seqs); - assert(flattened_seq_size > 0); - - asdl_seq *flattened_seq = _Py_asdl_seq_new(flattened_seq_size, p->arena); - if (!flattened_seq) { - return NULL; - } - - int flattened_seq_idx = 0; - for (int i = 0, l = asdl_seq_LEN(seqs); i < l; i++) { - asdl_seq *inner_seq = asdl_seq_GET(seqs, i); - for (int j = 0, li = asdl_seq_LEN(inner_seq); j < li; j++) { - asdl_seq_SET(flattened_seq, flattened_seq_idx++, asdl_seq_GET(inner_seq, j)); - } - } - assert(flattened_seq_idx == flattened_seq_size); - - return flattened_seq; -} - -/* Creates a new name of the form . 
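[Editor's note] The comment above evidently lost its angle-bracketed text in rendering: it describes creating a new name of the form `<first_name>.<second_name>`, which `join_names_with_dot()` (defined next) builds for dotted names such as `import a.b`. A plain-C sketch of the same joining arithmetic, with `malloc` standing in for the arena and unicode APIs:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *join_with_dot(const char *first, const char *second) {
    size_t len = strlen(first) + strlen(second) + 1;   /* +1 for the dot */
    char *s = malloc(len + 1);                         /* +1 for the NUL */
    if (s == NULL)
        return NULL;
    sprintf(s, "%s.%s", first, second);                /* first + '.' + second */
    return s;
}

int main(void) {
    char *joined = join_with_dot("os", "path");
    if (joined != NULL) {
        printf("%s\n", joined);    /* prints: os.path */
        free(joined);
    }
    return 0;
}
```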
*/ -expr_ty -join_names_with_dot(Parser *p, expr_ty first_name, expr_ty second_name) -{ - assert(first_name != NULL && second_name != NULL); - PyObject *first_identifier = first_name->v.Name.id; - PyObject *second_identifier = second_name->v.Name.id; - - if (PyUnicode_READY(first_identifier) == -1) { - return NULL; - } - if (PyUnicode_READY(second_identifier) == -1) { - return NULL; - } - const char *first_str = PyUnicode_AsUTF8(first_identifier); - if (!first_str) { - return NULL; - } - const char *second_str = PyUnicode_AsUTF8(second_identifier); - if (!second_str) { - return NULL; - } - ssize_t len = strlen(first_str) + strlen(second_str) + 1; // +1 for the dot - - PyObject *str = PyBytes_FromStringAndSize(NULL, len); - if (!str) { - return NULL; - } - - char *s = PyBytes_AS_STRING(str); - if (!s) { - return NULL; - } - - strcpy(s, first_str); - s += strlen(first_str); - *s++ = '.'; - strcpy(s, second_str); - s += strlen(second_str); - *s = '\0'; - - PyObject *uni = PyUnicode_DecodeUTF8(PyBytes_AS_STRING(str), PyBytes_GET_SIZE(str), NULL); - Py_DECREF(str); - if (!uni) { - return NULL; - } - PyUnicode_InternInPlace(&uni); - if (PyArena_AddPyObject(p->arena, uni) < 0) { - Py_DECREF(uni); - return NULL; - } - - return _Py_Name(uni, Load, EXTRA_EXPR(first_name, second_name)); -} - -/* Counts the total number of dots in seq's tokens */ -int -seq_count_dots(asdl_seq *seq) -{ - int number_of_dots = 0; - for (int i = 0, l = asdl_seq_LEN(seq); i < l; i++) { - Token *current_expr = asdl_seq_GET(seq, i); - switch (current_expr->type) { - case ELLIPSIS: - number_of_dots += 3; - break; - case DOT: - number_of_dots += 1; - break; - default: - assert(current_expr->type == ELLIPSIS || current_expr->type == DOT); - } - } - - return number_of_dots; -} - -/* Creates an alias with '*' as the identifier name */ -alias_ty -alias_for_star(Parser *p) -{ - PyObject *str = PyUnicode_InternFromString("*"); - if (!str) { - return NULL; - } - if (PyArena_AddPyObject(p->arena, str) < 0) { - Py_DECREF(str); - return NULL; - } - return alias(str, NULL, p->arena); -} - -/* Creates a new asdl_seq* with the identifiers of all the names in seq */ -asdl_seq * -map_names_to_ids(Parser *p, asdl_seq *seq) -{ - int len = asdl_seq_LEN(seq); - assert(len > 0); - - asdl_seq *new_seq = _Py_asdl_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (int i = 0; i < len; i++) { - expr_ty e = asdl_seq_GET(seq, i); - asdl_seq_SET(new_seq, i, e->v.Name.id); - } - return new_seq; -} - -/* Constructs a CmpopExprPair */ -CmpopExprPair * -cmpop_expr_pair(Parser *p, cmpop_ty cmpop, expr_ty expr) -{ - assert(expr != NULL); - CmpopExprPair *a = PyArena_Malloc(p->arena, sizeof(CmpopExprPair)); - if (!a) { - return NULL; - } - a->cmpop = cmpop; - a->expr = expr; - return a; -} - -asdl_int_seq * -get_cmpops(Parser *p, asdl_seq *seq) -{ - int len = asdl_seq_LEN(seq); - assert(len > 0); - - asdl_int_seq *new_seq = _Py_asdl_int_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (int i = 0; i < len; i++) { - CmpopExprPair *pair = asdl_seq_GET(seq, i); - asdl_seq_SET(new_seq, i, pair->cmpop); - } - return new_seq; -} - -asdl_seq * -get_exprs(Parser *p, asdl_seq *seq) -{ - int len = asdl_seq_LEN(seq); - assert(len > 0); - - asdl_seq *new_seq = _Py_asdl_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (int i = 0; i < len; i++) { - CmpopExprPair *pair = asdl_seq_GET(seq, i); - asdl_seq_SET(new_seq, i, pair->expr); - } - return new_seq; -} - -/* Creates an asdl_seq* where all the elements have been changed 
to have ctx as context */ -static asdl_seq * -_set_seq_context(Parser *p, asdl_seq *seq, expr_context_ty ctx) -{ - int len = asdl_seq_LEN(seq); - if (len == 0) { - return NULL; - } - - asdl_seq *new_seq = _Py_asdl_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (int i = 0; i < len; i++) { - expr_ty e = asdl_seq_GET(seq, i); - asdl_seq_SET(new_seq, i, set_expr_context(p, e, ctx)); - } - return new_seq; -} - -static expr_ty -_set_name_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _Py_Name(e->v.Name.id, ctx, EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_tuple_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _Py_Tuple(_set_seq_context(p, e->v.Tuple.elts, ctx), ctx, EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_list_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _Py_List(_set_seq_context(p, e->v.List.elts, ctx), ctx, EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_subscript_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _Py_Subscript(e->v.Subscript.value, e->v.Subscript.slice, ctx, EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_attribute_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _Py_Attribute(e->v.Attribute.value, e->v.Attribute.attr, ctx, EXTRA_EXPR(e, e)); -} - -expr_ty -_set_starred_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _Py_Starred(set_expr_context(p, e->v.Starred.value, ctx), ctx, EXTRA_EXPR(e, e)); -} - -/* Creates an `expr_ty` equivalent to `expr` but with `ctx` as context */ -expr_ty -set_expr_context(Parser *p, expr_ty expr, expr_context_ty ctx) -{ - assert(expr != NULL); - - expr_ty new = NULL; - switch (expr->kind) { - case Name_kind: - new = _set_name_context(p, expr, ctx); - break; - case Tuple_kind: - new = _set_tuple_context(p, expr, ctx); - break; - case List_kind: - new = _set_list_context(p, expr, ctx); - break; - case Subscript_kind: - new = _set_subscript_context(p, expr, ctx); - break; - case Attribute_kind: - new = _set_attribute_context(p, expr, ctx); - break; - case Starred_kind: - new = _set_starred_context(p, expr, ctx); - break; - default: - new = expr; - } - return new; -} - -/* Constructs a KeyValuePair that is used when parsing a dict's key value pairs */ -KeyValuePair * -key_value_pair(Parser *p, expr_ty key, expr_ty value) -{ - KeyValuePair *a = PyArena_Malloc(p->arena, sizeof(KeyValuePair)); - if (!a) { - return NULL; - } - a->key = key; - a->value = value; - return a; -} - -/* Extracts all keys from an asdl_seq* of KeyValuePair*'s */ -asdl_seq * -get_keys(Parser *p, asdl_seq *seq) -{ - int len = asdl_seq_LEN(seq); - asdl_seq *new_seq = _Py_asdl_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (int i = 0; i < len; i++) { - KeyValuePair *pair = asdl_seq_GET(seq, i); - asdl_seq_SET(new_seq, i, pair->key); - } - return new_seq; -} - -/* Extracts all values from an asdl_seq* of KeyValuePair*'s */ -asdl_seq * -get_values(Parser *p, asdl_seq *seq) -{ - int len = asdl_seq_LEN(seq); - asdl_seq *new_seq = _Py_asdl_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (int i = 0; i < len; i++) { - KeyValuePair *pair = asdl_seq_GET(seq, i); - asdl_seq_SET(new_seq, i, pair->value); - } - return new_seq; -} - -/* Constructs a NameDefaultPair */ -NameDefaultPair * -name_default_pair(Parser *p, arg_ty arg, expr_ty value) -{ - NameDefaultPair *a = PyArena_Malloc(p->arena, sizeof(NameDefaultPair)); - if (!a) { - return NULL; - } - a->arg = arg; - a->value = value; - return a; -} - -/* Constructs a SlashWithDefault */ 
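/* For example, given a parameter list like "def f(a, b=1, /, c): ...", the
   slash rule's action builds a SlashWithDefault with plain_names = [a] and
   names_with_defaults = [(b, 1)]; make_arguments() further down merges both
   groups into posonlyargs and folds the default 1 into posdefaults. */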
-SlashWithDefault * -slash_with_default(Parser *p, asdl_seq *plain_names, asdl_seq *names_with_defaults) -{ - SlashWithDefault *a = PyArena_Malloc(p->arena, sizeof(SlashWithDefault)); - if (!a) { - return NULL; - } - a->plain_names = plain_names; - a->names_with_defaults = names_with_defaults; - return a; -} - -/* Constructs a StarEtc */ -StarEtc * -star_etc(Parser *p, arg_ty vararg, asdl_seq *kwonlyargs, arg_ty kwarg) -{ - StarEtc *a = PyArena_Malloc(p->arena, sizeof(StarEtc)); - if (!a) { - return NULL; - } - a->vararg = vararg; - a->kwonlyargs = kwonlyargs; - a->kwarg = kwarg; - return a; -} - -static asdl_seq * -_join_seqs(Parser *p, asdl_seq *a, asdl_seq *b) -{ - int first_len = asdl_seq_LEN(a); - int second_len = asdl_seq_LEN(b); - asdl_seq *new_seq = _Py_asdl_seq_new(first_len + second_len, p->arena); - if (!new_seq) { - return NULL; - } - - int k = 0; - for (int i = 0; i < first_len; i++) { - asdl_seq_SET(new_seq, k++, asdl_seq_GET(a, i)); - } - for (int i = 0; i < second_len; i++) { - asdl_seq_SET(new_seq, k++, asdl_seq_GET(b, i)); - } - - return new_seq; -} - -static asdl_seq * -_get_names(Parser *p, asdl_seq *names_with_defaults) -{ - int len = asdl_seq_LEN(names_with_defaults); - asdl_seq *seq = _Py_asdl_seq_new(len, p->arena); - if (!seq) { - return NULL; - } - for (int i = 0; i < len; i++) { - NameDefaultPair *pair = asdl_seq_GET(names_with_defaults, i); - asdl_seq_SET(seq, i, pair->arg); - } - return seq; -} - -static asdl_seq * -_get_defaults(Parser *p, asdl_seq *names_with_defaults) -{ - int len = asdl_seq_LEN(names_with_defaults); - asdl_seq *seq = _Py_asdl_seq_new(len, p->arena); - if (!seq) { - return NULL; - } - for (int i = 0; i < len; i++) { - NameDefaultPair *pair = asdl_seq_GET(names_with_defaults, i); - asdl_seq_SET(seq, i, pair->value); - } - return seq; -} - -/* Constructs an arguments_ty object out of all the parsed constructs in the parameters rule */ -arguments_ty -make_arguments(Parser *p, asdl_seq *slash_without_default, - SlashWithDefault *slash_with_default, asdl_seq *plain_names, - asdl_seq *names_with_default, StarEtc *star_etc) -{ - asdl_seq *posonlyargs; - if (slash_without_default != NULL) { - posonlyargs = slash_without_default; - } - else if (slash_with_default != NULL) { - asdl_seq *slash_with_default_names = - _get_names(p, slash_with_default->names_with_defaults); - if (!slash_with_default_names) { - return NULL; - } - posonlyargs = _join_seqs(p, slash_with_default->plain_names, slash_with_default_names); - if (!posonlyargs) { - return NULL; - } - } - else { - posonlyargs = _Py_asdl_seq_new(0, p->arena); - if (!posonlyargs) { - return NULL; - } - } - - asdl_seq *posargs; - if (plain_names != NULL && names_with_default != NULL) { - asdl_seq *names_with_default_names = _get_names(p, names_with_default); - if (!names_with_default_names) { - return NULL; - } - posargs = _join_seqs(p, plain_names, names_with_default_names); - if (!posargs) { - return NULL; - } - } - else if (plain_names == NULL && names_with_default != NULL) { - posargs = _get_names(p, names_with_default); - if (!posargs) { - return NULL; - } - } - else if (plain_names != NULL && names_with_default == NULL) { - posargs = plain_names; - } - else { - posargs = _Py_asdl_seq_new(0, p->arena); - if (!posargs) { - return NULL; - } - } - - asdl_seq *posdefaults; - if (slash_with_default != NULL && names_with_default != NULL) { - asdl_seq *slash_with_default_values = - _get_defaults(p, slash_with_default->names_with_defaults); - if (!slash_with_default_values) { - return NULL; - } - asdl_seq 
*names_with_default_values = _get_defaults(p, names_with_default); - if (!names_with_default_values) { - return NULL; - } - posdefaults = _join_seqs(p, slash_with_default_values, names_with_default_values); - if (!posdefaults) { - return NULL; - } - } - else if (slash_with_default == NULL && names_with_default != NULL) { - posdefaults = _get_defaults(p, names_with_default); - if (!posdefaults) { - return NULL; - } - } - else if (slash_with_default != NULL && names_with_default == NULL) { - posdefaults = _get_defaults(p, slash_with_default->names_with_defaults); - if (!posdefaults) { - return NULL; - } - } - else { - posdefaults = _Py_asdl_seq_new(0, p->arena); - if (!posdefaults) { - return NULL; - } - } - - arg_ty vararg = NULL; - if (star_etc != NULL && star_etc->vararg != NULL) { - vararg = star_etc->vararg; - } - - asdl_seq *kwonlyargs; - if (star_etc != NULL && star_etc->kwonlyargs != NULL) { - kwonlyargs = _get_names(p, star_etc->kwonlyargs); - if (!kwonlyargs) { - return NULL; - } - } - else { - kwonlyargs = _Py_asdl_seq_new(0, p->arena); - if (!kwonlyargs) { - return NULL; - } - } - - asdl_seq *kwdefaults; - if (star_etc != NULL && star_etc->kwonlyargs != NULL) { - kwdefaults = _get_defaults(p, star_etc->kwonlyargs); - if (!kwdefaults) { - return NULL; - } - } - else { - kwdefaults = _Py_asdl_seq_new(0, p->arena); - if (!kwdefaults) { - return NULL; - } - } - - arg_ty kwarg = NULL; - if (star_etc != NULL && star_etc->kwarg != NULL) { - kwarg = star_etc->kwarg; - } - - return _Py_arguments(posonlyargs, posargs, vararg, kwonlyargs, kwdefaults, kwarg, - posdefaults, p->arena); -} - -/* Constructs an empty arguments_ty object that gets used when a function accepts no - * arguments. */ -arguments_ty -empty_arguments(Parser *p) -{ - asdl_seq *posonlyargs = _Py_asdl_seq_new(0, p->arena); - if (!posonlyargs) { - return NULL; - } - asdl_seq *posargs = _Py_asdl_seq_new(0, p->arena); - if (!posargs) { - return NULL; - } - asdl_seq *posdefaults = _Py_asdl_seq_new(0, p->arena); - if (!posdefaults) { - return NULL; - } - asdl_seq *kwonlyargs = _Py_asdl_seq_new(0, p->arena); - if (!kwonlyargs) { - return NULL; - } - asdl_seq *kwdefaults = _Py_asdl_seq_new(0, p->arena); - if (!kwdefaults) { - return NULL; - } - - return _Py_arguments(posonlyargs, posargs, NULL, kwonlyargs, kwdefaults, NULL, posdefaults, - p->arena); -} - -/* Encapsulates the value of an operator_ty into an AugOperator struct */ -AugOperator * -augoperator(Parser *p, operator_ty kind) -{ - AugOperator *a = PyArena_Malloc(p->arena, sizeof(AugOperator)); - if (!a) { - return NULL; - } - a->kind = kind; - return a; -} - -/* Construct a FunctionDef equivalent to function_def, but with decorators */ -stmt_ty -function_def_decorators(Parser *p, asdl_seq *decorators, stmt_ty function_def) -{ - assert(function_def != NULL); - if (function_def->kind == AsyncFunctionDef_kind) { - return _Py_AsyncFunctionDef( - function_def->v.FunctionDef.name, function_def->v.FunctionDef.args, - function_def->v.FunctionDef.body, decorators, function_def->v.FunctionDef.returns, - function_def->v.FunctionDef.type_comment, function_def->lineno, - function_def->col_offset, function_def->end_lineno, function_def->end_col_offset, - p->arena); - } - - return _Py_FunctionDef(function_def->v.FunctionDef.name, function_def->v.FunctionDef.args, - function_def->v.FunctionDef.body, decorators, - function_def->v.FunctionDef.returns, - function_def->v.FunctionDef.type_comment, function_def->lineno, - function_def->col_offset, function_def->end_lineno, -
function_def->end_col_offset, p->arena); -} - -/* Construct a ClassDef equivalent to class_def, but with decorators */ -stmt_ty -class_def_decorators(Parser *p, asdl_seq *decorators, stmt_ty class_def) -{ - assert(class_def != NULL); - return _Py_ClassDef(class_def->v.ClassDef.name, class_def->v.ClassDef.bases, - class_def->v.ClassDef.keywords, class_def->v.ClassDef.body, decorators, - class_def->lineno, class_def->col_offset, class_def->end_lineno, - class_def->end_col_offset, p->arena); -} - -/* Construct a KeywordOrStarred */ -KeywordOrStarred * -keyword_or_starred(Parser *p, void *element, int is_keyword) -{ - KeywordOrStarred *a = PyArena_Malloc(p->arena, sizeof(KeywordOrStarred)); - if (!a) { - return NULL; - } - a->element = element; - a->is_keyword = is_keyword; - return a; -} - -/* Get the number of starred expressions in an asdl_seq* of KeywordOrStarred*s */ -static int -_seq_number_of_starred_exprs(asdl_seq *seq) -{ - int n = 0; - for (int i = 0, l = asdl_seq_LEN(seq); i < l; i++) { - KeywordOrStarred *k = asdl_seq_GET(seq, i); - if (!k->is_keyword) { - n++; - } - } - return n; -} - -/* Extract the starred expressions of an asdl_seq* of KeywordOrStarred*s */ -asdl_seq * -seq_extract_starred_exprs(Parser *p, asdl_seq *kwargs) -{ - int new_len = _seq_number_of_starred_exprs(kwargs); - if (new_len == 0) { - return NULL; - } - asdl_seq *new_seq = _Py_asdl_seq_new(new_len, p->arena); - if (!new_seq) { - return NULL; - } - - int idx = 0; - for (int i = 0, len = asdl_seq_LEN(kwargs); i < len; i++) { - KeywordOrStarred *k = asdl_seq_GET(kwargs, i); - if (!k->is_keyword) { - asdl_seq_SET(new_seq, idx++, k->element); - } - } - return new_seq; -} - -/* Return a new asdl_seq* with only the keywords in kwargs */ -asdl_seq * -seq_delete_starred_exprs(Parser *p, asdl_seq *kwargs) -{ - int len = asdl_seq_LEN(kwargs); - int new_len = len - _seq_number_of_starred_exprs(kwargs); - if (new_len == 0) { - return NULL; - } - asdl_seq *new_seq = _Py_asdl_seq_new(new_len, p->arena); - if (!new_seq) { - return NULL; - } - - int idx = 0; - for (int i = 0; i < len; i++) { - KeywordOrStarred *k = asdl_seq_GET(kwargs, i); - if (k->is_keyword) { - asdl_seq_SET(new_seq, idx++, k->element); - } - } - return new_seq; -} - -expr_ty -concatenate_strings(Parser *p, asdl_seq *strings) -{ - int len = asdl_seq_LEN(strings); - assert(len > 0); - - Token *first = asdl_seq_GET(strings, 0); - Token *last = asdl_seq_GET(strings, len - 1); - - int bytesmode = 0; - PyObject *bytes_str = NULL; - - FstringParser state; - FstringParser_Init(&state); - - for (int i = 0; i < len; i++) { - Token *t = asdl_seq_GET(strings, i); - - int this_bytesmode; - int this_rawmode; - PyObject *s; - const char *fstr; - Py_ssize_t fstrlen = -1; - - char *this_str = PyBytes_AsString(t->bytes); - if (!this_str) { - goto error; - } - - if (parsestr(p, this_str, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen) != 0) { - goto error; - } - - /* Check that we are not mixing bytes with unicode. */ - if (i != 0 && bytesmode != this_bytesmode) { - raise_syntax_error(p, "cannot mix bytes and nonbytes literals"); - Py_XDECREF(s); - goto error; - } - bytesmode = this_bytesmode; - - if (fstr != NULL) { - assert(s == NULL && !bytesmode); - - int result = FstringParser_ConcatFstring(p, &state, &fstr, fstr + fstrlen, - this_rawmode, 0, first, t, last); - if (result < 0) { - goto error; - } - } - else { - /* String or byte string. */ - assert(s != NULL && fstr == NULL); - assert(bytesmode ? 
PyBytes_CheckExact(s) : PyUnicode_CheckExact(s)); - - if (bytesmode) { - if (i == 0) { - bytes_str = s; - } - else { - PyBytes_ConcatAndDel(&bytes_str, s); - if (!bytes_str) { - goto error; - } - } - } - else { - /* This is a regular string. Concatenate it. */ - if (FstringParser_ConcatAndDel(&state, s) < 0) { - goto error; - } - } - } - } - - if (bytesmode) { - if (PyArena_AddPyObject(p->arena, bytes_str) < 0) { - goto error; - } - return Constant(bytes_str, NULL, first->lineno, first->col_offset, last->end_lineno, - last->end_col_offset, p->arena); - } - - return FstringParser_Finish(p, &state, first, last); - -error: - Py_XDECREF(bytes_str); - FstringParser_Dealloc(&state); - return NULL; -} diff --git a/Tools/peg_generator/peg_parser/pegen.h b/Tools/peg_generator/peg_parser/pegen.h deleted file mode 100644 index 0c9b4639039191..00000000000000 --- a/Tools/peg_generator/peg_parser/pegen.h +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef PEGEN_H -#define PEGEN_H - -#define PY_SSIZE_T_CLEAN -#include <Python.h> -#include <setjmp.h> -#include <Python-ast.h> -#include <pyarena.h> -#include <token.h> - -enum INPUT_MODE { - FILE_INPUT, - STRING_INPUT, -}; -typedef enum INPUT_MODE INPUT_MODE; - -typedef struct _memo { - int type; - void *node; - int mark; - struct _memo *next; -} Memo; - -typedef struct { - int type; - PyObject *bytes; - int lineno, col_offset, end_lineno, end_col_offset; - Memo *memo; -} Token; - -typedef struct { - char *str; - int type; -} KeywordToken; - -typedef struct { - struct tok_state *tok; - Token **tokens; - int mark; - int fill, size; - PyArena *arena; - KeywordToken **keywords; - int n_keyword_lists; - void *start_rule_func; - INPUT_MODE input_mode; - jmp_buf error_env; -} Parser; - -typedef struct { - cmpop_ty cmpop; - expr_ty expr; -} CmpopExprPair; - -typedef struct { - expr_ty key; - expr_ty value; -} KeyValuePair; - -typedef struct { - arg_ty arg; - expr_ty value; -} NameDefaultPair; - -typedef struct { - asdl_seq *plain_names; - asdl_seq *names_with_defaults; // asdl_seq* of NameDefaultPair's -} SlashWithDefault; - -typedef struct { - arg_ty vararg; - asdl_seq *kwonlyargs; // asdl_seq* of NameDefaultPair's - arg_ty kwarg; -} StarEtc; - -typedef struct { - operator_ty kind; -} AugOperator; - -typedef struct { - void *element; - int is_keyword; -} KeywordOrStarred; - -extern const int n_keyword_lists; -extern KeywordToken *reserved_keywords[]; - -int insert_memo(Parser *p, int mark, int type, void *node); -int update_memo(Parser *p, int mark, int type, void *node); -int is_memoized(Parser *p, int type, void *pres); - -int lookahead_with_string(int, void *(func)(Parser *, const char *), Parser *, const char *); -int lookahead_with_int(int, Token *(func)(Parser *, int), Parser *, int); -int lookahead(int, void *(func)(Parser *), Parser *); - -Token *expect_token(Parser *p, int type); -Token *get_last_nonnwhitespace_token(Parser *); -int fill_token(Parser *p); -void *async_token(Parser *p); -void *await_token(Parser *p); -void *endmarker_token(Parser *p); -expr_ty name_token(Parser *p); -void *newline_token(Parser *p); -void *indent_token(Parser *p); -void *dedent_token(Parser *p); -expr_ty number_token(Parser *p); -void *string_token(Parser *p); -int raise_syntax_error(Parser *p, const char *errmsg, ...); -void *CONSTRUCTOR(Parser *p, ...); - -#define UNUSED(expr) do { (void)(expr); } while (0) -#define EXTRA_EXPR(head, tail) head->lineno, head->col_offset, tail->end_lineno, tail->end_col_offset, p->arena -#define EXTRA start_lineno, start_col_offset, end_lineno, end_col_offset, p->arena - -Py_LOCAL_INLINE(void *)
-CHECK_CALL(Parser *p, void *result) -{ - if (result == NULL) { - assert(PyErr_Occurred()); - longjmp(p->error_env, 1); - } - return result; -} - -/* This is needed for helper functions that are allowed to - return NULL without an error. Example: seq_extract_starred_exprs */ -Py_LOCAL_INLINE(void *) -CHECK_CALL_NULL_ALLOWED(Parser *p, void *result) -{ - if (result == NULL && PyErr_Occurred()) { - longjmp(p->error_env, 1); - } - return result; -} - -#define CHECK(result) CHECK_CALL(p, result) -#define CHECK_NULL_ALLOWED(result) CHECK_CALL_NULL_ALLOWED(p, result) - -PyObject *new_identifier(Parser *, char *); -mod_ty run_parser_from_file(const char *, mod_ty(*)(Parser *), PyObject *, PyArena *); -mod_ty run_parser_from_string(const char *, mod_ty(*)(Parser *), PyObject *, PyArena *); -asdl_seq *singleton_seq(Parser *, void *); -asdl_seq *seq_insert_in_front(Parser *, void *, asdl_seq *); -asdl_seq *seq_flatten(Parser *, asdl_seq *); -expr_ty join_names_with_dot(Parser *, expr_ty, expr_ty); -int seq_count_dots(asdl_seq *); -alias_ty alias_for_star(Parser *); -asdl_seq *map_names_to_ids(Parser *, asdl_seq *); -CmpopExprPair *cmpop_expr_pair(Parser *, cmpop_ty, expr_ty); -asdl_int_seq *get_cmpops(Parser *p, asdl_seq *); -asdl_seq *get_exprs(Parser *, asdl_seq *); -expr_ty set_expr_context(Parser *, expr_ty, expr_context_ty); -KeyValuePair *key_value_pair(Parser *, expr_ty, expr_ty); -asdl_seq *get_keys(Parser *, asdl_seq *); -asdl_seq *get_values(Parser *, asdl_seq *); -NameDefaultPair *name_default_pair(Parser *, arg_ty, expr_ty); -SlashWithDefault *slash_with_default(Parser *, asdl_seq *, asdl_seq *); -StarEtc *star_etc(Parser *, arg_ty, asdl_seq *, arg_ty); -arguments_ty make_arguments(Parser *, asdl_seq *, SlashWithDefault *, - asdl_seq *, asdl_seq *, StarEtc *); -arguments_ty empty_arguments(Parser *); -AugOperator *augoperator(Parser*, operator_ty type); -stmt_ty function_def_decorators(Parser *, asdl_seq *, stmt_ty); -stmt_ty class_def_decorators(Parser *, asdl_seq *, stmt_ty); -KeywordOrStarred *keyword_or_starred(Parser *, void *, int); -asdl_seq *seq_extract_starred_exprs(Parser *, asdl_seq *); -asdl_seq *seq_delete_starred_exprs(Parser *, asdl_seq *); -expr_ty concatenate_strings(Parser *p, asdl_seq *); - -#endif diff --git a/Tools/peg_generator/peg_parser/v38tokenizer.h b/Tools/peg_generator/peg_parser/v38tokenizer.h deleted file mode 100644 index 92669bfd8a1607..00000000000000 --- a/Tools/peg_generator/peg_parser/v38tokenizer.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef Py_TOKENIZER_H -#define Py_TOKENIZER_H -#ifdef __cplusplus -extern "C" { -#endif - -#include "object.h" - -/* Tokenizer interface */ - -#include "token.h" /* For token types */ - -#define MAXINDENT 100 /* Max indentation level */ -#define MAXLEVEL 200 /* Max parentheses level */ - -enum decoding_state { - STATE_INIT, - STATE_RAW, - STATE_NORMAL /* have a codec associated with input */ -}; - -/* Tokenizer state */ -struct tok_state { - /* Input state; buf <= cur <= inp <= end */ - /* NB an entire line is held in the buffer */ - char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ - char *cur; /* Next character in buffer */ - char *inp; /* End of data in buffer */ - char *end; /* End of input buffer if buf != NULL */ - char *start; /* Start of current token if not NULL */ - int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ - /* NB If done != E_OK, cur must be == inp!!! 
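(that is, once done records EOF or an error, the buffered line must have been fully consumed: cur has caught up with inp)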
*/ - FILE *fp; /* Rest of input; NULL if tokenizing a string */ - int tabsize; /* Tab spacing */ - int indent; /* Current indentation index */ - int indstack[MAXINDENT]; /* Stack of indents */ - int atbol; /* Nonzero if at begin of new line */ - int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ - const char *prompt, *nextprompt; /* For interactive prompting */ - int lineno; /* Current line number */ - int first_lineno; /* First line of a single line or multi line string - expression (cf. issue 16806) */ - int level; /* () [] {} Parentheses nesting level */ - /* Used to allow free continuations inside them */ - char parenstack[MAXLEVEL]; - int parenlinenostack[MAXLEVEL]; - PyObject *filename; - /* Stuff for checking on different tab sizes */ - int altindstack[MAXINDENT]; /* Stack of alternate indents */ - /* Stuff for PEP 0263 */ - enum decoding_state decoding_state; - int decoding_erred; /* whether erred in decoding */ - int read_coding_spec; /* whether 'coding:...' has been read */ - char *encoding; /* Source encoding. */ - int cont_line; /* whether we are in a continuation line. */ - const char* line_start; /* pointer to start of current line */ - const char* multi_line_start; /* pointer to start of first line of - a single line or multi line string - expression (cf. issue 16806) */ - PyObject *decoding_readline; /* open(...).readline */ - PyObject *decoding_buffer; - const char* enc; /* Encoding for the current str. */ - const char* str; - const char* input; /* Tokenizer's newline translated copy of the string. */ - - int type_comments; /* Whether to look for type comments */ - - /* async/await related fields (still needed depending on feature_version) */ - int async_hacks; /* =1 if async/await aren't always keywords */ - int async_def; /* =1 if tokens are inside an 'async def' body. */ - int async_def_indent; /* Indentation level of the outermost 'async def'. */ - int async_def_nl; /* =1 if the outermost 'async def' had at least one - NEWLINE token after it. 
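(together these fields implement the pre-3.7 contextual-keyword behaviour: while async_hacks is set, 'async' and 'await' only become the ASYNC/AWAIT keyword tokens inside an 'async def' body and tokenize as ordinary NAMEs everywhere else)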
*/ -}; - -extern struct tok_state *PyTokenizer_FromString(const char *, int); -extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); -extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, - const char *, const char *); -extern void PyTokenizer_Free(struct tok_state *); -extern int PyTokenizer_Get(struct tok_state *, char **, char **); - -#define tok_dump _Py_tok_dump - -#ifdef __cplusplus -} -#endif -#endif /* !Py_TOKENIZER_H */ diff --git a/Tools/peg_generator/pegen/build.py b/Tools/peg_generator/pegen/build.py index 09862e456a7a70..623b4aeb66069b 100644 --- a/Tools/peg_generator/pegen/build.py +++ b/Tools/peg_generator/pegen/build.py @@ -48,11 +48,19 @@ def compile_c_extension( Extension( extension_name, sources=[ - str(MOD_DIR.parent / "peg_parser" / "pegen.c"), - str(MOD_DIR.parent / "peg_parser" / "parse_string.c"), + str(MOD_DIR.parent.parent.parent / "Python" / "Python-ast.c"), + str(MOD_DIR.parent.parent.parent / "Python" / "asdl.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "pegen" / "pegen.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "pegen" / "parse_string.c"), + str(MOD_DIR.parent / "peg_extension" / "peg_extension.c"), generated_source_path, ], - include_dirs=[str(MOD_DIR.parent / "peg_parser")], + include_dirs=[ + str(MOD_DIR.parent.parent.parent / "Include" / "internal"), + str(MOD_DIR.parent.parent.parent / "Parser"), + str(MOD_DIR.parent.parent.parent / "Parser" / "pegen"), + ], extra_compile_args=extra_compile_args, ) ] diff --git a/Tools/peg_generator/pegen/c_generator.py b/Tools/peg_generator/pegen/c_generator.py index 3a8b402077ccdc..0f895c65c1f8bf 100644 --- a/Tools/peg_generator/pegen/c_generator.py +++ b/Tools/peg_generator/pegen/c_generator.py @@ -30,117 +30,15 @@ """ EXTENSION_SUFFIX = """ -PyObject * -_build_return_object(mod_ty module, int mode, PyObject *filename_ob, PyArena *arena) +void * +parse(Parser *p) { - if (mode == 2) { - return (PyObject *)PyAST_CompileObject(module, filename_ob, NULL, -1, arena); - } else if (mode == 1) { - return PyAST_mod2obj(module); - } else { - Py_INCREF(Py_None); - return Py_None; - } -} + // Initialize keywords + p->keywords = reserved_keywords; + p->n_keyword_lists = n_keyword_lists; -static PyObject * -parse_file(PyObject *self, PyObject *args, PyObject *kwds) -{ - static char *keywords[] = {"file", "mode", NULL}; - const char *filename; - int mode = %(mode)s; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|i", keywords, &filename, &mode)) { - return NULL; - } - if (mode < 0 || mode > %(mode)s) { - return PyErr_Format(PyExc_ValueError, "Bad mode, must be 0 <= mode <= %(mode)s"); - } - - PyArena *arena = PyArena_New(); - if (arena == NULL) { - return NULL; - } - - PyObject *result = NULL; - - PyObject *filename_ob = PyUnicode_FromString(filename); - if (filename_ob == NULL) { - goto error; - } - - mod_ty res = run_parser_from_file(filename, start_rule, filename_ob, arena); - if (res == NULL) { - goto error; - } - - result = _build_return_object(res, mode, filename_ob, arena); - -error: - Py_XDECREF(filename_ob); - PyArena_Free(arena); - return result; -} - -static PyObject * -parse_string(PyObject *self, PyObject *args, PyObject *kwds) -{ - static char *keywords[] = {"str", "mode", NULL}; - const char *the_string; - int mode = %(mode)s; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|i", keywords, &the_string, &mode)) { - return NULL; - } - if (mode < 0 || mode > %(mode)s) { - return PyErr_Format(PyExc_ValueError, "Bad 
mode, must be 0 <= mode <= %(mode)s"); - } - - PyArena *arena = PyArena_New(); - if (arena == NULL) { - return NULL; - } - - PyObject *result = NULL; - - PyObject *filename_ob = PyUnicode_FromString("<string>"); - if (filename_ob == NULL) { - goto error; - } - - mod_ty res = run_parser_from_string(the_string, start_rule, filename_ob, arena); - if (res == NULL) { - goto error; - } - result = _build_return_object(res, mode, filename_ob, arena); - -error: - Py_XDECREF(filename_ob); - PyArena_Free(arena); - return result; + return start_rule(p); } - -static PyMethodDef ParseMethods[] = { - {"parse_file", (PyCFunction)(void(*)(void))parse_file, METH_VARARGS|METH_KEYWORDS, "Parse a file."}, - {"parse_string", (PyCFunction)(void(*)(void))parse_string, METH_VARARGS|METH_KEYWORDS, "Parse a string."}, - {NULL, NULL, 0, NULL} /* Sentinel */ -}; - -static struct PyModuleDef parsemodule = { - PyModuleDef_HEAD_INIT, - .m_name = "%(modulename)s", - .m_doc = "A parser.", - .m_methods = ParseMethods, -}; - -PyMODINIT_FUNC -PyInit_%(modulename)s(void) -{ - PyObject *m = PyModule_Create(&parsemodule); - if (m == NULL) - return NULL; - return m; -} - -// The end """ diff --git a/Tools/peg_generator/pegen/testutil.py b/Tools/peg_generator/pegen/testutil.py index 9a4ddd9401fb34..3616effe6b4f9d 100644 --- a/Tools/peg_generator/pegen/testutil.py +++ b/Tools/peg_generator/pegen/testutil.py @@ -6,8 +6,7 @@ import textwrap import tokenize -from typing import Any, cast, Dict, IO, Type -from typing_extensions import Final +from typing import Any, cast, Dict, IO, Type, Final from pegen.build import compile_c_extension from pegen.c_generator import CParserGenerator diff --git a/Tools/peg_generator/pytest.ini b/Tools/peg_generator/pytest.ini deleted file mode 100644 index 7b533f7d6c7bb7..00000000000000 --- a/Tools/peg_generator/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -[pytest] -norecursedirs = - data/failset - cpython diff --git a/Tools/peg_generator/requirements-test.pip b/Tools/peg_generator/requirements-test.pip deleted file mode 100644 index 955c1c3e20f81e..00000000000000 --- a/Tools/peg_generator/requirements-test.pip +++ /dev/null @@ -1,6 +0,0 @@ --r requirements.pip -coveralls==1.8.2 -mypy==0.740 -pytest==5.2.2 -pytest-cov==2.8.1 -black==19.10b0 diff --git a/Tools/peg_generator/requirements.pip b/Tools/peg_generator/requirements.pip deleted file mode 100644 index ad2ed380036f72..00000000000000 --- a/Tools/peg_generator/requirements.pip +++ /dev/null @@ -1 +0,0 @@ -typing-extensions==3.7.4.1 diff --git a/Tools/peg_generator/test/test_ast_generation.py b/Tools/peg_generator/test/test_ast_generation.py deleted file mode 100644 index 49f22b5235de66..00000000000000 --- a/Tools/peg_generator/test/test_ast_generation.py +++ /dev/null @@ -1,666 +0,0 @@ -import ast -import os -from pathlib import PurePath -from typing import Any, Union, Iterable, Tuple -from textwrap import dedent - -import pytest # type: ignore - -from pegen.grammar_parser import GeneratedParser as GrammarParser -from pegen.testutil import parse_string, generate_parser_c_extension - -# fmt: off - -TEST_CASES = [ - ('annotated_assignment', 'x: int = 42'), - ('annotated_assignment_with_tuple', 'x: tuple = 1, 2'), - ('annotated_assignment_with_parens', '(paren): int = 3+2'), - ('annotated_assignment_with_yield', 'x: int = yield 42'), - ('annotated_no_assignment', 'x: int'), - ('annotation_with_multiple_parens', '((parens)): int'), - ('annotation_with_parens', '(parens): int'), - ('annotated_assignment_with_attr', 'a.b: int'), -
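    # Each entry is a (test_id, source) pair: prepare_test_cases() below turns
    # these into the pytest parametrize ids and sources, and cleanup_source()
    # dedents the triple-quoted multi-line snippets so they parse as
    # module-level code.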
('annotated_assignment_with_subscript', 'a[b]: int'), - ('annotated_assignment_with_attr_and_parens', '(a.b): int'), - ('annotated_assignment_with_subscript_and_parens', '(a[b]): int'), - ('assert', 'assert a'), - ('assert_message', 'assert a, b'), - ('assignment_false', 'a = False'), - ('assignment_none', 'a = None'), - ('assignment_true', 'a = True'), - ('assignment_paren', '(a) = 42'), - ('assignment_paren_multiple', '(a, b) = (0, 1)'), - ('asyncfor', - ''' - async for i in a: - pass - '''), - ('attribute_call', 'a.b()'), - ('attribute_multiple_names', 'abcd.efg.hij'), - ('attribute_simple', 'a.b'), - ('attributes_subscript', 'a.b[0]'), - ('augmented_assignment', 'x += 42'), - ('binop_add', '1 + 1'), - ('binop_add_multiple', '1 + 1 + 1 + 1'), - ('binop_all', '1 + 2 * 5 + 3 ** 2 - -3'), - ('binop_boolop_comp', '1 + 1 == 2 or 1 + 1 == 3 and not b'), - ('boolop_or', 'a or b'), - ('boolop_or_multiple', 'a or b or c'), - ('class_def_bases', - ''' - class C(A, B): - pass - '''), - ('class_def_decorators', - ''' - @a - class C: - pass - '''), - ('class_def_keywords', - ''' - class C(keyword=a+b, **c): - pass - '''), - ('class_def_mixed', - ''' - class C(A, B, keyword=0, **a): - pass - '''), - ('class_def_simple', - ''' - class C: - pass - '''), - ('class_def_starred_and_kwarg', - ''' - class C(A, B, *x, **y): - pass - '''), - ('class_def_starred_in_kwargs', - ''' - class C(A, x=2, *[B, C], y=3): - pass - '''), - ('call_attribute', 'f().b'), - ('call_genexp', 'f(i for i in a)'), - ('call_mixed_args', 'f(a, b, *c, **d)'), - ('call_mixed_args_named', 'f(a, b, *c, d=4, **v)'), - ('call_one_arg', 'f(a)'), - ('call_posarg_genexp', 'f(a, (i for i in a))'), - ('call_simple', 'f()'), - ('call_subscript', 'f()[0]'), - ('comp', 'a == b'), - ('comp_multiple', 'a == b == c'), - ('comp_paren_end', 'a == (b-1)'), - ('comp_paren_start', '(a-1) == b'), - ('decorator', - ''' - @a - def f(): - pass - '''), - ('decorator_async', - ''' - @a - async def d(): - pass - '''), - ('del_attribute', 'del a.b'), - ('del_call_attribute', 'del a().c'), - ('del_call_genexp_attribute', 'del a(i for i in b).c'), - ('del_empty', 'del()'), - ('del_list', 'del a, [b, c]'), - ('del_mixed', 'del a[0].b().c'), - ('del_multiple', 'del a, b'), - ('del_multiple_calls_attribute', 'del a()().b'), - ('del_paren', 'del(a,b)'), - ('del_paren_single_target', 'del(a)'), - ('del_subscript_attribute', 'del a[0].b'), - ('del_tuple', 'del a, (b, c)'), - ('delete', 'del a'), - ('dict', - ''' - { - a: 1, - b: 2, - c: 3 - } - '''), - ('dict_comp', '{x:1 for x in a}'), - ('dict_comp_if', '{x:1+2 for x in a if b}'), - ('dict_empty', '{}'), - ('for', - ''' - for i in a: - pass - '''), - ('for_else', - ''' - for i in a: - pass - else: - pass - '''), - ('for_star_target_in_paren', 'for (a) in b: pass'), - ('for_star_targets_attribute', 'for a.b in c: pass'), - ('for_star_targets_call_attribute', 'for a().c in b: pass'), - ('for_star_targets_empty', 'for () in a: pass'), - ('for_star_targets_mixed', 'for a[0].b().c in d: pass'), - ('for_star_targets_mixed_starred', - ''' - for a, *b, (c, d) in e: - pass - '''), - ('for_star_targets_multiple', 'for a, b in c: pass'), - ('for_star_targets_nested_starred', 'for *[*a] in b: pass'), - ('for_star_targets_starred', 'for *a in b: pass'), - ('for_star_targets_subscript_attribute', 'for a[0].b in c: pass'), - ('for_star_targets_trailing_comma', - ''' - for a, (b, c), in d: - pass - '''), - ('for_star_targets_tuple', 'for a, (b, c) in d: pass'), - ('for_underscore', - ''' - for _ in a: - pass - '''), - 
('function_return_type', - ''' - def f() -> Any: - pass - '''), - ('f-string_slice', "f'{x[2]}'"), - ('f-string_slice_upper', "f'{x[2:3]}'"), - ('f-string_slice_step', "f'{x[2:3:-2]}'"), - ('f-string_constant', "f'{42}'"), - ('f-string_boolop', "f'{x and y}'"), - ('f-string_named_expr', "f'{(x:=42)}'"), - ('f-string_binop', "f'{x+y}'"), - ('f-string_unaryop', "f'{not x}'"), - ('f-string_lambda', "f'{(lambda x, /, y, y2=42 , *z, k1, k2=34, **k3: 42)}'"), - ('f-string_lambda_call', "f'{(lambda: 2)(2)}'"), - ('f-string_ifexpr', "f'{x if y else z}'"), - ('f-string_dict', "f'{ {2:34, 3:34} }'"), - ('f-string_set', "f'{ {2,-45} }'"), - ('f-string_list', "f'{ [2,-45] }'"), - ('f-string_tuple', "f'{ (2,-45) }'"), - ('f-string_listcomp', "f'{[x for x in y if z]}'"), - ('f-string_setcomp', "f'{ {x for x in y if z} }'"), - ('f-string_dictcomp', "f'{ {x:x for x in y if z} }'"), - ('f-string_genexpr', "f'{ (x for x in y if z) }'"), - ('f-string_yield', "f'{ (yield x) }'"), - ('f-string_yieldfrom', "f'{ (yield from x) }'"), - ('f-string_await', "f'{ await x }'"), - ('f-string_compare', "f'{ x == y }'"), - ('f-string_call', "f'{ f(x,y,z) }'"), - ('f-string_attribute', "f'{ f.x.y.z }'"), - ('f-string_starred', "f'{ *x, }'"), - ('f-string_doublestarred', "f'{ {**x} }'"), - ('f-string_escape_brace', "f'{{Escape'"), - ('f-string_escape_closing_brace', "f'Escape}}'"), - ('f-string_repr', "f'{a!r}'"), - ('f-string_str', "f'{a!s}'"), - ('f-string_ascii', "f'{a!a}'"), - ('f-string_debug', "f'{a=}'"), - ('f-string_padding', "f'{a:03d}'"), - ('f-string_multiline', - """ - f''' - {hello} - ''' - """), - ('f-string_multiline_in_expr', - """ - f''' - { - hello - } - ''' - """), - ('f-string_multiline_in_call', - """ - f''' - {f( - a, b, c - )} - ''' - """), - ('global', 'global a, b'), - ('group', '(yield a)'), - ('if_elif', - ''' - if a: - pass - elif b: - pass - '''), - ('if_elif_elif', - ''' - if a: - pass - elif b: - pass - elif c: - pass - '''), - ('if_elif_else', - ''' - if a: - pass - elif b: - pass - else: - pass - '''), - ('if_else', - ''' - if a: - pass - else: - pass - '''), - ('if_simple', 'if a: pass'), - ('import', 'import a'), - ('import_alias', 'import a as b'), - ('import_dotted', 'import a.b'), - ('import_dotted_alias', 'import a.b as c'), - ('import_dotted_multichar', 'import ab.cd'), - ('import_from', 'from a import b'), - ('import_from_alias', 'from a import b as c'), - ('import_from_dotted', 'from a.b import c'), - ('import_from_dotted_alias', 'from a.b import c as d'), - ('import_from_multiple_aliases', 'from a import b as c, d as e'), - ('import_from_one_dot', 'from .a import b'), - ('import_from_one_dot_alias', 'from .a import b as c'), - ('import_from_star', 'from a import *'), - ('import_from_three_dots', 'from ...a import b'), - ('kwarg', - ''' - def f(**a): - pass - '''), - ('kwonly_args', - ''' - def f(*, a, b): - pass - '''), - ('kwonly_args_with_default', - ''' - def f(*, a=2, b): - pass - '''), - ('lambda_kwarg', 'lambda **a: 42'), - ('lambda_kwonly_args', 'lambda *, a, b: 42'), - ('lambda_kwonly_args_with_default', 'lambda *, a=2, b: 42'), - ('lambda_mixed_args', 'lambda a, /, b, *, c: 42'), - ('lambda_mixed_args_with_default', 'lambda a, b=2, /, c=3, *e, f, **g: 42'), - ('lambda_no_args', 'lambda: 42'), - ('lambda_pos_args', 'lambda a,b: 42'), - ('lambda_pos_args_with_default', 'lambda a, b=2: 42'), - ('lambda_pos_only_args', 'lambda a, /: 42'), - ('lambda_pos_only_args_with_default', 'lambda a=0, /: 42'), - ('lambda_pos_posonly_args', 'lambda a, b, /, c, d: 42'), - 
('lambda_pos_posonly_args_with_default', 'lambda a, b=0, /, c=2: 42'), - ('lambda_vararg', 'lambda *a: 42'), - ('lambda_vararg_kwonly_args', 'lambda *a, b: 42'), - ('list', '[1, 2, a]'), - ('list_comp', '[i for i in a]'), - ('list_comp_if', '[i for i in a if b]'), - ('list_trailing_comma', '[1+2, a, 3+4,]'), - ('mixed_args', - ''' - def f(a, /, b, *, c): - pass - '''), - ('mixed_args_with_default', - ''' - def f(a, b=2, /, c=3, *e, f, **g): - pass - '''), - ('multipart_string_bytes', 'b"Hola" b"Hello" b"Bye"'), - ('multipart_string_triple', '"""Something here""" "and now"'), - ('multipart_string_different_prefixes', 'u"Something" "Other thing" r"last thing"'), - ('multiple_assignments', 'x = y = z = 42'), - ('multiple_assignments_with_yield', 'x = y = z = yield 42'), - ('multiple_pass', - ''' - pass; pass - pass - '''), - ('namedexpr', '(x := [1, 2, 3])'), - ('namedexpr_false', '(x := False)'), - ('namedexpr_none', '(x := None)'), - ('namedexpr_true', '(x := True)'), - ('nonlocal', 'nonlocal a, b'), - ('number_complex', '-2.234+1j'), - ('number_float', '-34.2333'), - ('number_imaginary_literal', '1.1234j'), - ('number_integer', '-234'), - ('number_underscores', '1_234_567'), - ('pass', 'pass'), - ('pos_args', - ''' - def f(a, b): - pass - '''), - ('pos_args_with_default', - ''' - def f(a, b=2): - pass - '''), - ('pos_only_args', - ''' - def f(a, /): - pass - '''), - ('pos_only_args_with_default', - ''' - def f(a=0, /): - pass - '''), - ('pos_posonly_args', - ''' - def f(a, b, /, c, d): - pass - '''), - ('pos_posonly_args_with_default', - ''' - def f(a, b=0, /, c=2): - pass - '''), - ('primary_mixed', 'a.b.c().d[0]'), - ('raise', 'raise'), - ('raise_ellipsis', 'raise ...'), - ('raise_expr', 'raise a'), - ('raise_from', 'raise a from b'), - ('return', 'return'), - ('return_expr', 'return a'), - ('set', '{1, 2+4, 3+5}'), - ('set_comp', '{i for i in a}'), - ('set_trailing_comma', '{1, 2, 3,}'), - ('simple_assignment', 'x = 42'), - ('simple_assignment_with_yield', 'x = yield 42'), - ('string_bytes', 'b"hello"'), - ('string_concatenation_bytes', 'b"hello" b"world"'), - ('string_concatenation_simple', '"abcd" "efgh"'), - ('string_format_simple', 'f"hello"'), - ('string_format_with_formatted_value', 'f"hello {world}"'), - ('string_simple', '"hello"'), - ('string_unicode', 'u"hello"'), - ('subscript_attribute', 'a[0].b'), - ('subscript_call', 'a[b]()'), - ('subscript_multiple_slices', 'a[0:a:2, 1]'), - ('subscript_simple', 'a[0]'), - ('subscript_single_element_tuple', 'a[0,]'), - ('subscript_trailing_comma', 'a[0, 1, 2,]'), - ('subscript_tuple', 'a[0, 1, 2]'), - ('subscript_whole_slice', 'a[0+1:b:c]'), - ('try_except', - ''' - try: - pass - except: - pass - '''), - ('try_except_else', - ''' - try: - pass - except: - pass - else: - pass - '''), - ('try_except_else_finally', - ''' - try: - pass - except: - pass - else: - pass - finally: - pass - '''), - ('try_except_expr', - ''' - try: - pass - except a: - pass - '''), - ('try_except_expr_target', - ''' - try: - pass - except a as b: - pass - '''), - ('try_except_finally', - ''' - try: - pass - except: - pass - finally: - pass - '''), - ('try_finally', - ''' - try: - pass - finally: - pass - '''), - ('unpacking_binop', '[*([1, 2, 3] + [3, 4, 5])]'), - ('unpacking_call', '[*b()]'), - ('unpacking_compare', '[*(x < y)]'), - ('unpacking_constant', '[*3]'), - ('unpacking_dict', '[*{1: 2, 3: 4}]'), - ('unpacking_dict_comprehension', '[*{x:y for x,y in z}]'), - ('unpacking_ifexpr', '[*([1, 2, 3] if x else y)]'), - ('unpacking_list', '[*[1,2,3]]'), - 
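    # The unpacking_* cases wrap each expression form in [*...] so the parser's
    # starred-expression path sees every construct; some of them (e.g. '[*3]')
    # would fail at runtime but must still parse.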
('unpacking_list_comprehension', '[*[x for x in y]]'), - ('unpacking_namedexpr', '[*(x:=[1, 2, 3])]'), - ('unpacking_set', '[*{1,2,3}]'), - ('unpacking_set_comprehension', '[*{x for x in y}]'), - ('unpacking_string', '[*"myvalue"]'), - ('unpacking_tuple', '[*(1,2,3)]'), - ('unpacking_unaryop', '[*(not [1, 2, 3])]'), - ('unpacking_yield', '[*(yield 42)]'), - ('unpacking_yieldfrom', '[*(yield from x)]'), - ('tuple', '(1, 2, 3)'), - ('vararg', - ''' - def f(*a): - pass - '''), - ('vararg_kwonly_args', - ''' - def f(*a, b): - pass - '''), - ('while', - ''' - while a: - pass - '''), - ('while_else', - ''' - while a: - pass - else: - pass - '''), - ('with', - ''' - with a: - pass - '''), - ('with_as', - ''' - with a as b: - pass - '''), - ('with_as_paren', - ''' - with a as (b): - pass - '''), - ('with_as_empty', 'with a as (): pass'), - ('with_list_recursive', - ''' - with a as [x, [y, z]]: - pass - '''), - ('with_tuple_recursive', - ''' - with a as ((x, y), z): - pass - '''), - ('with_tuple_target', - ''' - with a as (x, y): - pass - '''), - ('yield', 'yield'), - ('yield_expr', 'yield a'), - ('yield_from', 'yield from a'), -] - -FAIL_TEST_CASES = [ - ("annotation_multiple_targets", "(a, b): int = 42"), - ("annotation_nested_tuple", "((a, b)): int"), - ("annotation_list", "[a]: int"), - ("annotation_lambda", "lambda: int = 42"), - ("annotation_tuple", "(a,): int"), - ("annotation_tuple_without_paren", "a,: int"), - ("assignment_keyword", "a = if"), - ("comprehension_lambda", "(a for a in lambda: b)"), - ("comprehension_else", "(a for a in b if c else d"), - ("del_call", "del a()"), - ("del_call_genexp", "del a(i for i in b)"), - ("del_subscript_call", "del a[b]()"), - ("del_attribute_call", "del a.b()"), - ("del_mixed_call", "del a[0].b().c.d()"), - ("for_star_targets_call", "for a() in b: pass"), - ("for_star_targets_subscript_call", "for a[b]() in c: pass"), - ("for_star_targets_attribute_call", "for a.b() in c: pass"), - ("for_star_targets_mixed_call", "for a[0].b().c.d() in e: pass"), - ("for_star_targets_in", "for a, in in b: pass"), - ("f-string_assignment", "f'{x = 42}'"), - ("f-string_empty", "f'{}'"), - ("f-string_function_def", "f'{def f(): pass}'"), - ("f-string_lambda", "f'{lambda x: 42}'"), - ("f-string_singe_brace", "f'{'"), - ("f-string_single_closing_brace", "f'}'"), - ("from_import_invalid", "from import import a"), - # This test case checks error paths involving tokens with uninitialized - # values of col_offset and end_col_offset. 
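    # (After dedenting, the snippet's last statement is indented one level too
    # far, so parsing raises an IndentationError, a SyntaxError subclass, which
    # is what the fail-case test asserts.)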
- ("invalid indentation", - """ - def f(): - a - a - """), -] - -GOOD_BUT_FAIL_TEST_CASES = [ - ('string_concatenation_format', 'f"{hello} world" f"again {and_again}"'), - ('string_concatenation_multiple', - ''' - f"hello" f"{world} again" f"and_again" - '''), - ('f-string_multiline_comp', - """ - f''' - {(i for i in a - if b)} - ''' - """), -] - -# fmt: on - - -def cleanup_source(source: Any) -> str: - if isinstance(source, str): - result = dedent(source) - elif not isinstance(source, (list, tuple)): - result = "\n".join(source) - else: - raise TypeError(f"Invalid type for test source: {source}") - return result - - -def prepare_test_cases( - test_cases: Iterable[Tuple[str, Union[str, Iterable[str]]]] -) -> Tuple[Iterable[str], Iterable[str]]: - - test_ids, _test_sources = zip(*test_cases) - test_sources = list(_test_sources) - for index, source in enumerate(test_sources): - result = cleanup_source(source) - test_sources[index] = result - return test_ids, test_sources - - -TEST_IDS, TEST_SOURCES = prepare_test_cases(TEST_CASES) - -GOOD_BUT_FAIL_TEST_IDS, GOOD_BUT_FAIL_SOURCES = prepare_test_cases(GOOD_BUT_FAIL_TEST_CASES) - -FAIL_TEST_IDS, FAIL_SOURCES = prepare_test_cases(FAIL_TEST_CASES) - - -def create_tmp_extension(tmp_path: PurePath) -> Any: - with open(os.path.join("data", "python.gram"), "r") as grammar_file: - grammar_source = grammar_file.read() - grammar = parse_string(grammar_source, GrammarParser) - extension = generate_parser_c_extension(grammar, tmp_path) - return extension - - -@pytest.fixture(scope="module") -def parser_extension(tmp_path_factory: Any) -> Any: - tmp_path = tmp_path_factory.mktemp("extension") - extension = create_tmp_extension(tmp_path) - return extension - - -@pytest.mark.parametrize("source", TEST_SOURCES, ids=TEST_IDS) -def test_correct_ast_generation_on_source_files(parser_extension: Any, source: str) -> None: - actual_ast = parser_extension.parse_string(source, mode=1) - expected_ast = ast.parse(source) - assert ast.dump(actual_ast, include_attributes=True) == ast.dump( - expected_ast, include_attributes=True - ), f"Wrong AST generation for source: {source}" - - -@pytest.mark.parametrize("source", FAIL_SOURCES, ids=FAIL_TEST_IDS) -def test_incorrect_ast_generation_on_source_files(parser_extension: Any, source: str) -> None: - with pytest.raises(SyntaxError): - parser_extension.parse_string(source, mode=0) - - -@pytest.mark.xfail -@pytest.mark.parametrize("source", GOOD_BUT_FAIL_SOURCES, ids=GOOD_BUT_FAIL_TEST_IDS) -def test_correct_but_known_to_fail_ast_generation_on_source_files( - parser_extension: Any, source: str -) -> None: - actual_ast = parser_extension.parse_string(source, mode=1) - expected_ast = ast.parse(source) - assert ast.dump(actual_ast, include_attributes=True) == ast.dump( - expected_ast, include_attributes=True - ), f"Wrong AST generation for source: {source}" - - -@pytest.mark.parametrize("source", GOOD_BUT_FAIL_SOURCES, ids=GOOD_BUT_FAIL_TEST_IDS) -def test_correct_ast_generation_without_pos_info(parser_extension: Any, source: str) -> None: - actual_ast = parser_extension.parse_string(source, mode=1) - expected_ast = ast.parse(source) - assert ast.dump(actual_ast) == ast.dump( - expected_ast - ), f"Wrong AST generation for source: {source}" diff --git a/Tools/peg_generator/test/test_c_parser.py b/Tools/peg_generator/test/test_c_parser.py deleted file mode 100644 index d8f2967d86164b..00000000000000 --- a/Tools/peg_generator/test/test_c_parser.py +++ /dev/null @@ -1,358 +0,0 @@ -import ast -from pathlib import PurePath -import 
textwrap -from typing import Optional, Sequence -import traceback - -import pytest # type: ignore - -from pegen.grammar_parser import GeneratedParser as GrammarParser -from pegen.testutil import parse_string, generate_parser_c_extension, generate_c_parser_source - - -def check_input_strings_for_grammar( - source: str, - tmp_path: PurePath, - valid_cases: Sequence[str] = (), - invalid_cases: Sequence[str] = (), -) -> None: - grammar = parse_string(source, GrammarParser) - extension = generate_parser_c_extension(grammar, tmp_path) - - if valid_cases: - for case in valid_cases: - extension.parse_string(case) - - if invalid_cases: - for case in invalid_cases: - with pytest.raises(SyntaxError): - extension.parse_string(case) - - -def verify_ast_generation(source: str, stmt: str, tmp_path: PurePath) -> None: - grammar = parse_string(source, GrammarParser) - extension = generate_parser_c_extension(grammar, tmp_path) - - expected_ast = ast.parse(stmt) - actual_ast = extension.parse_string(stmt) - assert ast.dump(expected_ast) == ast.dump(actual_ast) - - -def test_c_parser(tmp_path: PurePath) -> None: - grammar_source = """ - start[mod_ty]: a=stmt* $ { Module(a, NULL, p->arena) } - stmt[stmt_ty]: a=expr_stmt { a } - expr_stmt[stmt_ty]: a=expression NEWLINE { _Py_Expr(a, EXTRA) } - expression[expr_ty]: ( l=expression '+' r=term { _Py_BinOp(l, Add, r, EXTRA) } - | l=expression '-' r=term { _Py_BinOp(l, Sub, r, EXTRA) } - | t=term { t } - ) - term[expr_ty]: ( l=term '*' r=factor { _Py_BinOp(l, Mult, r, EXTRA) } - | l=term '/' r=factor { _Py_BinOp(l, Div, r, EXTRA) } - | f=factor { f } - ) - factor[expr_ty]: ('(' e=expression ')' { e } - | a=atom { a } - ) - atom[expr_ty]: ( n=NAME { n } - | n=NUMBER { n } - | s=STRING { s } - ) - """ - grammar = parse_string(grammar_source, GrammarParser) - extension = generate_parser_c_extension(grammar, tmp_path) - - expressions = [ - "4+5", - "4-5", - "4*5", - "1+4*5", - "1+4/5", - "(1+1) + (1+1)", - "(1+1) - (1+1)", - "(1+1) * (1+1)", - "(1+1) / (1+1)", - ] - - for expr in expressions: - the_ast = extension.parse_string(expr) - expected_ast = ast.parse(expr) - assert ast.dump(the_ast) == ast.dump(expected_ast) - - -def test_lookahead(tmp_path: PurePath) -> None: - grammar = """ - start: NAME &NAME expr NEWLINE? ENDMARKER - expr: NAME | NUMBER - """ - valid_cases = ["foo bar"] - invalid_cases = ["foo 34"] - check_input_strings_for_grammar(grammar, tmp_path, valid_cases, invalid_cases) - - -def test_negative_lookahead(tmp_path: PurePath) -> None: - grammar = """ - start: NAME !NAME expr NEWLINE? 
ENDMARKER - expr: NAME | NUMBER - """ - valid_cases = ["foo 34"] - invalid_cases = ["foo bar"] - check_input_strings_for_grammar(grammar, tmp_path, valid_cases, invalid_cases) - - -def test_cut(tmp_path: PurePath) -> None: - grammar = """ - start: X ~ Y Z | X Q S - X: 'x' - Y: 'y' - Z: 'z' - Q: 'q' - S: 's' - """ - valid_cases = ["x y z"] - invalid_cases = ["x q s"] - check_input_strings_for_grammar(grammar, tmp_path, valid_cases, invalid_cases) - - -def test_gather(tmp_path: PurePath) -> None: - grammar = """ - start: ';'.pass_stmt+ NEWLINE - pass_stmt: 'pass' - """ - valid_cases = ["pass", "pass; pass"] - invalid_cases = ["pass;", "pass; pass;"] - check_input_strings_for_grammar(grammar, tmp_path, valid_cases, invalid_cases) - - -def test_left_recursion(tmp_path: PurePath) -> None: - grammar = """ - start: expr NEWLINE - expr: ('-' term | expr '+' term | term) - term: NUMBER - """ - valid_cases = ["-34", "34", "34 + 12", "1 + 1 + 2 + 3"] - check_input_strings_for_grammar(grammar, tmp_path, valid_cases) - - -def test_advanced_left_recursive(tmp_path: PurePath) -> None: - grammar = """ - start: NUMBER | sign start - sign: ['-'] - """ - valid_cases = ["23", "-34"] - check_input_strings_for_grammar(grammar, tmp_path, valid_cases) - - -def test_mutually_left_recursive(tmp_path: PurePath) -> None: - grammar = """ - start: foo 'E' - foo: bar 'A' | 'B' - bar: foo 'C' | 'D' - """ - valid_cases = ["B E", "D A C A E"] - check_input_strings_for_grammar(grammar, tmp_path, valid_cases) - - -def test_nasty_mutually_left_recursive(tmp_path: PurePath) -> None: - grammar = """ - start: target '=' - target: maybe '+' | NAME - maybe: maybe '-' | target - """ - valid_cases = ["x ="] - invalid_cases = ["x - + ="] - check_input_strings_for_grammar(grammar, tmp_path, valid_cases, invalid_cases) - - -def test_return_stmt_noexpr_action(tmp_path: PurePath) -> None: - grammar = """ - start[mod_ty]: a=[statements] ENDMARKER { Module(a, NULL, p->arena) } - statements[asdl_seq*]: a=statement+ { a } - statement[stmt_ty]: simple_stmt - simple_stmt[stmt_ty]: small_stmt - small_stmt[stmt_ty]: return_stmt - return_stmt[stmt_ty]: a='return' NEWLINE { _Py_Return(NULL, EXTRA) } - """ - stmt = "return" - verify_ast_generation(grammar, stmt, tmp_path) - - -def test_gather_action_ast(tmp_path: PurePath) -> None: - grammar = """ - start[mod_ty]: a=';'.pass_stmt+ NEWLINE ENDMARKER { Module(a, NULL, p->arena) } - pass_stmt[stmt_ty]: a='pass' { _Py_Pass(EXTRA)} - """ - stmt = "pass; pass" - verify_ast_generation(grammar, stmt, tmp_path) - - -def test_pass_stmt_action(tmp_path: PurePath) -> None: - grammar = """ - start[mod_ty]: a=[statements] ENDMARKER { Module(a, NULL, p->arena) } - statements[asdl_seq*]: a=statement+ { a } - statement[stmt_ty]: simple_stmt - simple_stmt[stmt_ty]: small_stmt - small_stmt[stmt_ty]: pass_stmt - pass_stmt[stmt_ty]: a='pass' NEWLINE { _Py_Pass(EXTRA) } - """ - stmt = "pass" - verify_ast_generation(grammar, stmt, tmp_path) - - -def test_if_stmt_action(tmp_path: PurePath) -> None: - grammar = """ - start[mod_ty]: a=[statements] ENDMARKER { Module(a, NULL, p->arena) } - statements[asdl_seq*]: a=statement+ { seq_flatten(p, a) } - statement[asdl_seq*]: a=compound_stmt { singleton_seq(p, a) } | simple_stmt - - simple_stmt[asdl_seq*]: a=small_stmt b=further_small_stmt* [';'] NEWLINE { seq_insert_in_front(p, a, b) } - further_small_stmt[stmt_ty]: ';' a=small_stmt { a } - - block: simple_stmt | NEWLINE INDENT a=statements DEDENT { a } - - compound_stmt: if_stmt - - if_stmt: 'if' a=full_expression ':' b=block { 
_Py_If(a, b, NULL, EXTRA) } - - small_stmt[stmt_ty]: pass_stmt - - pass_stmt[stmt_ty]: a='pass' { _Py_Pass(EXTRA) } - - full_expression: NAME - """ - stmt = "pass" - verify_ast_generation(grammar, stmt, tmp_path) - - -@pytest.mark.parametrize("stmt", ["from a import b as c", "from . import a as b"]) -def test_same_name_different_types(stmt: str, tmp_path: PurePath) -> None: - grammar = """ - start[mod_ty]: a=import_from+ NEWLINE ENDMARKER { Module(a, NULL, p->arena)} - import_from[stmt_ty]: ( a='from' !'import' c=simple_name 'import' d=import_as_names_from { - _Py_ImportFrom(c->v.Name.id, d, 0, EXTRA) } - | a='from' '.' 'import' c=import_as_names_from { - _Py_ImportFrom(NULL, c, 1, EXTRA) } - ) - simple_name[expr_ty]: NAME - import_as_names_from[asdl_seq*]: a=','.import_as_name_from+ { a } - import_as_name_from[alias_ty]: a=NAME 'as' b=NAME { _Py_alias(((expr_ty) a)->v.Name.id, ((expr_ty) b)->v.Name.id, p->arena) } - """ - verify_ast_generation(grammar, stmt, tmp_path) - - -def test_with_stmt_with_paren(tmp_path: PurePath) -> None: - grammar_source = """ - start[mod_ty]: a=[statements] ENDMARKER { Module(a, NULL, p->arena) } - statements[asdl_seq*]: a=statement+ { seq_flatten(p, a) } - statement[asdl_seq*]: a=compound_stmt { singleton_seq(p, a) } - compound_stmt[stmt_ty]: with_stmt - with_stmt[stmt_ty]: ( - a='with' '(' b=','.with_item+ ')' ':' c=block { - _Py_With(b, singleton_seq(p, c), NULL, EXTRA) } - ) - with_item[withitem_ty]: ( - e=NAME o=['as' t=NAME { t }] { _Py_withitem(e, set_expr_context(p, o, Store), p->arena) } - ) - block[stmt_ty]: a=pass_stmt NEWLINE { a } | NEWLINE INDENT a=pass_stmt DEDENT { a } - pass_stmt[stmt_ty]: a='pass' { _Py_Pass(EXTRA) } - """ - stmt = "with (\n a as b,\n c as d\n): pass" - grammar = parse_string(grammar_source, GrammarParser) - extension = generate_parser_c_extension(grammar, tmp_path) - the_ast = extension.parse_string(stmt) - assert ast.dump(the_ast).startswith( - "Module(body=[With(items=[withitem(context_expr=Name(id='a', ctx=Load()), optional_vars=Name(id='b', ctx=Store())), " - "withitem(context_expr=Name(id='c', ctx=Load()), optional_vars=Name(id='d', ctx=Store()))]" - ) - - -def test_ternary_operator(tmp_path: PurePath) -> None: - grammar_source = """ - start[mod_ty]: a=expr ENDMARKER { Module(a, NULL, p->arena) } - expr[asdl_seq*]: a=listcomp NEWLINE { singleton_seq(p, _Py_Expr(a, EXTRA)) } - listcomp[expr_ty]: ( - a='[' b=NAME c=for_if_clauses d=']' { _Py_ListComp(b, c, EXTRA) } - ) - for_if_clauses[asdl_seq*]: ( - a=(y=[ASYNC] 'for' a=NAME 'in' b=NAME c=('if' z=NAME { z })* - { _Py_comprehension(_Py_Name(((expr_ty) a)->v.Name.id, Store, EXTRA), b, c, (y == NULL) ? 0 : 1, p->arena) })+ { a } - ) - """ - stmt = "[i for i in a if b]" - verify_ast_generation(grammar_source, stmt, tmp_path) - - -@pytest.mark.parametrize("text", ["a b 42 b a", "名 名 42 名 名"]) -def test_syntax_error_for_string(text: str, tmp_path: PurePath) -> None: - grammar_source = """ - start: expr+ NEWLINE? ENDMARKER - expr: NAME - """ - grammar = parse_string(grammar_source, GrammarParser) - extension = generate_parser_c_extension(grammar, tmp_path) - try: - extension.parse_string(text) - except SyntaxError as e: - tb = traceback.format_exc() - assert 'File "<string>", line 1' in tb - assert f"{text}\n ^" in tb - - -@pytest.mark.parametrize("text", ["a b 42 b a", "名 名 42 名 名"]) -def test_syntax_error_for_file(text: str, tmp_path: PurePath) -> None: - grammar_source = """ - start: expr+ NEWLINE?
ENDMARKER - expr: NAME - """ - grammar = parse_string(grammar_source, GrammarParser) - extension = generate_parser_c_extension(grammar, tmp_path) - the_file = tmp_path / "some_file.py" - with open(the_file, "w") as fd: - fd.write(text) - try: - extension.parse_file(str(the_file)) - except SyntaxError as e: - tb = traceback.format_exc() - assert 'some_file.py", line 1' in tb - assert f"{text}\n ^" in tb - - -def test_headers_and_trailer(tmp_path: PurePath) -> None: - grammar_source = """ - @header 'SOME HEADER' - @subheader 'SOME SUBHEADER' - @trailer 'SOME TRAILER' - start: expr+ NEWLINE? ENDMARKER - expr: x=NAME - """ - grammar = parse_string(grammar_source, GrammarParser) - parser_source = generate_c_parser_source(grammar) - - assert "SOME HEADER" in parser_source - assert "SOME SUBHEADER" in parser_source - assert "SOME TRAILER" in parser_source - - -def test_extension_name(tmp_path: PurePath) -> None: - grammar_source = """ - @modulename 'alternative_name' - start: expr+ NEWLINE? ENDMARKER - expr: x=NAME - """ - grammar = parse_string(grammar_source, GrammarParser) - parser_source = generate_c_parser_source(grammar) - - assert "PyInit_alternative_name" in parser_source - assert '.m_name = "alternative_name"' in parser_source - - -def test_error_in_rules(tmp_path: PurePath) -> None: - grammar_source = """ - start: expr+ NEWLINE? ENDMARKER - expr: NAME {PyTuple_New(-1)} - """ - grammar = parse_string(grammar_source, GrammarParser) - extension = generate_parser_c_extension(grammar, tmp_path) - # PyTuple_New raises SystemError if an invalid argument was passed. - with pytest.raises(SystemError): - extension.parse_string("a") diff --git a/Tools/peg_generator/test/test_first_sets.py b/Tools/peg_generator/test/test_first_sets.py deleted file mode 100644 index 2e0d72bbf2e2c4..00000000000000 --- a/Tools/peg_generator/test/test_first_sets.py +++ /dev/null @@ -1,240 +0,0 @@ -from typing import Set, Dict - -from pegen.first_sets import FirstSetCalculator -from pegen.grammar import Grammar -from pegen.grammar_parser import GeneratedParser as GrammarParser -from pegen.testutil import parse_string - - -def calculate_first_sets(grammar_source: str) -> Dict[str, Set[str]]: - grammar: Grammar = parse_string(grammar_source, GrammarParser) - return FirstSetCalculator(grammar.rules).calculate() - - -def test_alternatives() -> None: - grammar = """ - start: expr NEWLINE? ENDMARKER - expr: A | B - A: 'a' | '-' - B: 'b' | '+' - """ - assert calculate_first_sets(grammar) == { - "A": {"'a'", "'-'"}, - "B": {"'+'", "'b'"}, - "expr": {"'+'", "'a'", "'b'", "'-'"}, - "start": {"'+'", "'a'", "'b'", "'-'"}, - } - - -def test_optionals() -> None: - grammar = """ - start: expr NEWLINE - expr: ['a'] ['b'] 'c' - """ - assert calculate_first_sets(grammar) == { - "expr": {"'c'", "'a'", "'b'"}, - "start": {"'c'", "'a'", "'b'"}, - } - - -def test_repeat_with_separator() -> None: - grammar = """ - start: ','.thing+ NEWLINE - thing: NUMBER - """ - assert calculate_first_sets(grammar) == {"thing": {"NUMBER"}, "start": {"NUMBER"}} - - -def test_optional_operator() -> None: - grammar = """ - start: sum NEWLINE - sum: (term)? 'b' - term: NUMBER - """ - assert calculate_first_sets(grammar) == { - "term": {"NUMBER"}, - "sum": {"NUMBER", "'b'"}, - "start": {"'b'", "NUMBER"}, - } - - -def test_optional_literal() -> None: - grammar = """ - start: sum NEWLINE - sum: '+' ? 
-    term: NUMBER
-    """
-    assert calculate_first_sets(grammar) == {
-        "term": {"NUMBER"},
-        "sum": {"'+'", "NUMBER"},
-        "start": {"'+'", "NUMBER"},
-    }
-
-
-def test_optional_after() -> None:
-    grammar = """
-    start: term NEWLINE
-    term: NUMBER ['+']
-    """
-    assert calculate_first_sets(grammar) == {"term": {"NUMBER"}, "start": {"NUMBER"}}
-
-
-def test_optional_before() -> None:
-    grammar = """
-    start: term NEWLINE
-    term: ['+'] NUMBER
-    """
-    assert calculate_first_sets(grammar) == {"term": {"NUMBER", "'+'"}, "start": {"NUMBER", "'+'"}}
-
-
-def test_repeat_0() -> None:
-    grammar = """
-    start: thing* "+" NEWLINE
-    thing: NUMBER
-    """
-    assert calculate_first_sets(grammar) == {"thing": {"NUMBER"}, "start": {'"+"', "NUMBER"}}
-
-
-def test_repeat_0_with_group() -> None:
-    grammar = """
-    start: ('+' '-')* term NEWLINE
-    term: NUMBER
-    """
-    assert calculate_first_sets(grammar) == {"term": {"NUMBER"}, "start": {"'+'", "NUMBER"}}
-
-
-def test_repeat_1() -> None:
-    grammar = """
-    start: thing+ '-' NEWLINE
-    thing: NUMBER
-    """
-    assert calculate_first_sets(grammar) == {"thing": {"NUMBER"}, "start": {"NUMBER"}}
-
-
-def test_repeat_1_with_group() -> None:
-    grammar = """
-    start: ('+' term)+ term NEWLINE
-    term: NUMBER
-    """
-    assert calculate_first_sets(grammar) == {"term": {"NUMBER"}, "start": {"'+'"}}
-
-
-def test_gather() -> None:
-    grammar = """
-    start: ','.thing+ NEWLINE
-    thing: NUMBER
-    """
-    assert calculate_first_sets(grammar) == {"thing": {"NUMBER"}, "start": {"NUMBER"}}
-
-
-def test_positive_lookahead() -> None:
-    grammar = """
-    start: expr NEWLINE
-    expr: &'a' opt
-    opt: 'a' | 'b' | 'c'
-    """
-    assert calculate_first_sets(grammar) == {
-        "expr": {"'a'"},
-        "start": {"'a'"},
-        "opt": {"'b'", "'c'", "'a'"},
-    }
-
-
-def test_negative_lookahead() -> None:
-    grammar = """
-    start: expr NEWLINE
-    expr: !'a' opt
-    opt: 'a' | 'b' | 'c'
-    """
-    assert calculate_first_sets(grammar) == {
-        "opt": {"'b'", "'a'", "'c'"},
-        "expr": {"'b'", "'c'"},
-        "start": {"'b'", "'c'"},
-    }
-
-
-def test_left_recursion() -> None:
-    grammar = """
-    start: expr NEWLINE
-    expr: ('-' term | expr '+' term | term)
-    term: NUMBER
-    foo: 'foo'
-    bar: 'bar'
-    baz: 'baz'
-    """
-    assert calculate_first_sets(grammar) == {
-        "expr": {"NUMBER", "'-'"},
-        "term": {"NUMBER"},
-        "start": {"NUMBER", "'-'"},
-        "foo": {"'foo'"},
-        "bar": {"'bar'"},
-        "baz": {"'baz'"},
-    }
-
-
-def test_advance_left_recursion() -> None:
-    grammar = """
-    start: NUMBER | sign start
-    sign: ['-']
-    """
-    assert calculate_first_sets(grammar) == {"sign": {"'-'", ""}, "start": {"'-'", "NUMBER"}}
-
-
-def test_mutual_left_recursion() -> None:
-    grammar = """
-    start: foo 'E'
-    foo: bar 'A' | 'B'
-    bar: foo 'C' | 'D'
-    """
-    assert calculate_first_sets(grammar) == {
-        "foo": {"'D'", "'B'"},
-        "bar": {"'D'"},
-        "start": {"'D'", "'B'"},
-    }
-
-
-def test_nasty_left_recursion() -> None:
-    # TODO: Validate this
-    grammar = """
-    start: target '='
-    target: maybe '+' | NAME
-    maybe: maybe '-' | target
-    """
-    assert calculate_first_sets(grammar) == {"maybe": set(), "target": {"NAME"}, "start": {"NAME"}}
-
-
-def test_nullable_rule() -> None:
-    grammar = """
-    start: sign thing $
-    sign: ['-']
-    thing: NUMBER
-    """
-    assert calculate_first_sets(grammar) == {
-        "sign": {"", "'-'"},
-        "thing": {"NUMBER"},
-        "start": {"NUMBER", "'-'"},
-    }
-
-
-def test_epsilon_production_in_start_rule() -> None:
-    grammar = """
-    start: ['-'] $
-    """
-    assert calculate_first_sets(grammar) == {"start": {"ENDMARKER", "'-'"}}
-
-
-def test_multiple_nullable_rules() -> None:
-    grammar = """
-    start: sign thing other another $
-    sign: ['-']
-    thing: ['+']
-    other: '*'
-    another: '/'
-    """
-    assert calculate_first_sets(grammar) == {
-        "sign": {"", "'-'"},
-        "thing": {"'+'", ""},
-        "start": {"'+'", "'-'", "'*'"},
-        "other": {"'*'"},
-        "another": {"'/'"},
-    }
diff --git a/Tools/peg_generator/test/test_pegen.py b/Tools/peg_generator/test/test_pegen.py
deleted file mode 100644
index dca038447aefc7..00000000000000
--- a/Tools/peg_generator/test/test_pegen.py
+++ /dev/null
@@ -1,749 +0,0 @@
-import io
-import textwrap
-
-from tokenize import TokenInfo, NAME, NEWLINE, NUMBER, OP
-
-from typing import Any, Dict, List, Type
-
-import pytest  # type: ignore
-
-from pegen.grammar_parser import GeneratedParser as GrammarParser
-from pegen.grammar import GrammarVisitor, GrammarError, Grammar
-from pegen.grammar_visualizer import ASTGrammarPrinter
-from pegen.parser import Parser
-from pegen.python_generator import PythonParserGenerator
-
-from pegen.testutil import generate_parser, parse_string, make_parser
-
-
-def test_parse_grammar() -> None:
-    grammar_source = """
-    start: sum NEWLINE
-    sum: t1=term '+' t2=term { action } | term
-    term: NUMBER
-    """
-    expected = """
-    start: sum NEWLINE
-    sum: term '+' term | term
-    term: NUMBER
-    """
-    grammar: Grammar = parse_string(grammar_source, GrammarParser)
-    rules = grammar.rules
-    assert str(grammar) == textwrap.dedent(expected).strip()
-    # Check the str() and repr() of a few rules; AST nodes don't support ==.
-    assert str(rules["start"]) == "start: sum NEWLINE"
-    assert str(rules["sum"]) == "sum: term '+' term | term"
-    expected_repr = "Rule('term', None, Rhs([Alt([NamedItem(None, NameLeaf('NUMBER'))])]))"
-    assert repr(rules["term"]) == expected_repr
-
-
-def test_long_rule_str() -> None:
-    grammar_source = """
-    start: zero | one | one zero | one one | one zero zero | one zero one | one one zero | one one one
-    """
-    expected = """
-    start:
-        | zero
-        | one
-        | one zero
-        | one one
-        | one zero zero
-        | one zero one
-        | one one zero
-        | one one one
-    """
-    grammar: Grammar = parse_string(grammar_source, GrammarParser)
-    assert str(grammar.rules["start"]) == textwrap.dedent(expected).strip()
-
-
-def test_typed_rules() -> None:
-    grammar = """
-    start[int]: sum NEWLINE
-    sum[int]: t1=term '+' t2=term { action } | term
-    term[int]: NUMBER
-    """
-    rules = parse_string(grammar, GrammarParser).rules
-    # Check the str() and repr() of a few rules; AST nodes don't support ==.
-    assert str(rules["start"]) == "start: sum NEWLINE"
-    assert str(rules["sum"]) == "sum: term '+' term | term"
-    assert (
-        repr(rules["term"])
-        == "Rule('term', 'int', Rhs([Alt([NamedItem(None, NameLeaf('NUMBER'))])]))"
-    )
-
-
-def test_repeat_with_separator_rules() -> None:
-    grammar = """
-    start: ','.thing+ NEWLINE
-    thing: NUMBER
-    """
-    rules = parse_string(grammar, GrammarParser).rules
-    assert str(rules["start"]) == "start: ','.thing+ NEWLINE"
-    print(repr(rules["start"]))
-    assert repr(rules["start"]).startswith(
-        "Rule('start', None, Rhs([Alt([NamedItem(None, Gather(StringLeaf(\"','\"), NameLeaf('thing'"
-    )
-    assert str(rules["thing"]) == "thing: NUMBER"
-
-
-def test_expr_grammar() -> None:
-    grammar = """
-    start: sum NEWLINE
-    sum: term '+' term | term
-    term: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("42\n", parser_class)
-    assert node == [
-        [[TokenInfo(NUMBER, string="42", start=(1, 0), end=(1, 2), line="42\n")]],
-        TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line="42\n"),
-    ]
-
-
-def test_optional_operator() -> None:
-    grammar = """
-    start: sum NEWLINE
-    sum: term ('+' term)?
-    term: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("1+2\n", parser_class)
-    assert node == [
-        [
-            [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1+2\n")],
-            [
-                TokenInfo(OP, string="+", start=(1, 1), end=(1, 2), line="1+2\n"),
-                [TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line="1+2\n")],
-            ],
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 3), end=(1, 4), line="1+2\n"),
-    ]
-    node = parse_string("1\n", parser_class)
-    assert node == [
-        [[TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1\n")], None],
-        TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line="1\n"),
-    ]
-
-
-def test_optional_literal() -> None:
-    grammar = """
-    start: sum NEWLINE
-    sum: term '+' ?
-    term: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("1+\n", parser_class)
-    assert node == [
-        [
-            [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1+\n")],
-            TokenInfo(OP, string="+", start=(1, 1), end=(1, 2), line="1+\n"),
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line="1+\n"),
-    ]
-    node = parse_string("1\n", parser_class)
-    assert node == [
-        [[TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1\n")], None],
-        TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line="1\n"),
-    ]
-
-
-def test_alt_optional_operator() -> None:
-    grammar = """
-    start: sum NEWLINE
-    sum: term ['+' term]
-    term: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("1 + 2\n", parser_class)
-    assert node == [
-        [
-            [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 + 2\n")],
-            [
-                TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line="1 + 2\n"),
-                [TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line="1 + 2\n")],
-            ],
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line="1 + 2\n"),
-    ]
-    node = parse_string("1\n", parser_class)
-    assert node == [
-        [[TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1\n")], None],
-        TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line="1\n"),
-    ]
-
-
-def test_repeat_0_simple() -> None:
-    grammar = """
-    start: thing thing* NEWLINE
-    thing: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("1 2 3\n", parser_class)
-    assert node == [
-        [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 2 3\n")],
-        [
-            [[TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line="1 2 3\n")]],
-            [[TokenInfo(NUMBER, string="3", start=(1, 4), end=(1, 5), line="1 2 3\n")]],
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line="1 2 3\n"),
-    ]
-    node = parse_string("1\n", parser_class)
-    assert node == [
-        [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1\n")],
-        [],
-        TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line="1\n"),
-    ]
-
-
-def test_repeat_0_complex() -> None:
-    grammar = """
-    start: term ('+' term)* NEWLINE
-    term: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("1 + 2 + 3\n", parser_class)
-    assert node == [
-        [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 + 2 + 3\n")],
-        [
-            [
-                [
-                    TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line="1 + 2 + 3\n"),
-                    [TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line="1 + 2 + 3\n")],
-                ]
-            ],
-            [
-                [
-                    TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line="1 + 2 + 3\n"),
-                    [TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line="1 + 2 + 3\n")],
-                ]
-            ],
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line="1 + 2 + 3\n"),
-    ]
-
-
-def test_repeat_1_simple() -> None:
-    grammar = """
-    start: thing thing+ NEWLINE
-    thing: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("1 2 3\n", parser_class)
-    assert node == [
-        [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 2 3\n")],
-        [
-            [[TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line="1 2 3\n")]],
-            [[TokenInfo(NUMBER, string="3", start=(1, 4), end=(1, 5), line="1 2 3\n")]],
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line="1 2 3\n"),
-    ]
-    with pytest.raises(SyntaxError):
-        parse_string("1\n", parser_class)
-
-
-def test_repeat_1_complex() -> None:
-    grammar = """
-    start: term ('+' term)+ NEWLINE
-    term: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("1 + 2 + 3\n", parser_class)
-    assert node == [
-        [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 + 2 + 3\n")],
-        [
-            [
-                [
-                    TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line="1 + 2 + 3\n"),
-                    [TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line="1 + 2 + 3\n")],
-                ]
-            ],
-            [
-                [
-                    TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line="1 + 2 + 3\n"),
-                    [TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line="1 + 2 + 3\n")],
-                ]
-            ],
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line="1 + 2 + 3\n"),
-    ]
-    with pytest.raises(SyntaxError):
-        parse_string("1\n", parser_class)
-
-
-def test_repeat_with_sep_simple() -> None:
-    grammar = """
-    start: ','.thing+ NEWLINE
-    thing: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("1, 2, 3\n", parser_class)
-    assert node == [
-        [
-            [TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1, 2, 3\n")],
-            [TokenInfo(NUMBER, string="2", start=(1, 3), end=(1, 4), line="1, 2, 3\n")],
-            [TokenInfo(NUMBER, string="3", start=(1, 6), end=(1, 7), line="1, 2, 3\n")],
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 7), end=(1, 8), line="1, 2, 3\n"),
-    ]
-
-
-def test_left_recursive() -> None:
-    grammar_source = """
-    start: expr NEWLINE
-    expr: ('-' term | expr '+' term | term)
-    term: NUMBER
-    foo: NAME+
-    bar: NAME*
-    baz: NAME?
-    """
-    grammar: Grammar = parse_string(grammar_source, GrammarParser)
-    parser_class = generate_parser(grammar)
-    rules = grammar.rules
-    assert not rules["start"].left_recursive
-    assert rules["expr"].left_recursive
-    assert not rules["term"].left_recursive
-    assert not rules["foo"].left_recursive
-    assert not rules["bar"].left_recursive
-    assert not rules["baz"].left_recursive
-    node = parse_string("1 + 2 + 3\n", parser_class)
-    assert node == [
-        [
-            [
-                [[TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line="1 + 2 + 3\n")]],
-                TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line="1 + 2 + 3\n"),
-                [TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line="1 + 2 + 3\n")],
-            ],
-            TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line="1 + 2 + 3\n"),
-            [TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line="1 + 2 + 3\n")],
-        ],
-        TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line="1 + 2 + 3\n"),
-    ]
-
-
-def test_python_expr() -> None:
-    grammar = """
-    start: expr NEWLINE? $ { ast.Expression(expr, lineno=1, col_offset=0) }
-    expr: ( expr '+' term { ast.BinOp(expr, ast.Add(), term, lineno=expr.lineno, col_offset=expr.col_offset, end_lineno=term.end_lineno, end_col_offset=term.end_col_offset) }
-          | expr '-' term { ast.BinOp(expr, ast.Sub(), term, lineno=expr.lineno, col_offset=expr.col_offset, end_lineno=term.end_lineno, end_col_offset=term.end_col_offset) }
-          | term { term }
-          )
-    term: ( l=term '*' r=factor { ast.BinOp(l, ast.Mult(), r, lineno=l.lineno, col_offset=l.col_offset, end_lineno=r.end_lineno, end_col_offset=r.end_col_offset) }
-          | l=term '/' r=factor { ast.BinOp(l, ast.Div(), r, lineno=l.lineno, col_offset=l.col_offset, end_lineno=r.end_lineno, end_col_offset=r.end_col_offset) }
-          | factor { factor }
-          )
-    factor: ( '(' expr ')' { expr }
-            | atom { atom }
-            )
-    atom: ( n=NAME { ast.Name(id=n.string, ctx=ast.Load(), lineno=n.start[0], col_offset=n.start[1], end_lineno=n.end[0], end_col_offset=n.end[1]) }
-          | n=NUMBER { ast.Constant(value=ast.literal_eval(n.string), lineno=n.start[0], col_offset=n.start[1], end_lineno=n.end[0], end_col_offset=n.end[1]) }
-          )
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("(1 + 2*3 + 5)/(6 - 2)\n", parser_class)
-    code = compile(node, "", "eval")
-    val = eval(code)
-    assert val == 3.0
-
-
-def test_nullable() -> None:
-    grammar_source = """
-    start: sign NUMBER
-    sign: ['-' | '+']
-    """
-    grammar: Grammar = parse_string(grammar_source, GrammarParser)
-    out = io.StringIO()
-    genr = PythonParserGenerator(grammar, out)
-    rules = grammar.rules
-    assert rules["start"].nullable is False  # Not None!
-    assert rules["sign"].nullable
-
-
-def test_advanced_left_recursive() -> None:
-    grammar_source = """
-    start: NUMBER | sign start
-    sign: ['-']
-    """
-    grammar: Grammar = parse_string(grammar_source, GrammarParser)
-    out = io.StringIO()
-    genr = PythonParserGenerator(grammar, out)
-    rules = grammar.rules
-    assert rules["start"].nullable is False  # Not None!
-    assert rules["sign"].nullable
-    assert rules["start"].left_recursive
-    assert not rules["sign"].left_recursive
-
-
-def test_mutually_left_recursive() -> None:
-    grammar_source = """
-    start: foo 'E'
-    foo: bar 'A' | 'B'
-    bar: foo 'C' | 'D'
-    """
-    grammar: Grammar = parse_string(grammar_source, GrammarParser)
-    out = io.StringIO()
-    genr = PythonParserGenerator(grammar, out)
-    rules = grammar.rules
-    assert not rules["start"].left_recursive
-    assert rules["foo"].left_recursive
-    assert rules["bar"].left_recursive
-    genr.generate("<string>")
-    ns: Dict[str, Any] = {}
-    exec(out.getvalue(), ns)
-    parser_class: Type[Parser] = ns["GeneratedParser"]
-    node = parse_string("D A C A E", parser_class)
-    assert node == [
-        [
-            [
-                [
-                    [TokenInfo(type=NAME, string="D", start=(1, 0), end=(1, 1), line="D A C A E")],
-                    TokenInfo(type=NAME, string="A", start=(1, 2), end=(1, 3), line="D A C A E"),
-                ],
-                TokenInfo(type=NAME, string="C", start=(1, 4), end=(1, 5), line="D A C A E"),
-            ],
-            TokenInfo(type=NAME, string="A", start=(1, 6), end=(1, 7), line="D A C A E"),
-        ],
-        TokenInfo(type=NAME, string="E", start=(1, 8), end=(1, 9), line="D A C A E"),
-    ]
-    node = parse_string("B C A E", parser_class)
-    assert node != None
-    assert node == [
-        [
-            [
-                [TokenInfo(type=NAME, string="B", start=(1, 0), end=(1, 1), line="B C A E")],
-                TokenInfo(type=NAME, string="C", start=(1, 2), end=(1, 3), line="B C A E"),
-            ],
-            TokenInfo(type=NAME, string="A", start=(1, 4), end=(1, 5), line="B C A E"),
-        ],
-        TokenInfo(type=NAME, string="E", start=(1, 6), end=(1, 7), line="B C A E"),
-    ]
-
-
-def test_nasty_mutually_left_recursive() -> None:
-    # This grammar does not recognize 'x - + =', much to my chagrin.
-    # But that's the way PEG works.
-    # [Breathlessly]
-    # The problem is that the toplevel target call
-    # recurses into maybe, which recognizes 'x - +',
-    # and then the toplevel target looks for another '+',
-    # which fails, so it retreats to NAME,
-    # which succeeds, so we end up just recognizing 'x',
-    # and then start fails because there's no '=' after that.
-    grammar_source = """
-    start: target '='
-    target: maybe '+' | NAME
-    maybe: maybe '-' | target
-    """
-    grammar: Grammar = parse_string(grammar_source, GrammarParser)
-    out = io.StringIO()
-    genr = PythonParserGenerator(grammar, out)
-    genr.generate("<string>")
-    ns: Dict[str, Any] = {}
-    exec(out.getvalue(), ns)
-    parser_class = ns["GeneratedParser"]
-    with pytest.raises(SyntaxError):
-        parse_string("x - + =", parser_class)
-
-
-def test_lookahead() -> None:
-    grammar = """
-    start: (expr_stmt | assign_stmt) &'.'
-    expr_stmt: !(target '=') expr
-    assign_stmt: target '=' expr
-    expr: term ('+' term)*
-    target: NAME
-    term: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("foo = 12 + 12 .", parser_class)
-    assert node == [
-        [
-            [
-                [TokenInfo(NAME, string="foo", start=(1, 0), end=(1, 3), line="foo = 12 + 12 .")],
-                TokenInfo(OP, string="=", start=(1, 4), end=(1, 5), line="foo = 12 + 12 ."),
-                [
-                    [
-                        TokenInfo(
-                            NUMBER, string="12", start=(1, 6), end=(1, 8), line="foo = 12 + 12 ."
-                        )
-                    ],
-                    [
-                        [
-                            [
-                                TokenInfo(
-                                    OP,
-                                    string="+",
-                                    start=(1, 9),
-                                    end=(1, 10),
-                                    line="foo = 12 + 12 .",
-                                ),
-                                [
-                                    TokenInfo(
-                                        NUMBER,
-                                        string="12",
-                                        start=(1, 11),
-                                        end=(1, 13),
-                                        line="foo = 12 + 12 .",
-                                    )
-                                ],
-                            ]
-                        ]
-                    ],
-                ],
-            ]
-        ]
-    ]
-
-
-def test_named_lookahead_error() -> None:
-    grammar = """
-    start: foo=!'x' NAME
-    """
-    with pytest.raises(SyntaxError):
-        make_parser(grammar)
-
-
-def test_start_leader() -> None:
-    grammar = """
-    start: attr | NAME
-    attr: start '.' NAME
-    """
-    # Would assert False without a special case in compute_left_recursives().
-    make_parser(grammar)
-
-
-def test_left_recursion_too_complex() -> None:
-    grammar = """
-    start: foo
-    foo: bar '+' | baz '+' | '+'
-    bar: baz '-' | foo '-' | '-'
-    baz: foo '*' | bar '*' | '*'
-    """
-    with pytest.raises(ValueError) as errinfo:
-        make_parser(grammar)
-    assert "no leader" in str(errinfo.value)
-
-
-def test_cut() -> None:
-    grammar = """
-    start: '(' ~ expr ')'
-    expr: NUMBER
-    """
-    parser_class = make_parser(grammar)
-    node = parse_string("(1)", parser_class, verbose=True)
-    assert node == [
-        TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line="(1)"),
-        [TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line="(1)")],
-        TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line="(1)"),
-    ]
-
-
-def test_dangling_reference() -> None:
-    grammar = """
-    start: foo ENDMARKER
-    foo: bar NAME
-    """
-    with pytest.raises(GrammarError):
-        parser_class = make_parser(grammar)
-
-
-def test_bad_token_reference() -> None:
-    grammar = """
-    start: foo
-    foo: NAMEE
-    """
-    with pytest.raises(GrammarError):
-        parser_class = make_parser(grammar)
-
-
-def test_missing_start() -> None:
-    grammar = """
-    foo: NAME
-    """
-    with pytest.raises(GrammarError):
-        parser_class = make_parser(grammar)
-
-
-class TestGrammarVisitor:
-    class Visitor(GrammarVisitor):
-        def __init__(self) -> None:
-            self.n_nodes = 0
-
-        def visit(self, node: Any, *args: Any, **kwargs: Any) -> None:
-            self.n_nodes += 1
-            super().visit(node, *args, **kwargs)
-
-    def test_parse_trivial_grammar(self) -> None:
-        grammar = """
-        start: 'a'
-        """
-        rules = parse_string(grammar, GrammarParser)
-        visitor = self.Visitor()
-
-        visitor.visit(rules)
-
-        assert visitor.n_nodes == 6
-
-    def test_parse_or_grammar(self) -> None:
-        grammar = """
-        start: rule
-        rule: 'a' | 'b'
-        """
-        rules = parse_string(grammar, GrammarParser)
-        visitor = self.Visitor()
-
-        visitor.visit(rules)
-
-        # Grammar/Rule/Rhs/Alt/NamedItem/NameLeaf   -> 6
-        #         Rule/Rhs/                         -> 2
-        #                  Alt/NamedItem/StringLeaf -> 3
-        #                  Alt/NamedItem/StringLeaf -> 3
-
-        assert visitor.n_nodes == 14
-
-    def test_parse_repeat1_grammar(self) -> None:
-        grammar = """
-        start: 'a'+
-        """
-        rules = parse_string(grammar, GrammarParser)
-        visitor = self.Visitor()
-
-        visitor.visit(rules)
-
-        # Grammar/Rule/Rhs/Alt/NamedItem/Repeat1/StringLeaf -> 6
-        assert visitor.n_nodes == 7
-
-    def test_parse_repeat0_grammar(self) -> None:
-        grammar = """
-        start: 'a'*
-        """
-        rules = parse_string(grammar, GrammarParser)
-        visitor = self.Visitor()
-
-        visitor.visit(rules)
-
-        # Grammar/Rule/Rhs/Alt/NamedItem/Repeat0/StringLeaf -> 6
-
-        assert visitor.n_nodes == 7
-
-    def test_parse_optional_grammar(self) -> None:
-        grammar = """
-        start: 'a' ['b']
-        """
-        rules = parse_string(grammar, GrammarParser)
-        visitor = self.Visitor()
-
-        visitor.visit(rules)
-
-        # Grammar/Rule/Rhs/Alt/NamedItem/StringLeaf -> 6
-        #         NamedItem/Opt/Rhs/Alt/NamedItem/Stringleaf -> 6
-
-        assert visitor.n_nodes == 12
-
-
-class TestGrammarVisualizer:
-    def test_simple_rule(self) -> None:
-        grammar = """
-        start: 'a' 'b'
-        """
-        rules = parse_string(grammar, GrammarParser)
-
-        printer = ASTGrammarPrinter()
-        lines: List[str] = []
-        printer.print_grammar_ast(rules, printer=lines.append)
-
-        output = "\n".join(lines)
-        expected_output = textwrap.dedent(
-            """\
-        └──Rule
-           └──Rhs
-              └──Alt
-                 ├──NamedItem
-                 │  └──StringLeaf("'a'")
-                 └──NamedItem
-                    └──StringLeaf("'b'")
-        """
-        )
-
-        assert output == expected_output
-
-    def test_multiple_rules(self) -> None:
-        grammar = """
-        start: a b
-        a: 'a'
-        b: 'b'
-        """
-        rules = parse_string(grammar, GrammarParser)
-
-        printer = ASTGrammarPrinter()
-        lines: List[str] = []
-        printer.print_grammar_ast(rules, printer=lines.append)
-
-        output = "\n".join(lines)
-        expected_output = textwrap.dedent(
-            """\
-        └──Rule
-           └──Rhs
-              └──Alt
-                 ├──NamedItem
-                 │  └──NameLeaf('a')
-                 └──NamedItem
-                    └──NameLeaf('b')
-
-        └──Rule
-           └──Rhs
-              └──Alt
-                 └──NamedItem
-                    └──StringLeaf("'a'")
-
-        └──Rule
-           └──Rhs
-              └──Alt
-                 └──NamedItem
-                    └──StringLeaf("'b'")
-        """
-        )
-
-        assert output == expected_output
-
-    def test_deep_nested_rule(self) -> None:
-        grammar = """
-        start: 'a' ['b'['c'['d']]]
-        """
-        rules = parse_string(grammar, GrammarParser)
-
-        printer = ASTGrammarPrinter()
-        lines: List[str] = []
-        printer.print_grammar_ast(rules, printer=lines.append)
-
-        output = "\n".join(lines)
-        print()
-        print(output)
-        expected_output = textwrap.dedent(
-            """\
-        └──Rule
-           └──Rhs
-              └──Alt
-                 ├──NamedItem
-                 │  └──StringLeaf("'a'")
-                 └──NamedItem
-                    └──Opt
-                       └──Rhs
-                          └──Alt
-                             ├──NamedItem
-                             │  └──StringLeaf("'b'")
-                             └──NamedItem
-                                └──Opt
-                                   └──Rhs
-                                      └──Alt
-                                         ├──NamedItem
-                                         │  └──StringLeaf("'c'")
-                                         └──NamedItem
-                                            └──Opt
-                                               └──Rhs
-                                                  └──Alt
-                                                     └──NamedItem
-                                                        └──StringLeaf("'d'")
-        """
-        )
-
-        assert output == expected_output
diff --git a/Tools/peg_generator/test/test_tracebacks.py b/Tools/peg_generator/test/test_tracebacks.py
deleted file mode 100644
index f5cc6f1c220d65..00000000000000
--- a/Tools/peg_generator/test/test_tracebacks.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import os
-from pathlib import PurePath
-from typing import Any, Dict, Tuple
-from textwrap import dedent
-
-import pytest  # type: ignore
-
-from pegen.grammar_parser import GeneratedParser as GrammarParser
-from pegen.testutil import parse_string, generate_parser_c_extension
-
-# fmt: off
-
-FSTRINGS: Dict[str, Tuple[str, str]] = {
-    'multiline_fstrings_same_line_with_brace': (
-        """
-        f'''
-        {a$b}
-        '''
-        """,
-        '(a$b)',
-    ),
-    'multiline_fstring_brace_on_next_line': (
-        """
-        f'''
-        {a$b
-        }'''
-        """,
-        '(a$b',
-    ),
-    'multiline_fstring_brace_on_previous_line': (
-        """
-        f'''
-        {
-        a$b}'''
-        """,
-        'a$b)',
-    ),
-}
-
-# fmt: on
-
-
-def create_tmp_extension(tmp_path: PurePath) -> Any:
-    with open(os.path.join("data", "python.gram"), "r") as grammar_file:
-        grammar_source = grammar_file.read()
-    grammar = parse_string(grammar_source, GrammarParser)
-    extension = generate_parser_c_extension(grammar, tmp_path)
-    return extension
-
-
-@pytest.fixture(scope="module")
-def parser_extension(tmp_path_factory: Any) -> Any:
-    tmp_path = tmp_path_factory.mktemp("extension")
-    extension = create_tmp_extension(tmp_path)
-    return extension
-
-
-@pytest.mark.parametrize("fstring,error_line", FSTRINGS.values(), ids=tuple(FSTRINGS.keys()))
-def test_fstring_syntax_error_tracebacks(
-    parser_extension: Any, fstring: str, error_line: str
-) -> None:
-    try:
-        parser_extension.parse_string(dedent(fstring))
-    except SyntaxError as se:
-        assert se.text == error_line
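
For orientation, the pytest suites deleted above exercise pegen's public helpers. Below is a minimal, self-contained sketch (not part of this patch) of the entry points the removed tests used: parse_string and make_parser from pegen.testutil, and FirstSetCalculator from pegen.first_sets. It assumes Tools/peg_generator is on sys.path so the pegen package is importable; the toy grammar is illustrative only.

    # Sketch only: assumes Tools/peg_generator is on sys.path so `pegen` imports.
    from pegen.first_sets import FirstSetCalculator
    from pegen.grammar_parser import GeneratedParser as GrammarParser
    from pegen.testutil import make_parser, parse_string

    grammar_source = """
    start: sum NEWLINE
    sum: term '+' term | term
    term: NUMBER
    """

    # First sets, computed the same way the deleted test_first_sets.py did.
    grammar = parse_string(grammar_source, GrammarParser)
    print(FirstSetCalculator(grammar.rules).calculate())

    # A generated Python parser, used the same way the deleted test_pegen.py did:
    # make_parser() compiles the grammar into a parser class, and parse_string()
    # runs that parser over the input text, returning a nested token structure.
    parser_class = make_parser(grammar_source)
    print(parse_string("1 + 2\n", parser_class))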