"This module implements an experimental Earley Parser with a dynamic lexer"
# The parser uses a parse-forest to keep track of derivations and ambiguations.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer and Visitor, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
# uses regular expressions by necessity, achieving high-performance while maintaining all of
# Earley's power in parsing any CFG.
#
#
# Author: Erez Shinan (2017)
# Email : [email protected]
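
# Context (illustrative, not part of this module): lark builds this Parser through its
# frontend machinery rather than exposing it directly. Roughly, a grammar loaded with
# something like
#     Lark(grammar, parser='earley', lexer='dynamic')
# ends up driving this parser, where term_matcher matches terminal regexps directly
# against the raw input text instead of a pre-tokenized stream.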
from collections import defaultdict

from ..exceptions import ParseError, UnexpectedCharacters
from ..lexer import Token
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal, Terminal

from .earley import ApplyCallbacks, Item, Column


class Parser:
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False):
        self.analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = [Terminal(t) for t in ignore]
        self.predict_all = predict_all
        self.complete_lex = complete_lex

        self.FIRST = self.analysis.FIRST

        self.postprocess = {}
        self.predictions = {}
        for rule in parser_conf.rules:
            self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

        self.term_matcher = term_matcher
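
    # parse() runs the Earley algorithm over `stream` one character at a time.
    # Each input position gets its own Column (Earley state set); matches found by
    # the dynamic lexer are queued in delayed_matches, keyed by the position where
    # the match ends, so multi-character tokens jump several columns ahead at once.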
    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
        delayed_matches = defaultdict(list)
        match = self.term_matcher

        text_line = 1
        text_column = 1
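
        # Earley prediction: start a fresh item (dot at position 0) in the current
        # column for every rule the nonterminal can expand to, as precomputed in
        # self.predictions.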
        def predict(nonterm, column):
            assert not nonterm.is_term, nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
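
        # Earley completion: a finished item advances every item in its origin
        # column that was waiting for this rule's nonterminal.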
        def complete(item):
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
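
        # Run prediction and completion to a fixed point: only the items the column
        # reports as new (get_news) are processed, so each batch is handled once.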
        def predict_and_complete(column):
            while True:
                to_predict = {x.expect for x in column.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = column.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    column.add( predict(nonterm, column) )
                for item in to_reduce:
                    new_items = list(complete(item))
                    if item in new_items:
                        raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
                    column.add(new_items)
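
        # scan() is the dynamic-lexer step: each expected terminal's regexp is matched
        # directly against the text at position i. A successful match advances the item,
        # but the advanced item is parked in delayed_matches under the position where
        # the match ends, so it only joins the column that follows the whole token.
        # Ignored terminals carry the current items forward unchanged, and complete_lex
        # additionally tries shorter prefixes of each match.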
        def scan(i, column):
            to_scan = column.to_scan

            for x in self.ignore:
                m = match(x, stream, i)
                if m:
                    delayed_matches[m.end()] += set(to_scan)
                    delayed_matches[m.end()] += set(column.to_reduce)

                    # TODO add partial matches for ignore too?
                    # s = m.group(0)
                    # for j in range(1, len(s)):
                    #     m = x.match(s[:-j])
                    #     if m:
                    #         delayed_matches[m.end()] += to_scan

            for item in to_scan:
                m = match(item.expect, stream, i)
                if m:
                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].append(item.advance(t))

                    if self.complete_lex:
                        s = m.group(0)
                        for j in range(1, len(s)):
                            m = match(item.expect, s[:-j])
                            if m:
                                t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                                delayed_matches[i+m.end()].append(item.advance(t))

            next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
            next_set.add(delayed_matches[i+1])
            del delayed_matches[i+1]    # No longer needed, so unburden memory

            if not next_set and not delayed_matches:
                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

            return next_set
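
        # Drive the parse one character per iteration: close the current column under
        # prediction/completion, then scan to produce the next column, updating the
        # line/column counters for tokens and error messages along the way.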
        # Main loop starts
        column0 = Column(0, self.FIRST, predict_all=self.predict_all)
        column0.add(predict(start_symbol, column0))

        column = column0
        for i, token in enumerate(stream):
            predict_and_complete(column)
            column = scan(i, column)

            if token == '\n':
                text_line += 1
                text_column = 1
            else:
                text_column += 1

        predict_and_complete(column)
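
        # A successful parse is a completed item in the final column that derives the
        # start symbol and spans the whole input (its origin is column0). Several such
        # items mean an ambiguous parse; they are collected under an '_ambig' node for
        # the optional resolve_ambiguity stage.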
        # Parse ended. Now build a parse tree
        solutions = [n.tree for n in column.to_reduce
                     if n.rule.origin==start_symbol and n.start is column0]

        if not solutions:
            expected_tokens = [t.expect for t in column.to_scan]
            raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            tree = Tree('_ambig', solutions)

        if self.resolve_ambiguity:
            tree = self.resolve_ambiguity(tree)

        return ApplyCallbacks(self.postprocess).transform(tree)