

Commit 9c88079

Add lexer, with tests
1 parent e3391c3 commit 9c88079

3 files changed: 156 additions & 0 deletions


jsonpath_rw/lexer.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
import sys
import logging

import ply.lex

logger = logging.getLogger(__name__)

class JsonPathLexer(object):
    '''
    A lexical analyzer for JsonPath.
    '''

    def __init__(self, debug=False):
        self.debug = debug

    def tokenize(self, string):
        '''
        Maps a string to an iterator over tokens. In other words: [char] -> [token]
        '''

        new_lexer = ply.lex.lex(module=self, debug=self.debug, errorlog=logger)
        new_lexer.latest_newline = 0  # Offset of the most recent newline, used to compute token columns
        new_lexer.input(string)

        while True:
            t = new_lexer.token()
            if t is None: break
            t.col = t.lexpos - new_lexer.latest_newline
            yield t

    # ============== PLY Lexer specification ==================
    #
    # This probably should be private but:
    #   - the parser requires access to `tokens` (perhaps they should be defined in a third, shared dependency)
    #   - things like `literals` might be a legitimate part of the public interface.
    #
    # Anyhow, it is pythonic to give some rope to hang oneself with :-)

    literals = ['*', '.', '[', ']', '(', ')', '$', ',', ':', '|', '&', '@']

    reserved_words = { 'where': 'WHERE' }

    tokens = ['DOUBLEDOT', 'NUMBER', 'ID'] + reserved_words.values()

    states = [ ('singlequote', 'exclusive'),
               ('doublequote', 'exclusive') ]

    # Normal lexing, rather easy
    t_DOUBLEDOT = r'\.\.'
    t_ignore = ' \t'

    def t_ID(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.reserved_words.get(t.value, 'ID')
        return t

    def t_NUMBER(self, t):
        r'\d+'
        t.value = int(t.value)
        return t

    # Single-quoted strings: the opening quote pushes an exclusive state;
    # the closing quote emits the quoted contents as an ID token.
    t_singlequote_ignore = ''
    def t_SINGLEQUOTE(self, t):
        r'\''
        t.lexer.string_start = t.lexer.lexpos
        t.lexer.push_state('singlequote')

    def t_singlequote_SINGLEQUOTE(self, t):
        r"([^']|\\')*'"
        t.value = t.value[:-1]
        t.type = 'ID'
        t.lexer.pop_state()
        return t

    def t_singlequote_error(self, t):
        raise Exception('Error on line %s, col %s while lexing singlequoted field: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))

    # Double-quoted strings, handled symmetrically to the single-quoted case
    t_doublequote_ignore = ''
    def t_DOUBLEQUOTE(self, t):
        r'"'
        t.lexer.string_start = t.lexer.lexpos
        t.lexer.push_state('doublequote')

    def t_doublequote_DOUBLEQUOTE(self, t):
        r'([^"]|\\")*"'
        t.value = t.value[:-1]
        t.type = 'ID'
        t.lexer.pop_state()
        return t

    def t_doublequote_error(self, t):
        raise Exception('Error on line %s, col %s while lexing doublequoted field: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))

    # Counting lines, handling errors
    def t_newline(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.latest_newline = t.lexpos

    def t_error(self, t):
        raise Exception('Error on line %s, col %s: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))

if __name__ == '__main__':
    logging.basicConfig()
    lexer = JsonPathLexer(debug=True)
    for token in lexer.tokenize(sys.stdin.read()):
        print '%-20s%s' % (token.value, token.type)
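
For reference, a minimal sketch of driving the lexer by hand (the expression '$.store..price' and the variable names are illustrative, not part of the commit):

    from jsonpath_rw.lexer import JsonPathLexer

    lexer = JsonPathLexer()
    # Each token carries .type, .value, .lineno, plus the .col attribute set in tokenize()
    for t in lexer.tokenize('$.store..price'):
        print '%-12s %r (line %s, col %s)' % (t.type, t.value, t.lineno, t.col)

This yields '$' and '.' as literal tokens, 'store' and 'price' as ID tokens, and '..' as a DOUBLEDOT token.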

tests/__init__.py

Whitespace-only changes.

tests/test_lexer.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
import logging
import unittest

from ply.lex import LexToken

from jsonpath_rw.lexer import JsonPathLexer

class TestLexer(unittest.TestCase):

    def token(self, value, ty=None):
        t = LexToken()
        t.type = ty if ty is not None else value
        t.value = value
        t.lineno = -1
        t.lexpos = -1
        return t

    def assert_lex_equiv(self, s, stream2):
        # NOTE: lexer fails to reset after call?
        l = JsonPathLexer(debug=True)
        stream1 = list(l.tokenize(s)) # Save the stream for debug output when a test fails
        stream2 = list(stream2)
        assert len(stream1) == len(stream2)
        for token1, token2 in zip(stream1, stream2):
            print token1, token2
            assert token1.type == token2.type
            assert token1.value == token2.value

    @classmethod
    def setup_class(cls):
        logging.basicConfig()

    def test_simple_inputs(self):
        self.assert_lex_equiv('$', [self.token('$', '$')])
        self.assert_lex_equiv('"hello"', [self.token('hello', 'ID')])
        self.assert_lex_equiv("'goodbye'", [self.token('goodbye', 'ID')])
        self.assert_lex_equiv('fuzz', [self.token('fuzz', 'ID')])
        self.assert_lex_equiv('1', [self.token(1, 'NUMBER')])
        self.assert_lex_equiv('45', [self.token(45, 'NUMBER')])
        self.assert_lex_equiv('"fuzz.bang"', [self.token('fuzz.bang', 'ID')])
        self.assert_lex_equiv('fuzz.bang', [self.token('fuzz', 'ID'), self.token('.', '.'), self.token('bang', 'ID')])
        self.assert_lex_equiv('fuzz.*', [self.token('fuzz', 'ID'), self.token('.', '.'), self.token('*', '*')])
        self.assert_lex_equiv('fuzz..bang', [self.token('fuzz', 'ID'), self.token('..', 'DOUBLEDOT'), self.token('bang', 'ID')])
        self.assert_lex_equiv('&', [self.token('&', '&')])
        self.assert_lex_equiv('|', [self.token('|', '|')])
        self.assert_lex_equiv('where', [self.token('where', 'WHERE')])
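
Note that `setup_class` is the nose/pytest spelling of the class fixture hook rather than unittest's `setUpClass`, so these tests were presumably meant for such a runner. One path they do not exercise is lexing errors; a minimal sketch of that behavior (the input 'fuzz.%' is illustrative, not part of the commit):

    from jsonpath_rw.lexer import JsonPathLexer

    lexer = JsonPathLexer()
    try:
        list(lexer.tokenize('fuzz.%'))  # '%' matches no rule or literal, so t_error fires
    except Exception as e:
        print e                         # Error on line 1, col 5: Unexpected character: %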
