|
| 1 | +import sys |
| 2 | +import logging |
| 3 | + |
| 4 | +import ply.lex |
| 5 | + |
# Module-level logger; also handed to PLY as its `errorlog` in `tokenize()`.
logger = logging.getLogger(__name__)
| 7 | + |
class JsonPathLexer(object):
    '''
    A Lexical analyzer for JsonPath.

    Builds a fresh PLY lexer per call to `tokenize`, so one instance may
    tokenize many strings. The PLY specification (tokens, literals, states,
    `t_*` rules) lives directly on this class; PLY introspects method names,
    their regex docstrings, and their definition order, so rule order below
    is significant and must not be rearranged.
    '''

    def __init__(self, debug=False):
        # When True, PLY emits verbose output while constructing the lexer.
        self.debug = debug

    def tokenize(self, string):
        '''
        Maps a string to an iterator over tokens. In other words: [char] -> [token]

        Each yielded token gains a `col` attribute: the 0-based offset of
        the token from the most recently seen newline.
        '''

        new_lexer = ply.lex.lex(module=self, debug=self.debug, errorlog=logger)
        # Position of the last '\n' consumed; used to derive column numbers.
        new_lexer.latest_newline = 0
        new_lexer.input(string)

        while True:
            t = new_lexer.token()
            if t is None: break
            t.col = t.lexpos - new_lexer.latest_newline
            yield t

    # ============== PLY Lexer specification ==================
    #
    # This probably should be private but:
    #   - the parser requires access to `tokens` (perhaps they should be defined in a third, shared dependency)
    #   - things like `literals` might be a legitimate part of the public interface.
    #
    # Anyhow, it is pythonic to give some rope to hang oneself with :-)

    literals = ['*', '.', '[', ']', '(', ')', '$', ',', ':', '|', '&', '@']

    reserved_words = { 'where': 'WHERE' }

    # `list(...)` keeps this working on Python 3, where dict.values() is a
    # view and cannot be concatenated to a list; identical result on Python 2.
    tokens = ['DOUBLEDOT', 'NUMBER', 'ID'] + list(reserved_words.values())

    states = [ ('singlequote', 'exclusive'),
               ('doublequote', 'exclusive') ]

    # Normal lexing, rather easy
    t_DOUBLEDOT = r'\.\.'
    t_ignore = ' \t'

    def t_ID(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        # Promote reserved words (e.g. 'where') to their dedicated token type.
        t.type = self.reserved_words.get(t.value, 'ID')
        return t

    def t_NUMBER(self, t):
        r'\d+'
        t.value = int(t.value)
        return t

    # Single-quoted strings
    t_singlequote_ignore = ''
    def t_SINGLEQUOTE(self, t):
        r'\''
        # Remember where the string body starts; enter the exclusive state.
        t.lexer.string_start = t.lexer.lexpos
        t.lexer.push_state('singlequote')

    def t_singlequote_SINGLEQUOTE(self, t):
        r"([^']|\\')*'"
        t.value = t.value[:-1]  # drop the closing quote
        t.type = 'ID'           # quoted fields lex as plain identifiers
        t.lexer.pop_state()
        return t

    def t_singlequote_error(self, t):
        # Fix: `latest_newline` is stored on the lexer, not on the token;
        # `t.latest_newline` raised AttributeError instead of this message.
        raise Exception('Error on line %s, col %s while lexing singlequoted field: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))


    # Double-quoted strings
    t_doublequote_ignore = ''
    def t_DOUBLEQUOTE(self, t):
        r'"'
        # Remember where the string body starts; enter the exclusive state.
        t.lexer.string_start = t.lexer.lexpos
        t.lexer.push_state('doublequote')

    def t_doublequote_DOUBLEQUOTE(self, t):
        r'([^"]|\\")*"'
        t.value = t.value[:-1]  # drop the closing quote
        t.type = 'ID'           # quoted fields lex as plain identifiers
        t.lexer.pop_state()
        return t

    def t_doublequote_error(self, t):
        # Fix: `latest_newline` is stored on the lexer, not on the token.
        raise Exception('Error on line %s, col %s while lexing doublequoted field: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))

    # Counting lines, handling errors
    def t_newline(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.latest_newline = t.lexpos

    def t_error(self, t):
        # Fix: `latest_newline` is stored on the lexer, not on the token.
        raise Exception('Error on line %s, col %s: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
| 105 | + |
if __name__ == '__main__':
    # Tiny CLI driver: lex stdin, print one "value  type" pair per token.
    logging.basicConfig()
    lexer = JsonPathLexer(debug=True)
    for token in lexer.tokenize(sys.stdin.read()):
        # Parenthesized single-argument form is identical on Python 2 and
        # valid on Python 3 (the bare `print` statement was a py3 SyntaxError).
        print('%-20s%s' % (token.value, token.type))
0 commit comments