|
1 | 1 | """Tokenization help for Python programs. |
2 | 2 |
|
3 | | -This module compiles a regular expression that recognizes Python |
4 | | -tokens in individual lines of text. The regular expression handles |
5 | | -everything except indentation, continuations, and triple-quoted |
6 | | -strings. The function 'tokenize.tokenize()' takes care of these |
7 | | -things for streams of text. It accepts a readline-like function which |
8 | | -is called repeatedly to come up with the next input line (or "" for |
9 | | -EOF), and a "token-eater" function which is called for each token |
10 | | -found, passing its type, a string containing the token, the line |
11 | | -number, the line, and the starting and ending positions of the token |
12 | | -within the line. It is designed to match the working of the Python |
13 | | -tokenizer exactly. |
14 | | -
|
15 | | -""" |
16 | | - |
17 | | -__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997" |
| 3 | +This module exports a function called 'tokenize()' that breaks a stream of |
| 4 | +text into Python tokens. It accepts a readline-like method which is called |
| 5 | +repeatedly to get the next line of input (or "" for EOF) and a "token-eater" |
| 6 | +function which is called once for each token found. The latter function is |
| 7 | +passed the token type, a string containing the token, the starting and |
| 8 | +ending (row, column) coordinates of the token, and the original line. It is |
| 9 | +designed to match the working of the Python tokenizer exactly, except that |
| 10 | +it produces COMMENT tokens for comments and gives type OP for all operators. |
| 11 | +
|
| 12 | +For compatibility with the older 'tokenize' module, this also compiles a |
| 13 | +regular expression into 'tokenprog' that matches Python tokens in individual |
| 14 | +lines of text, leaving the token in 'tokenprog.group(3)', but does not |
| 15 | +handle indentation, continuations, or multi-line strings.""" |
| 16 | + |
| 17 | +__version__ = "Ka-Ping Yee, 26 March 1997" |
18 | 18 |
|
import string, regex
from token import *

# COMMENT is an extra token type, beyond those defined in the 'token'
# module, used to report comments to the token-eater function.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'

# Changes from 1.3:
#   Ignore now accepts \f as whitespace.  Operator now includes '**'.
#   Ignore and Special now accept \n or \r\n at the end of a line.
#   Imagnumber is new.  Expfloat is corrected to reject '0e4'.
# Note: to get a quoted backslash in a regex, it must be enclosed in brackets.
def group(*choices):
    """Join regex alternatives into one emacs-syntax group: \\(a\\|b\\)."""
    return '\(' + '\|'.join(choices) + '\)'

# Building blocks for the master token regexes.  These are emacs-style
# patterns for the old 'regex' module, NOT 're' syntax: groups and
# alternation are spelled \( \| \), and a literal backslash is written
# inside brackets as [\].
Whitespace = '[ \f\t]*'
Comment = '\(#[^\r\n]*\)'
# Ignorable text: whitespace, backslash-newline continuations, a comment.
Ignore = Whitespace + group('[\]\r?\n' + Whitespace)+'*' + Comment+'?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Number literals.  Decnumber starts at [1-9] so '0...' is caught by the
# octal/hex patterns; Expfloat likewise rejects '0e4'.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[1-9][0-9]*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group('0[jJ]', '[1-9][0-9]*[jJ]', Floatnumber + '[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Patterns that find the end of a continued single- or triple-quoted
# string ('^' matches a quote at the very start of the continuation line).
Single = group("^'", "[^\]'")
Double = group('^"', '[^\]"')
Single3 = group("^'''", "[^\]'''")
Double3 = group('^"""', '[^\]"""')
Triple = group("'''", '"""')
# A complete one-line string: quote, escaped-or-plain chars, closing quote.
String = group("'" + group('[\].', "[^\n'\]") + "*'",
               '"' + group('[\].', '[^\n"\]') + '*"')

Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('\r?\n', '[:;.,`]')
Funny = group(Operator, Bracket, Special)

# 'Token' is the backward-compatible single-line matcher compiled into
# 'tokenprog' below; 'PseudoToken' is what tokenize() actually scans with.
PlainToken = group(Name, Number, String, Funny)
Token = Ignore + PlainToken

# A string that may be terminated by a closing quote OR run off the end of
# the line via backslash-newline (a continued string).
ContStr = group("'" + group('[\].', "[^\n'\]")+'*' + group("'", '[\]\r?\n'),
                '"' + group('[\].', '[^\n"\]')+'*' + group('"', '[\]\r?\n'))
PseudoExtras = group('[\]\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Name, Number, ContStr, Funny)
| 70 | + |
# Compile the patterns under the 'regex' module's default (emacs) syntax,
# restoring whatever syntax the importing program had selected.
try:
    saved_syntax = regex.set_syntax(0)         # use default syntax
    tokenprog = regex.compile(Token)
    pseudoprog = regex.compile(PseudoToken)
    # Maps a string's opening quote to the pattern that finds its end on a
    # continuation line.
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
                 '\'\'\'': regex.compile(Single3), '"""': regex.compile(Double3) }
finally:
    regex.set_syntax(saved_syntax)             # restore original syntax
62 | 79 |
|
63 | 80 | tabsize = 8 |
64 | 81 | TokenError = 'TokenError' |
65 | | -def printtoken(type, string, linenum, line, start, end): # for testing |
66 | | - print `linenum` + ':', tok_name[type], repr(string) |
| 82 | +def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing |
| 83 | + print "%d,%d-%d,%d:\t%s\t%s" % \ |
| 84 | + (srow, scol, erow, ecol, tok_name[type], repr(token)) |
67 | 85 |
|
68 | | -def tokenize(readline, tokeneater = printtoken): |
69 | | - linenum = parenlev = continued = 0 |
| 86 | +def tokenize(readline, tokeneater=printtoken): |
| 87 | + lnum = parenlev = continued = 0 |
70 | 88 | namechars, numchars = string.letters + '_', string.digits |
71 | 89 | contstr = '' |
72 | 90 | indents = [0] |
| 91 | + |
73 | 92 | while 1: # loop over lines in stream |
74 | 93 | line = readline() |
75 | | - linenum = linenum + 1 |
76 | | - if line[-2:] == '\r\n': line = line[:-2] + '\n' |
| 94 | + lnum = lnum + 1 |
77 | 95 | pos, max = 0, len(line) |
78 | 96 |
|
79 | 97 | if contstr: # continued string |
80 | 98 | if not line: raise TokenError, "EOF within multi-line string" |
81 | | - if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n' |
82 | 99 | if endprog.search(line) >= 0: |
83 | 100 | pos = end = endprog.regs[0][1] |
84 | | - tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0) |
| 101 | + tokeneater(STRING, contstr + line[:end], |
| 102 | + strstart, (lnum, end), line) |
85 | 103 | contstr = '' |
86 | 104 | else: |
87 | 105 | contstr = contstr + line |
88 | 106 | continue |
89 | 107 |
|
90 | | - elif parenlev == 0 and not continued: # this is a new statement |
| 108 | + elif parenlev == 0 and not continued: # new statement |
91 | 109 | if not line: break |
92 | 110 | column = 0 |
93 | | - while 1: # measure leading whitespace |
| 111 | + while pos < max: # measure leading whitespace |
94 | 112 | if line[pos] == ' ': column = column + 1 |
95 | | - elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize |
| 113 | + elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize |
96 | 114 | elif line[pos] == '\f': column = 0 |
97 | 115 | else: break |
98 | 116 | pos = pos + 1 |
99 | | - if line[pos] in '#\n': continue # skip comments or blank lines |
| 117 | + if pos == max: break |
| 118 | + |
| 119 | + if line[pos] in '#\r\n': # skip comments or blank lines |
| 120 | + tokeneater((NEWLINE, COMMENT)[line[pos] == '#'], line[pos:], |
| 121 | + (lnum, pos), (lnum, len(line)), line) |
| 122 | + continue |
100 | 123 |
|
101 | 124 | if column > indents[-1]: # count indents or dedents |
102 | 125 | indents.append(column) |
103 | | - tokeneater(INDENT, '\t', linenum, line, 0, 0) |
| 126 | + tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) |
104 | 127 | while column < indents[-1]: |
105 | 128 | indents = indents[:-1] |
106 | | - tokeneater(DEDENT, '\t', linenum, line, 0, 0) |
| 129 | + tokeneater(DEDENT, line[:pos], (lnum, 0), (lnum, pos), line) |
107 | 130 |
|
108 | 131 | else: # continued statement |
109 | 132 | if not line: raise TokenError, "EOF within multi-line statement" |
110 | 133 | continued = 0 |
111 | 134 |
|
112 | 135 | while pos < max: |
113 | | - if tokenprog.match(line, pos) > 0: # scan for tokens |
114 | | - start, end = tokenprog.regs[3] |
115 | | - token = line[start:end] |
| 136 | + if pseudoprog.match(line, pos) > 0: # scan for tokens |
| 137 | + start, end = pseudoprog.regs[1] |
| 138 | + spos, epos = (lnum, start), (lnum, end) |
| 139 | + token, initial = line[start:end], line[start] |
116 | 140 | pos = end |
117 | 141 |
|
118 | | - if token[0] in namechars: # ordinary name |
119 | | - tokeneater(NAME, token, linenum, line, start, end) |
120 | | - elif token[0] in numchars: # ordinary number |
121 | | - tokeneater(NUMBER, token, linenum, line, start, end) |
122 | | - |
| 142 | + if initial in namechars: # ordinary name |
| 143 | + tokeneater(NAME, token, spos, epos, line) |
| 144 | + elif initial in numchars: # ordinary number |
| 145 | + tokeneater(NUMBER, token, spos, epos, line) |
| 146 | + elif initial in '\r\n': |
| 147 | + tokeneater(NEWLINE, token, spos, epos, line) |
| 148 | + elif initial == '#': |
| 149 | + tokeneater(COMMENT, token, spos, epos, line) |
| 150 | + elif initial == '\\': # continued stmt |
| 151 | + continued = 1 |
123 | 152 | elif token in ('\'\'\'', '"""'): # triple-quoted |
124 | 153 | endprog = endprogs[token] |
125 | 154 | if endprog.search(line, pos) >= 0: # all on one line |
126 | 155 | pos = endprog.regs[0][1] |
127 | | - token = line[start:pos] |
128 | | - tokeneater(STRING, token, linenum, line, start, pos) |
| 156 | + token = line[start:pos] |
| 157 | + tokeneater(STRING, token, spos, (lnum, pos), line) |
129 | 158 | else: |
130 | | - contstr = line[start:] # multiple lines |
| 159 | + strstart = (lnum, start) # multiple lines |
| 160 | + contstr = line[start:] |
131 | 161 | break |
132 | | - elif token[0] in '\'"': |
| 162 | + elif initial in '\'"': |
133 | 163 | if token[-1] == '\n': # continued string |
134 | | - endprog, contstr = endprogs[token[0]], line[start:] |
| 164 | + strstart = (lnum, start) |
| 165 | + endprog, contstr = endprogs[initial], line[start:] |
135 | 166 | break |
136 | 167 | else: # ordinary string |
137 | | - tokeneater(STRING, token, linenum, line, start, end) |
138 | | - |
139 | | - elif token[0] == '\n': |
140 | | - tokeneater(NEWLINE, token, linenum, line, start, end) |
141 | | - elif token[0] == '\\': # continued stmt |
142 | | - continued = 1 |
143 | | - |
| 168 | + tokeneater(STRING, token, spos, epos, line) |
144 | 169 | else: |
145 | | - if token[0] in '([{': parenlev = parenlev + 1 |
146 | | - if token[0] in ')]}': parenlev = parenlev - 1 |
147 | | - tokeneater(OP, token, linenum, line, start, end) |
| 170 | + if initial in '([{': parenlev = parenlev + 1 |
| 171 | + elif initial in ')]}': parenlev = parenlev - 1 |
| 172 | + tokeneater(OP, token, spos, epos, line) |
148 | 173 | else: |
149 | | - tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1) |
| 174 | + tokeneater(ERRORTOKEN, line[pos], spos, (lnum, pos+1), line) |
150 | 175 | pos = pos + 1 |
151 | 176 |
|
152 | 177 | for indent in indents[1:]: # pop remaining indent levels |
153 | | - tokeneater(DEDENT, '\t', linenum, line, 0, 0) |
| 178 | + tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '') |
154 | 179 |
|
if __name__ == '__main__':                     # testing
    import sys
    tokenize(open(sys.argv[-1]).readline)