
Commit 1aec323
Ka-Ping's much improved version of March 26, 1997:

# Ignore now accepts \f as whitespace.  Operator now includes '**'.
# Ignore and Special now accept \n or \r\n at the end of a line.
# Imagnumber is new.  Expfloat is corrected to reject '0e4'.
1 parent 24dacb3 commit 1aec323
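
The number-grammar fixes are easy to verify. Below is a quick sketch of the corrected patterns translated to the modern re module (an assumption for illustration only; the commit itself targets the Python 1.x regex module and its \(...\) group syntax):

    import re

    # The commit's corrected number patterns, in modern re syntax.
    Exponent    = r'[eE][-+]?[0-9]+'
    Expfloat    = r'[1-9][0-9]*' + Exponent          # was '[0-9]+' + Exponent
    Pointfloat  = r'(?:[0-9]+\.[0-9]*|\.[0-9]+)(?:' + Exponent + r')?'
    Floatnumber = r'(?:' + Pointfloat + r'|' + Expfloat + r')'
    Imagnumber  = r'(?:0[jJ]|[1-9][0-9]*[jJ]|' + Floatnumber + r'[jJ])'

    def accepts(pattern, text):
        # fullmatch: the whole literal must match the pattern
        return re.fullmatch(pattern, text) is not None

    assert not accepts(Expfloat, '0e4')    # corrected: leading zero rejected
    assert accepts(Expfloat, '1e4')
    assert accepts(Imagnumber, '0j')       # Imagnumber is new in this commit
    assert accepts(Imagnumber, '3.5e2j')

Run as-is, all four assertions pass, matching the behavior the commit message describes.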

1 file changed: Lib/tokenize.py

98 additions & 74 deletions
@@ -1,158 +1,182 @@
 """Tokenization help for Python programs.
 
-This module compiles a regular expression that recognizes Python
-tokens in individual lines of text.  The regular expression handles
-everything except indentation, continuations, and triple-quoted
-strings.  The function 'tokenize.tokenize()' takes care of these
-things for streams of text.  It accepts a readline-like function which
-is called repeatedly to come up with the next input line (or "" for
-EOF), and a "token-eater" function which is called for each token
-found, passing its type, a string containing the token, the line
-number, the line, and the starting and ending positions of the token
-within the line.  It is designed to match the working of the Python
-tokenizer exactly.
-
-"""
-
-__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997"
+This module exports a function called 'tokenize()' that breaks a stream of
+text into Python tokens.  It accepts a readline-like method which is called
+repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
+function which is called once for each token found.  The latter function is
+passed the token type, a string containing the token, the starting and
+ending (row, column) coordinates of the token, and the original line.  It is
+designed to match the working of the Python tokenizer exactly, except that
+it produces COMMENT tokens for comments and gives type OP for all operators.
+
+For compatibility with the older 'tokenize' module, this also compiles a
+regular expression into 'tokenprog' that matches Python tokens in individual
+lines of text, leaving the token in 'tokenprog.group(3)', but does not
+handle indentation, continuations, or multi-line strings."""
+
+__version__ = "Ka-Ping Yee, 26 March 1997"
 
 import string, regex
 from token import *
 
+COMMENT = N_TOKENS
+tok_name[COMMENT] = 'COMMENT'
+
+# Changes from 1.3:
+#   Ignore now accepts \f as whitespace.  Operator now includes '**'.
+#   Ignore and Special now accept \n or \r\n at the end of a line.
+#   Imagnumber is new.  Expfloat is corrected to reject '0e4'.
+# Note: to get a quoted backslash in a regex, it must be enclosed in brackets.
+
 def group(*choices): return '\(' + string.join(choices, '\|') + '\)'
 
-Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
+Whitespace = '[ \f\t]*'
+Comment = '\(#[^\r\n]*\)'
+Ignore = Whitespace + group('[\]\r?\n' + Whitespace) + '*' + Comment + '?'
 Name = '[a-zA-Z_][a-zA-Z0-9_]*'
 
-ImagZero = '0[jJ]'     # This is not caught by any of the following
 Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
 Octnumber = '0[0-7]*[lL]?'
-Decnumber = '[1-9][0-9]*[lLjJ]?'
-Intnumber = group(ImagZero, Hexnumber, Octnumber, Decnumber)
+Decnumber = '[1-9][0-9]*[lL]?'
+Intnumber = group(Hexnumber, Octnumber, Decnumber)
 Exponent = '[eE][-+]?[0-9]+'
 Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
-Expfloat = '[0-9]+' + Exponent
-Floatnumber = group(Pointfloat, Expfloat) + "[jJ]?"
-Number = group(Floatnumber, Intnumber)
+Expfloat = '[1-9][0-9]*' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group('0[jJ]', '[1-9][0-9]*[jJ]', Floatnumber + '[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)
 
-Single = group('^\'', '[^\]\'')
+Single = group("^'", "[^\]'")
 Double = group('^"', '[^\]"')
-Tsingle = group('^\'\'\'', '[^\]\'\'\'')
-Tdouble = group('^"""', '[^\]"""')
-Triple = group('\'\'\'', '"""')
-String = group('\'' + group('[\].', '[^\'\]') + '*' + group('\'', '[\]\n'),
-               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))
+Single3 = group("^'''", "[^\]'''")
+Double3 = group('^"""', '[^\]"""')
+Triple = group("'''", '"""')
+String = group("'" + group('[\].', "[^\n'\]") + "*'",
+               '"' + group('[\].', '[^\n"\]') + '*"')
 
 Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                  '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
 Bracket = '[][(){}]'
-Special = group('[\]?\r?\n', '[:;.,`\f]')
+Special = group('\r?\n', '[:;.,`]')
 Funny = group(Operator, Bracket, Special)
 
-PlainToken = group(Name, Number, Triple, String, Funny)
+PlainToken = group(Name, Number, String, Funny)
 Token = Ignore + PlainToken
 
+ContStr = group("'" + group('[\].', "[^\n'\]") + '*' + group("'", '[\]\r?\n'),
+                '"' + group('[\].', '[^\n"\]') + '*' + group('"', '[\]\r?\n'))
+PseudoExtras = group('[\]\r?\n', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Name, Number, ContStr, Funny)
+
 try:
-    save_syntax = regex.set_syntax(0)       # use default syntax
+    saved_syntax = regex.set_syntax(0)      # use default syntax
     tokenprog = regex.compile(Token)
+    pseudoprog = regex.compile(PseudoToken)
     endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
-                 '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
+                 '\'\'\'': regex.compile(Single3), '"""': regex.compile(Double3) }
 finally:
-    regex.set_syntax(save_syntax)           # restore original syntax
+    regex.set_syntax(saved_syntax)          # restore original syntax
 
 tabsize = 8
 TokenError = 'TokenError'
-def printtoken(type, string, linenum, line, start, end):       # for testing
-    print `linenum` + ':', tok_name[type], repr(string)
+def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
+    print "%d,%d-%d,%d:\t%s\t%s" % \
+        (srow, scol, erow, ecol, tok_name[type], repr(token))
 
-def tokenize(readline, tokeneater = printtoken):
-    linenum = parenlev = continued = 0
+def tokenize(readline, tokeneater=printtoken):
+    lnum = parenlev = continued = 0
     namechars, numchars = string.letters + '_', string.digits
     contstr = ''
     indents = [0]
+
     while 1:                                # loop over lines in stream
         line = readline()
-        linenum = linenum + 1
-        if line[-2:] == '\r\n': line = line[:-2] + '\n'
+        lnum = lnum + 1
         pos, max = 0, len(line)
 
         if contstr:                         # continued string
             if not line: raise TokenError, "EOF within multi-line string"
-            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
             if endprog.search(line) >= 0:
                 pos = end = endprog.regs[0][1]
-                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
+                tokeneater(STRING, contstr + line[:end],
+                           strstart, (lnum, end), line)
                 contstr = ''
             else:
                 contstr = contstr + line
             continue
 
-        elif parenlev == 0 and not continued:  # this is a new statement
+        elif parenlev == 0 and not continued:  # new statement
             if not line: break
             column = 0
-            while 1:                        # measure leading whitespace
+            while pos < max:                # measure leading whitespace
                 if line[pos] == ' ': column = column + 1
-                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
+                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                 elif line[pos] == '\f': column = 0
                 else: break
                 pos = pos + 1
-            if line[pos] in '#\n': continue # skip comments or blank lines
+            if pos == max: break
+
+            if line[pos] in '#\r\n':        # skip comments or blank lines
+                tokeneater((NEWLINE, COMMENT)[line[pos] == '#'], line[pos:],
+                           (lnum, pos), (lnum, len(line)), line)
+                continue
 
             if column > indents[-1]:        # count indents or dedents
                 indents.append(column)
-                tokeneater(INDENT, '\t', linenum, line, 0, 0)
+                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
             while column < indents[-1]:
                 indents = indents[:-1]
-                tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+                tokeneater(DEDENT, line[:pos], (lnum, 0), (lnum, pos), line)
 
         else:                               # continued statement
             if not line: raise TokenError, "EOF within multi-line statement"
             continued = 0
 
         while pos < max:
-            if tokenprog.match(line, pos) > 0:          # scan for tokens
-                start, end = tokenprog.regs[3]
-                token = line[start:end]
+            if pseudoprog.match(line, pos) > 0:         # scan for tokens
+                start, end = pseudoprog.regs[1]
+                spos, epos = (lnum, start), (lnum, end)
+                token, initial = line[start:end], line[start]
                 pos = end
 
-                if token[0] in namechars:   # ordinary name
-                    tokeneater(NAME, token, linenum, line, start, end)
-                elif token[0] in numchars:  # ordinary number
-                    tokeneater(NUMBER, token, linenum, line, start, end)
-
+                if initial in namechars:    # ordinary name
+                    tokeneater(NAME, token, spos, epos, line)
+                elif initial in numchars:   # ordinary number
+                    tokeneater(NUMBER, token, spos, epos, line)
+                elif initial in '\r\n':
+                    tokeneater(NEWLINE, token, spos, epos, line)
+                elif initial == '#':
+                    tokeneater(COMMENT, token, spos, epos, line)
+                elif initial == '\\':       # continued stmt
+                    continued = 1
                 elif token in ('\'\'\'', '"""'):        # triple-quoted
                     endprog = endprogs[token]
                     if endprog.search(line, pos) >= 0:  # all on one line
                         pos = endprog.regs[0][1]
-                        token = line[start:pos]
-                        tokeneater(STRING, token, linenum, line, start, pos)
+                        token = line[start:pos]
+                        tokeneater(STRING, token, spos, (lnum, pos), line)
                     else:
-                        contstr = line[start:]          # multiple lines
+                        strstart = (lnum, start)        # multiple lines
+                        contstr = line[start:]
                         break
-                elif token[0] in '\'"':
+                elif initial in '\'"':
                     if token[-1] == '\n':               # continued string
-                        endprog, contstr = endprogs[token[0]], line[start:]
+                        strstart = (lnum, start)
+                        endprog, contstr = endprogs[initial], line[start:]
                         break
                     else:                               # ordinary string
-                        tokeneater(STRING, token, linenum, line, start, end)
-
-                elif token[0] == '\n':
-                    tokeneater(NEWLINE, token, linenum, line, start, end)
-                elif token[0] == '\\':                  # continued stmt
-                    continued = 1
-
+                        tokeneater(STRING, token, spos, epos, line)
                 else:
-                    if token[0] in '([{': parenlev = parenlev + 1
-                    if token[0] in ')]}': parenlev = parenlev - 1
-                    tokeneater(OP, token, linenum, line, start, end)
+                    if initial in '([{': parenlev = parenlev + 1
+                    elif initial in ')]}': parenlev = parenlev - 1
+                    tokeneater(OP, token, spos, epos, line)
             else:
-                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
+                tokeneater(ERRORTOKEN, line[pos], spos, (lnum, pos+1), line)
                 pos = pos + 1
 
     for indent in indents[1:]:              # pop remaining indent levels
-        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
 
 if __name__ == '__main__':                  # testing
     import sys
-    file = open(sys.argv[-1])
-    tokenize(file.readline)
+    tokenize(open(sys.argv[-1]).readline)
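
The token-eater interface this commit introduces, where each token is reported as its type, text, starting and ending (row, column) coordinates, and the original line, is the same shape today's tokenize module still uses. A minimal modern sketch (Python 3, for illustration; the commit itself targets Python 1.x) mirroring the new printtoken() format:

    import io
    import tokenize

    source = "x = 1.5e3j  # imaginary literal\n"
    readline = io.StringIO(source).readline

    # Each TokenInfo carries type, string, start/end (row, column) pairs,
    # and the original line, matching the token-eater arguments above.
    for tok in tokenize.generate_tokens(readline):
        (srow, scol), (erow, ecol) = tok.start, tok.end
        print("%d,%d-%d,%d:\t%s\t%r" %
              (srow, scol, erow, ecol, tokenize.tok_name[tok.type], tok.string))

Note that the COMMENT token for "# imaginary literal" shows up in the output, reflecting the behavior this commit's docstring promises.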
