|
1 | 1 | """Tokenization help for Python programs. |
2 | 2 |
|
3 | | -This module compiles a regular expression that recognizes Python |
4 | | -tokens in individual lines of text. The regular expression handles |
5 | | -everything except indentation, continuations, and triple-quoted |
6 | | -strings. The function 'tokenize.tokenize()' takes care of these |
7 | | -things for streams of text. It accepts a readline-like function which |
8 | | -is called repeatedly to come up with the next input line (or "" for |
9 | | -EOF), and a "token-eater" function which is called for each token |
10 | | -found, passing its type, a string containing the token, the line |
11 | | -number, the line, and the starting and ending positions of the token |
12 | | -within the line. It is designed to match the working of the Python |
13 | | -tokenizer exactly. |
14 | | -
|
15 | | -""" |
16 | | - |
17 | | -__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997" |
| 3 | +This module exports a function called 'tokenize()' that breaks a stream of |
| 4 | +text into Python tokens. It accepts a readline-like method which is called |
| 5 | +repeatedly to get the next line of input (or "" for EOF) and a "token-eater" |
| 6 | +function which is called once for each token found. The latter function is |
| 7 | +passed the token type, a string containing the token, the starting and |
| 8 | +ending (row, column) coordinates of the token, and the original line. It is |
| 9 | +designed to match the working of the Python tokenizer exactly, except that |
| 10 | +it produces COMMENT tokens for comments and gives type OP for all operators. |
| 11 | +
|
| 12 | +For compatibility with the older 'tokenize' module, this also compiles a |
| 13 | +regular expression into 'tokenprog' that matches Python tokens in individual |
| 14 | +lines of text, leaving the token in 'tokenprog.group(3)', but does not |
| 15 | +handle indentation, continuations, or multi-line strings.""" |
| 16 | + |
| 17 | +__version__ = "Ka-Ping Yee, 26 March 1997" |
18 | 18 |
|
import string, regex
from token import *

# COMMENT is an extra token type, beyond those defined in the 'token'
# module, used to report comments to the token-eater function.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'

# Changes from 1.3:
#   Ignore now accepts \f as whitespace.  Operator now includes '**'.
#   Ignore and Special now accept \n or \r\n at the end of a line.
#   Imagnumber is new.  Expfloat is corrected to reject '0e4'.
# Note: to get a quoted backslash in a regex, it must be enclosed in brackets.
def group(*choices):
    """Join regex alternatives into one emacs-syntax group: \\(a\\|b\\)."""
    return '\(' + '\|'.join(choices) + '\)'

# Building blocks for the master token regexes.  These are emacs-style
# patterns for the old 'regex' module, NOT 're' syntax: groups and
# alternation are spelled \( \| \), and a literal backslash is written
# inside brackets as [\].
Whitespace = '[ \f\t]*'
Comment = '\(#[^\r\n]*\)'
# Ignorable text: whitespace, backslash-newline continuations, a comment.
Ignore = Whitespace + group('[\]\r?\n' + Whitespace)+'*' + Comment+'?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Number literals.  Decnumber starts at [1-9] so '0...' is caught by the
# octal/hex patterns; Expfloat likewise rejects '0e4'.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[1-9][0-9]*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group('0[jJ]', '[1-9][0-9]*[jJ]', Floatnumber + '[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Patterns that find the end of a continued single- or triple-quoted
# string ('^' matches a quote at the very start of the continuation line).
Single = group("^'", "[^\]'")
Double = group('^"', '[^\]"')
Single3 = group("^'''", "[^\]'''")
Double3 = group('^"""', '[^\]"""')
Triple = group("'''", '"""')
# A complete one-line string: quote, escaped-or-plain chars, closing quote.
String = group("'" + group('[\].', "[^\n'\]") + "*'",
               '"' + group('[\].', '[^\n"\]') + '*"')

Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('\r?\n', '[:;.,`]')
Funny = group(Operator, Bracket, Special)

# 'Token' is the backward-compatible single-line matcher compiled into
# 'tokenprog' below; 'PseudoToken' is what tokenize() actually scans with.
PlainToken = group(Name, Number, String, Funny)
Token = Ignore + PlainToken

# A string that may be terminated by a closing quote OR run off the end of
# the line via backslash-newline (a continued string).
ContStr = group("'" + group('[\].', "[^\n'\]")+'*' + group("'", '[\]\r?\n'),
                '"' + group('[\].', '[^\n"\]')+'*' + group('"', '[\]\r?\n'))
PseudoExtras = group('[\]\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Name, Number, ContStr, Funny)
| 70 | + |
# Compile the patterns under the 'regex' module's default (emacs) syntax,
# restoring whatever syntax the importing program had selected.
try:
    saved_syntax = regex.set_syntax(0)         # use default syntax
    tokenprog = regex.compile(Token)
    pseudoprog = regex.compile(PseudoToken)
    # Maps a string's opening quote to the pattern that finds its end on a
    # continuation line.
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
                 '\'\'\'': regex.compile(Single3), '"""': regex.compile(Double3) }
finally:
    regex.set_syntax(saved_syntax)             # restore original syntax
62 | 79 |
|
63 | 80 | tabsize = 8 |
64 | 81 | TokenError = 'TokenError' |
65 | | -def printtoken(type, string, linenum, line, start, end): # for testing |
66 | | - print `linenum` + ':', tok_name[type], repr(string) |
| 82 | +def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing |
| 83 | + print "%d,%d-%d,%d:\t%s\t%s" % \ |
| 84 | + (srow, scol, erow, ecol, tok_name[type], repr(token)) |
67 | 85 |
|
68 | | -def tokenize(readline, tokeneater = printtoken): |
69 | | - linenum = parenlev = continued = 0 |
| 86 | +def tokenize(readline, tokeneater=printtoken): |
| 87 | + lnum = parenlev = continued = 0 |
70 | 88 | namechars, numchars = string.letters + '_', string.digits |
71 | 89 | contstr = '' |
72 | 90 | indents = [0] |
| 91 | + |
73 | 92 | while 1: # loop over lines in stream |
74 | 93 | line = readline() |
75 | | - linenum = linenum + 1 |
76 | | - if line[-2:] == '\r\n': line = line[:-2] + '\n' |
| 94 | + lnum = lnum + 1 |
77 | 95 | pos, max = 0, len(line) |
78 | 96 |
|
79 | 97 | if contstr: # continued string |
80 | 98 | if not line: raise TokenError, "EOF within multi-line string" |
81 | | - if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n' |
82 | 99 | if endprog.search(line) >= 0: |
83 | 100 | pos = end = endprog.regs[0][1] |
84 | | - tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0) |
| 101 | + tokeneater(STRING, contstr + line[:end], |
| 102 | + strstart, (lnum, end), line) |
85 | 103 | contstr = '' |
86 | 104 | else: |
87 | 105 | contstr = contstr + line |
88 | 106 | continue |
89 | 107 |
|
90 | | - elif parenlev == 0 and not continued: # this is a new statement |
| 108 | + elif parenlev == 0 and not continued: # new statement |
91 | 109 | if not line: break |
92 | 110 | column = 0 |
93 | | - while 1: # measure leading whitespace |
| 111 | + while pos < max: # measure leading whitespace |
94 | 112 | if line[pos] == ' ': column = column + 1 |
95 | | - elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize |
| 113 | + elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize |
96 | 114 | elif line[pos] == '\f': column = 0 |
97 | 115 | else: break |
98 | 116 | pos = pos + 1 |
99 | | - if line[pos] in '#\n': continue # skip comments or blank lines |
| 117 | + if pos == max: break |
| 118 | + |
| 119 | + if line[pos] in '#\r\n': # skip comments or blank lines |
| 120 | + tokeneater((NEWLINE, COMMENT)[line[pos] == '#'], line[pos:], |
| 121 | + (lnum, pos), (lnum, len(line)), line) |
| 122 | + continue |
100 | 123 |
|
101 | 124 | if column > indents[-1]: # count indents or dedents |
102 | 125 | indents.append(column) |
103 | | - tokeneater(INDENT, '\t', linenum, line, 0, 0) |
| 126 | + tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) |
104 | 127 | while column < indents[-1]: |
105 | 128 | indents = indents[:-1] |
106 | | - tokeneater(DEDENT, '\t', linenum, line, 0, 0) |
| 129 | + tokeneater(DEDENT, line[:pos], (lnum, 0), (lnum, pos), line) |
107 | 130 |
|
108 | 131 | else: # continued statement |
109 | 132 | if not line: raise TokenError, "EOF within multi-line statement" |
110 | 133 | continued = 0 |
111 | 134 |
|
112 | 135 | while pos < max: |
113 | | - if tokenprog.match(line, pos) > 0: # scan for tokens |
114 | | - start, end = tokenprog.regs[3] |
115 | | - token = line[start:end] |
| 136 | + if pseudoprog.match(line, pos) > 0: # scan for tokens |
| 137 | + start, end = pseudoprog.regs[1] |
| 138 | + spos, epos = (lnum, start), (lnum, end) |
| 139 | + token, initial = line[start:end], line[start] |
116 | 140 | pos = end |
117 | 141 |
|
118 | | - if token[0] in namechars: # ordinary name |
119 | | - tokeneater(NAME, token, linenum, line, start, end) |
120 | | - elif token[0] in numchars: # ordinary number |
121 | | - tokeneater(NUMBER, token, linenum, line, start, end) |
122 | | - |
| 142 | + if initial in namechars: # ordinary name |
| 143 | + tokeneater(NAME, token, spos, epos, line) |
| 144 | + elif initial in numchars: # ordinary number |
| 145 | + tokeneater(NUMBER, token, spos, epos, line) |
| 146 | + elif initial in '\r\n': |
| 147 | + tokeneater(NEWLINE, token, spos, epos, line) |
| 148 | + elif initial == '#': |
| 149 | + tokeneater(COMMENT, token, spos, epos, line) |
| 150 | + elif initial == '\\': # continued stmt |
| 151 | + continued = 1 |
123 | 152 | elif token in ('\'\'\'', '"""'): # triple-quoted |
124 | 153 | endprog = endprogs[token] |
125 | 154 | if endprog.search(line, pos) >= 0: # all on one line |
126 | 155 | pos = endprog.regs[0][1] |
127 | | - token = line[start:pos] |
128 | | - tokeneater(STRING, token, linenum, line, start, pos) |
| 156 | + token = line[start:pos] |
| 157 | + tokeneater(STRING, token, spos, (lnum, pos), line) |
129 | 158 | else: |
130 | | - contstr = line[start:] # multiple lines |
| 159 | + strstart = (lnum, start) # multiple lines |
| 160 | + contstr = line[start:] |
131 | 161 | break |
132 | | - elif token[0] in '\'"': |
| 162 | + elif initial in '\'"': |
133 | 163 | if token[-1] == '\n': # continued string |
134 | | - endprog, contstr = endprogs[token[0]], line[start:] |
| 164 | + strstart = (lnum, start) |
| 165 | + endprog, contstr = endprogs[initial], line[start:] |
135 | 166 | break |
136 | 167 | else: # ordinary string |
137 | | - tokeneater(STRING, token, linenum, line, start, end) |
138 | | - |
139 | | - elif token[0] == '\n': |
140 | | - tokeneater(NEWLINE, token, linenum, line, start, end) |
141 | | - elif token[0] == '\\': # continued stmt |
142 | | - continued = 1 |
143 | | - |
| 168 | + tokeneater(STRING, token, spos, epos, line) |
144 | 169 | else: |
145 | | - if token[0] in '([{': parenlev = parenlev + 1 |
146 | | - if token[0] in ')]}': parenlev = parenlev - 1 |
147 | | - tokeneater(OP, token, linenum, line, start, end) |
| 170 | + if initial in '([{': parenlev = parenlev + 1 |
| 171 | + elif initial in ')]}': parenlev = parenlev - 1 |
| 172 | + tokeneater(OP, token, spos, epos, line) |
148 | 173 | else: |
149 | | - tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1) |
| 174 | + tokeneater(ERRORTOKEN, line[pos], spos, (lnum, pos+1), line) |
150 | 175 | pos = pos + 1 |
151 | 176 |
|
152 | 177 | for indent in indents[1:]: # pop remaining indent levels |
153 | | - tokeneater(DEDENT, '\t', linenum, line, 0, 0) |
| 178 | + tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '') |
154 | 179 |
|
if __name__ == '__main__':                     # testing
    import sys
    tokenize(open(sys.argv[-1]).readline)