2424 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
2525 'Michael Foord' )
2626
27+ import collections
2728import re , string , sys
2829from token import *
2930from codecs import lookup , BOM_UTF8
3031cookie_re = re .compile ("coding[:=]\s*([-\w.]+)" )
3132
3233import token
3334__all__ = [x for x in dir (token ) if x [0 ] != '_' ] + ["COMMENT" , "tokenize" ,
34- "detect_encoding" , "NL" , "untokenize" , "ENCODING" ]
35+ "detect_encoding" , "NL" , "untokenize" , "ENCODING" , "Tokenize" ]
3536del token
3637
3738COMMENT = N_TOKENS
4243tok_name [ENCODING ] = 'ENCODING'
4344N_TOKENS += 3
4445
46+ TokenInfo = collections .namedtuple ('TokenInfo' , 'type string start end line' )
47+
def group(*choices):
    """Return a regex fragment that matches any one of *choices*.

    The alternatives are joined with ``|`` and wrapped in a single
    capturing group, e.g. ``group('a', 'bc')`` -> ``'(a|bc)'``.
    """
    alternatives = '|'.join(choices)
    return '({})'.format(alternatives)
def any(*choices):
    """Return a regex fragment matching zero or more of the alternatives.

    NOTE: this intentionally shadows the ``any`` builtin within this
    module, matching the file's long-standing convention.
    """
    return '%s*' % group(*choices)
def maybe(*choices):
    """Return a regex fragment that optionally matches the alternatives.

    Equivalent to wrapping the grouped choices with a ``?`` quantifier
    (zero or one occurrence).
    """
    return '%s?' % group(*choices)
@@ -346,7 +349,7 @@ def _tokenize(readline, encoding):
346349 indents = [0 ]
347350
348351 if encoding is not None :
349- yield (ENCODING , encoding , (0 , 0 ), (0 , 0 ), '' )
352+ yield TokenInfo (ENCODING , encoding , (0 , 0 ), (0 , 0 ), '' )
350353 while True : # loop over lines in stream
351354 try :
352355 line = readline ()
@@ -364,12 +367,12 @@ def _tokenize(readline, encoding):
364367 endmatch = endprog .match (line )
365368 if endmatch :
366369 pos = end = endmatch .end (0 )
367- yield (STRING , contstr + line [:end ],
370+ yield TokenInfo (STRING , contstr + line [:end ],
368371 strstart , (lnum , end ), contline + line )
369372 contstr , needcont = '' , 0
370373 contline = None
371374 elif needcont and line [- 2 :] != '\\ \n ' and line [- 3 :] != '\\ \r \n ' :
372- yield (ERRORTOKEN , contstr + line ,
375+ yield TokenInfo (ERRORTOKEN , contstr + line ,
373376 strstart , (lnum , len (line )), contline )
374377 contstr = ''
375378 contline = None
@@ -394,25 +397,25 @@ def _tokenize(readline, encoding):
394397 if line [pos ] == '#' :
395398 comment_token = line [pos :].rstrip ('\r \n ' )
396399 nl_pos = pos + len (comment_token )
397- yield (COMMENT , comment_token ,
400+ yield TokenInfo (COMMENT , comment_token ,
398401 (lnum , pos ), (lnum , pos + len (comment_token )), line )
399- yield (NL , line [nl_pos :],
402+ yield TokenInfo (NL , line [nl_pos :],
400403 (lnum , nl_pos ), (lnum , len (line )), line )
401404 else :
402- yield ((NL , COMMENT )[line [pos ] == '#' ], line [pos :],
405+ yield TokenInfo ((NL , COMMENT )[line [pos ] == '#' ], line [pos :],
403406 (lnum , pos ), (lnum , len (line )), line )
404407 continue
405408
406409 if column > indents [- 1 ]: # count indents or dedents
407410 indents .append (column )
408- yield (INDENT , line [:pos ], (lnum , 0 ), (lnum , pos ), line )
411+ yield TokenInfo (INDENT , line [:pos ], (lnum , 0 ), (lnum , pos ), line )
409412 while column < indents [- 1 ]:
410413 if column not in indents :
411414 raise IndentationError (
412415 "unindent does not match any outer indentation level" ,
413416 ("<tokenize>" , lnum , pos , line ))
414417 indents = indents [:- 1 ]
415- yield (DEDENT , '' , (lnum , pos ), (lnum , pos ), line )
418+ yield TokenInfo (DEDENT , '' , (lnum , pos ), (lnum , pos ), line )
416419
417420 else : # continued statement
418421 if not line :
@@ -428,20 +431,20 @@ def _tokenize(readline, encoding):
428431
429432 if (initial in numchars or # ordinary number
430433 (initial == '.' and token != '.' and token != '...' )):
431- yield (NUMBER , token , spos , epos , line )
434+ yield TokenInfo (NUMBER , token , spos , epos , line )
432435 elif initial in '\r \n ' :
433- yield (NL if parenlev > 0 else NEWLINE ,
436+ yield TokenInfo (NL if parenlev > 0 else NEWLINE ,
434437 token , spos , epos , line )
435438 elif initial == '#' :
436439 assert not token .endswith ("\n " )
437- yield (COMMENT , token , spos , epos , line )
440+ yield TokenInfo (COMMENT , token , spos , epos , line )
438441 elif token in triple_quoted :
439442 endprog = endprogs [token ]
440443 endmatch = endprog .match (line , pos )
441444 if endmatch : # all on one line
442445 pos = endmatch .end (0 )
443446 token = line [start :pos ]
444- yield (STRING , token , spos , (lnum , pos ), line )
447+ yield TokenInfo (STRING , token , spos , (lnum , pos ), line )
445448 else :
446449 strstart = (lnum , start ) # multiple lines
447450 contstr = line [start :]
@@ -458,23 +461,23 @@ def _tokenize(readline, encoding):
458461 contline = line
459462 break
460463 else : # ordinary string
461- yield (STRING , token , spos , epos , line )
464+ yield TokenInfo (STRING , token , spos , epos , line )
462465 elif initial in namechars : # ordinary name
463- yield (NAME , token , spos , epos , line )
466+ yield TokenInfo (NAME , token , spos , epos , line )
464467 elif initial == '\\ ' : # continued stmt
465468 continued = 1
466469 else :
467470 if initial in '([{' : parenlev = parenlev + 1
468471 elif initial in ')]}' : parenlev = parenlev - 1
469- yield (OP , token , spos , epos , line )
472+ yield TokenInfo (OP , token , spos , epos , line )
470473 else :
471- yield (ERRORTOKEN , line [pos ],
474+ yield TokenInfo (ERRORTOKEN , line [pos ],
472475 (lnum , pos ), (lnum , pos + 1 ), line )
473476 pos = pos + 1
474477
475478 for indent in indents [1 :]: # pop remaining indent levels
476- yield (DEDENT , '' , (lnum , 0 ), (lnum , 0 ), '' )
477- yield (ENDMARKER , '' , (lnum , 0 ), (lnum , 0 ), '' )
479+ yield TokenInfo (DEDENT , '' , (lnum , 0 ), (lnum , 0 ), '' )
480+ yield TokenInfo (ENDMARKER , '' , (lnum , 0 ), (lnum , 0 ), '' )
478481
479482
480483# An undocumented, backwards compatible, API for all the places in the standard
0 commit comments