Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit a48db39

Browse files
committed
Issue #5857: tokenize.tokenize() now returns named tuples.
1 parent c1edc2d commit a48db39

2 files changed

Lines changed: 28 additions & 20 deletions

File tree

Doc/library/tokenize.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,12 @@ The primary entry point is a :term:`generator`:
2727
column where the token begins in the source; a 2-tuple ``(erow, ecol)`` of
2828
ints specifying the row and column where the token ends in the source; and
2929
the line on which the token was found. The line passed (the last tuple item)
30-
is the *logical* line; continuation lines are included.
30+
is the *logical* line; continuation lines are included. The 5 tuple is
31+
returned as a :term:`named tuple` with the field names:
32+
``type string start end line``.
33+
34+
.. versionchanged:: 3.1
35+
Added support for named tuples.
3136

3237
:func:`tokenize` determines the source encoding of the file by looking for a
3338
UTF-8 BOM or encoding cookie, according to :pep:`263`.

Lib/tokenize.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,15 @@
2424
'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
2525
'Michael Foord')
2626

27+
import collections
2728
import re, string, sys
2829
from token import *
2930
from codecs import lookup, BOM_UTF8
3031
cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
3132

3233
import token
3334
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
34-
"detect_encoding", "NL", "untokenize", "ENCODING"]
35+
"detect_encoding", "NL", "untokenize", "ENCODING", "Tokenize"]
3536
del token
3637

3738
COMMENT = N_TOKENS
@@ -42,6 +43,8 @@
4243
tok_name[ENCODING] = 'ENCODING'
4344
N_TOKENS += 3
4445

46+
TokenInfo = collections.namedtuple('TokenInfo', 'type string start end line')
47+
4548
def group(*choices): return '(' + '|'.join(choices) + ')'
4649
def any(*choices): return group(*choices) + '*'
4750
def maybe(*choices): return group(*choices) + '?'
@@ -346,7 +349,7 @@ def _tokenize(readline, encoding):
346349
indents = [0]
347350

348351
if encoding is not None:
349-
yield (ENCODING, encoding, (0, 0), (0, 0), '')
352+
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
350353
while True: # loop over lines in stream
351354
try:
352355
line = readline()
@@ -364,12 +367,12 @@ def _tokenize(readline, encoding):
364367
endmatch = endprog.match(line)
365368
if endmatch:
366369
pos = end = endmatch.end(0)
367-
yield (STRING, contstr + line[:end],
370+
yield TokenInfo(STRING, contstr + line[:end],
368371
strstart, (lnum, end), contline + line)
369372
contstr, needcont = '', 0
370373
contline = None
371374
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
372-
yield (ERRORTOKEN, contstr + line,
375+
yield TokenInfo(ERRORTOKEN, contstr + line,
373376
strstart, (lnum, len(line)), contline)
374377
contstr = ''
375378
contline = None
@@ -394,25 +397,25 @@ def _tokenize(readline, encoding):
394397
if line[pos] == '#':
395398
comment_token = line[pos:].rstrip('\r\n')
396399
nl_pos = pos + len(comment_token)
397-
yield (COMMENT, comment_token,
400+
yield TokenInfo(COMMENT, comment_token,
398401
(lnum, pos), (lnum, pos + len(comment_token)), line)
399-
yield (NL, line[nl_pos:],
402+
yield TokenInfo(NL, line[nl_pos:],
400403
(lnum, nl_pos), (lnum, len(line)), line)
401404
else:
402-
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
405+
yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
403406
(lnum, pos), (lnum, len(line)), line)
404407
continue
405408

406409
if column > indents[-1]: # count indents or dedents
407410
indents.append(column)
408-
yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
411+
yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
409412
while column < indents[-1]:
410413
if column not in indents:
411414
raise IndentationError(
412415
"unindent does not match any outer indentation level",
413416
("<tokenize>", lnum, pos, line))
414417
indents = indents[:-1]
415-
yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
418+
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
416419

417420
else: # continued statement
418421
if not line:
@@ -428,20 +431,20 @@ def _tokenize(readline, encoding):
428431

429432
if (initial in numchars or # ordinary number
430433
(initial == '.' and token != '.' and token != '...')):
431-
yield (NUMBER, token, spos, epos, line)
434+
yield TokenInfo(NUMBER, token, spos, epos, line)
432435
elif initial in '\r\n':
433-
yield (NL if parenlev > 0 else NEWLINE,
436+
yield TokenInfo(NL if parenlev > 0 else NEWLINE,
434437
token, spos, epos, line)
435438
elif initial == '#':
436439
assert not token.endswith("\n")
437-
yield (COMMENT, token, spos, epos, line)
440+
yield TokenInfo(COMMENT, token, spos, epos, line)
438441
elif token in triple_quoted:
439442
endprog = endprogs[token]
440443
endmatch = endprog.match(line, pos)
441444
if endmatch: # all on one line
442445
pos = endmatch.end(0)
443446
token = line[start:pos]
444-
yield (STRING, token, spos, (lnum, pos), line)
447+
yield TokenInfo(STRING, token, spos, (lnum, pos), line)
445448
else:
446449
strstart = (lnum, start) # multiple lines
447450
contstr = line[start:]
@@ -458,23 +461,23 @@ def _tokenize(readline, encoding):
458461
contline = line
459462
break
460463
else: # ordinary string
461-
yield (STRING, token, spos, epos, line)
464+
yield TokenInfo(STRING, token, spos, epos, line)
462465
elif initial in namechars: # ordinary name
463-
yield (NAME, token, spos, epos, line)
466+
yield TokenInfo(NAME, token, spos, epos, line)
464467
elif initial == '\\': # continued stmt
465468
continued = 1
466469
else:
467470
if initial in '([{': parenlev = parenlev + 1
468471
elif initial in ')]}': parenlev = parenlev - 1
469-
yield (OP, token, spos, epos, line)
472+
yield TokenInfo(OP, token, spos, epos, line)
470473
else:
471-
yield (ERRORTOKEN, line[pos],
474+
yield TokenInfo(ERRORTOKEN, line[pos],
472475
(lnum, pos), (lnum, pos+1), line)
473476
pos = pos + 1
474477

475478
for indent in indents[1:]: # pop remaining indent levels
476-
yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
477-
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
479+
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
480+
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
478481

479482

480483
# An undocumented, backwards compatible, API for all the places in the standard

0 commit comments

Comments (0)