Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit c566431

Browse files
committed
Improve regex tokenizer example by using re.finditer().
Also, improve variable names and fix column numbers in the generated output.
1 parent 4036d87 commit c566431

1 file changed

Lines changed: 38 additions & 37 deletions

File tree

Doc/library/re.rst

Lines changed: 38 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1333,7 +1333,7 @@ successive matches::
13331333

13341334
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
13351335

1336-
def tokenize(s):
1336+
def tokenize(code):
13371337
keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
13381338
token_specification = [
13391339
('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number
@@ -1343,26 +1343,27 @@ successive matches::
13431343
('OP', r'[+\-*/]'), # Arithmetic operators
13441344
('NEWLINE', r'\n'), # Line endings
13451345
('SKIP', r'[ \t]+'), # Skip over spaces and tabs
1346+
('MISMATCH',r'.'), # Any other character
13461347
]
13471348
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
13481349
get_token = re.compile(tok_regex).match
1349-
line = 1
1350-
pos = line_start = 0
1351-
mo = get_token(s)
1352-
while mo is not None:
1353-
typ = mo.lastgroup
1354-
if typ == 'NEWLINE':
1355-
line_start = pos
1356-
line += 1
1357-
elif typ != 'SKIP':
1358-
val = mo.group(typ)
1359-
if typ == 'ID' and val in keywords:
1360-
typ = val
1361-
yield Token(typ, val, line, mo.start()-line_start)
1362-
pos = mo.end()
1363-
mo = get_token(s, pos)
1364-
if pos != len(s):
1365-
raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line))
1350+
line_num = 1
1351+
line_start = 0
1352+
for mo in re.finditer(tok_regex, code):
1353+
kind = mo.lastgroup
1354+
value = mo.group(kind)
1355+
if kind == 'NEWLINE':
1356+
line_start = mo.end()
1357+
line_num += 1
1358+
elif kind == 'SKIP':
1359+
pass
1360+
elif kind == 'MISMATCH':
1361+
raise RuntimeError('%r unexpected on line %d' % (value, line_num))
1362+
else:
1363+
if kind == 'ID' and value in keywords:
1364+
kind = value
1365+
column = mo.start() - line_start
1366+
yield Token(kind, value, line_num, column)
13661367

13671368
statements = '''
13681369
IF quantity THEN
@@ -1376,22 +1377,22 @@ successive matches::
13761377

13771378
The tokenizer produces the following output::
13781379

1379-
Token(typ='IF', value='IF', line=2, column=5)
1380-
Token(typ='ID', value='quantity', line=2, column=8)
1381-
Token(typ='THEN', value='THEN', line=2, column=17)
1382-
Token(typ='ID', value='total', line=3, column=9)
1383-
Token(typ='ASSIGN', value=':=', line=3, column=15)
1384-
Token(typ='ID', value='total', line=3, column=18)
1385-
Token(typ='OP', value='+', line=3, column=24)
1386-
Token(typ='ID', value='price', line=3, column=26)
1387-
Token(typ='OP', value='*', line=3, column=32)
1388-
Token(typ='ID', value='quantity', line=3, column=34)
1389-
Token(typ='END', value=';', line=3, column=42)
1390-
Token(typ='ID', value='tax', line=4, column=9)
1391-
Token(typ='ASSIGN', value=':=', line=4, column=13)
1392-
Token(typ='ID', value='price', line=4, column=16)
1393-
Token(typ='OP', value='*', line=4, column=22)
1394-
Token(typ='NUMBER', value='0.05', line=4, column=24)
1395-
Token(typ='END', value=';', line=4, column=28)
1396-
Token(typ='ENDIF', value='ENDIF', line=5, column=5)
1397-
Token(typ='END', value=';', line=5, column=10)
1380+
Token(typ='IF', value='IF', line=2, column=4)
1381+
Token(typ='ID', value='quantity', line=2, column=7)
1382+
Token(typ='THEN', value='THEN', line=2, column=16)
1383+
Token(typ='ID', value='total', line=3, column=8)
1384+
Token(typ='ASSIGN', value=':=', line=3, column=14)
1385+
Token(typ='ID', value='total', line=3, column=17)
1386+
Token(typ='OP', value='+', line=3, column=23)
1387+
Token(typ='ID', value='price', line=3, column=25)
1388+
Token(typ='OP', value='*', line=3, column=31)
1389+
Token(typ='ID', value='quantity', line=3, column=33)
1390+
Token(typ='END', value=';', line=3, column=41)
1391+
Token(typ='ID', value='tax', line=4, column=8)
1392+
Token(typ='ASSIGN', value=':=', line=4, column=12)
1393+
Token(typ='ID', value='price', line=4, column=15)
1394+
Token(typ='OP', value='*', line=4, column=21)
1395+
Token(typ='NUMBER', value='0.05', line=4, column=23)
1396+
Token(typ='END', value=';', line=4, column=27)
1397+
Token(typ='ENDIF', value='ENDIF', line=5, column=4)
1398+
Token(typ='END', value=';', line=5, column=9)

0 commit comments

Comments
 (0)