@@ -1609,38 +1609,40 @@ successive matches::
16091609 import collections
16101610 import re
16111611
1612-   Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
1612+   Token = collections.namedtuple('Token', ['type', 'value', 'line', 'column'])
16131613
16141614 def tokenize(code):
16151615 keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
16161616 token_specification = [
1617- ('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number
1618- ('ASSIGN', r':='), # Assignment operator
1619- ('END', r';'), # Statement terminator
1620- ('ID', r'[A-Za-z]+'), # Identifiers
1621- ('OP', r'[+\-*/]'), # Arithmetic operators
1622- ('NEWLINE', r'\n'), # Line endings
1623- ('SKIP', r'[ \t]+'), # Skip over spaces and tabs
1624- ('MISMATCH',r'.'), # Any other character
1617+ ('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number
1618+ ('ASSIGN', r':='), # Assignment operator
1619+ ('END', r';'), # Statement terminator
1620+ ('ID', r'[A-Za-z]+'), # Identifiers
1621+ ('OP', r'[+\-*/]'), # Arithmetic operators
1622+ ('NEWLINE', r'\n'), # Line endings
1623+ ('SKIP', r'[ \t]+'), # Skip over spaces and tabs
1624+ ('MISMATCH', r'.'), # Any other character
16251625 ]
16261626 tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
16271627 line_num = 1
16281628 line_start = 0
16291629 for mo in re.finditer(tok_regex, code):
16301630 kind = mo.lastgroup
1631- value = mo.group(kind)
1632- if kind == 'NEWLINE':
1631+ value = mo.group()
1632+ column = mo.start() - line_start
1633+ if kind == 'NUMBER':
1634+ value = float(value) if '.' in value else int(value)
1635+ elif kind == 'ID' and value in keywords:
1636+ kind = value
1637+ elif kind == 'NEWLINE':
16331638 line_start = mo.end()
16341639 line_num += 1
1640+ continue
16351641 elif kind == 'SKIP':
1636- pass
1642+ continue
16371643 elif kind == 'MISMATCH':
16381644 raise RuntimeError(f'{value!r} unexpected on line {line_num}')
1639- else:
1640- if kind == 'ID' and value in keywords:
1641- kind = value
1642- column = mo.start() - line_start
1643- yield Token(kind, value, line_num, column)
1645+ yield Token(kind, value, line_num, column)
16441646
16451647 statements = '''
16461648 IF quantity THEN
@@ -1654,25 +1656,25 @@ successive matches::
16541656
16551657The tokenizer produces the following output::
16561658
1657-   Token(typ='IF', value='IF', line=2, column=4)
1658-   Token(typ='ID', value='quantity', line=2, column=7)
1659-   Token(typ='THEN', value='THEN', line=2, column=16)
1660-   Token(typ='ID', value='total', line=3, column=8)
1661-   Token(typ='ASSIGN', value=':=', line=3, column=14)
1662-   Token(typ='ID', value='total', line=3, column=17)
1663-   Token(typ='OP', value='+', line=3, column=23)
1664-   Token(typ='ID', value='price', line=3, column=25)
1665-   Token(typ='OP', value='*', line=3, column=31)
1666-   Token(typ='ID', value='quantity', line=3, column=33)
1667-   Token(typ='END', value=';', line=3, column=41)
1668-   Token(typ='ID', value='tax', line=4, column=8)
1669-   Token(typ='ASSIGN', value=':=', line=4, column=12)
1670-   Token(typ='ID', value='price', line=4, column=15)
1671-   Token(typ='OP', value='*', line=4, column=21)
1672-   Token(typ='NUMBER', value='0.05', line=4, column=23)
1673-   Token(typ='END', value=';', line=4, column=27)
1674-   Token(typ='ENDIF', value='ENDIF', line=5, column=4)
1675-   Token(typ='END', value=';', line=5, column=9)
1659+   Token(type='IF', value='IF', line=2, column=4)
1660+   Token(type='ID', value='quantity', line=2, column=7)
1661+   Token(type='THEN', value='THEN', line=2, column=16)
1662+   Token(type='ID', value='total', line=3, column=8)
1663+   Token(type='ASSIGN', value=':=', line=3, column=14)
1664+   Token(type='ID', value='total', line=3, column=17)
1665+   Token(type='OP', value='+', line=3, column=23)
1666+   Token(type='ID', value='price', line=3, column=25)
1667+   Token(type='OP', value='*', line=3, column=31)
1668+   Token(type='ID', value='quantity', line=3, column=33)
1669+   Token(type='END', value=';', line=3, column=41)
1670+   Token(type='ID', value='tax', line=4, column=8)
1671+   Token(type='ASSIGN', value=':=', line=4, column=12)
1672+   Token(type='ID', value='price', line=4, column=15)
1673+   Token(type='OP', value='*', line=4, column=21)
1674+   Token(type='NUMBER', value=0.05, line=4, column=23)
1675+   Token(type='END', value=';', line=4, column=27)
1676+   Token(type='ENDIF', value='ENDIF', line=5, column=4)
1677+   Token(type='END', value=';', line=5, column=9)
16761678
16771679
16781680.. [Frie09] Friedl, Jeffrey. Mastering Regular Expressions. 3rd ed., O'Reilly
0 commit comments