"""The tokenizer"""

from .lex import lex


# Tags attached to each token; lex returns (contents, tag) pairs.
RESERVED = 'RESERVED'
INT = 'INT'
ID = 'ID'

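# Patterns below are tried in the order listed; the companion lex module
# (imported above) is assumed here to take the first pattern that matches
# at each position and to discard matches whose tag is None, so whitespace
# and comments never appear in the output. That ordering is load-bearing:
# '**' must precede '*', '<=' must precede '<', and every keyword must
# precede the catch-all identifier pattern. (Keywords are not anchored
# with \b, so an input like 'android' would lex as the keyword 'and'
# followed by the identifier 'roid'.)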
TOKENS = [
    (r'[ \n\t]+',              None),
    (r'#[^\n]*',               None),
    (r'\:=',                   RESERVED),
    (r'\(',                    RESERVED),
    (r'\)',                    RESERVED),
    (r';',                     RESERVED),
    (r'\+',                    RESERVED),
    (r'-',                     RESERVED),
    (r'\*\*',                  RESERVED),
    (r'\*',                    RESERVED),
    (r'/',                     RESERVED),
    (r'<=',                    RESERVED),
    (r'<',                     RESERVED),
    (r'>=',                    RESERVED),
    (r'>',                     RESERVED),
    (r'=',                     RESERVED),
    (r'!=',                    RESERVED),
    (r'and',                   RESERVED),
    (r'or',                    RESERVED),
    (r'not',                   RESERVED),
    (r'if',                    RESERVED),
    (r'then',                  RESERVED),
    (r'else',                  RESERVED),
    (r'while',                 RESERVED),
    (r'do',                    RESERVED),
    (r'end',                   RESERVED),
    (r'[0-9]+',                INT),
    (r'[A-Za-z][A-Za-z0-9_]*', ID),
]


def tokenize(characters):
    """Tokenizes the input.

    characters -- the string to be tokenized

    returns -- a list of tuples of the form (contents, tag)
    """
    return lex(characters, TOKENS)
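

# A minimal usage sketch (hypothetical input; the output shape follows the
# (contents, tag) convention documented above, assuming lex drops the
# None-tagged whitespace and comment matches):
#
#     >>> tokenize('x := x + 1  # increment')
#     [('x', 'ID'), (':=', 'RESERVED'), ('x', 'ID'),
#      ('+', 'RESERVED'), ('1', 'INT')]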