IMProved/improved/tokenize.py

"""The tokenizer"""
from .lex import lex
# Tags attached to each matched token.
RESERVED = 'RESERVED'
INT = 'INT'
ID = 'ID'
TOKENS = [
    # Whitespace and comments produce no token (tag None).
    (r'[ \n\t]+', None),
    (r'#[^\n]*', None),
    # Operators and punctuation; multi-character operators are listed
    # before their single-character prefixes so they are tried first.
    (r'\:=', RESERVED),
    (r'\(', RESERVED),
    (r'\)', RESERVED),
    (r';', RESERVED),
    (r'\+', RESERVED),
    (r'-', RESERVED),
    (r'\*\*', RESERVED),
    (r'\*', RESERVED),
    (r'/', RESERVED),
    (r'<=', RESERVED),
    (r'<', RESERVED),
    (r'>=', RESERVED),
    (r'>', RESERVED),
    (r'=', RESERVED),
    (r'!=', RESERVED),
    # Keywords.
    (r'and', RESERVED),
    (r'or', RESERVED),
    (r'not', RESERVED),
    (r'if', RESERVED),
    (r'then', RESERVED),
    (r'else', RESERVED),
    (r'while', RESERVED),
    (r'do', RESERVED),
    (r'end', RESERVED),
    # Literals and identifiers.
    (r'[0-9]+', INT),
    (r'[A-Za-z][A-Za-z0-9_]*', ID),
]
def tokenize(characters):
    """
    Tokenizes the input.

    characters -- the string to be tokenized
    returns -- a list of tuples of the form (contents, tag)
    """
    return lex(characters, TOKENS)
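

if __name__ == '__main__':
    # A minimal usage sketch, assuming lex() scans left to right, tries the
    # TOKENS patterns in order, returns (contents, tag) pairs as described
    # in the docstring, and drops matches whose tag is None.
    for contents, tag in tokenize('n := n + 1;'):
        print(tag, repr(contents))
    # Expected output under those assumptions:
    #   ID 'n'
    #   RESERVED ':='
    #   ID 'n'
    #   RESERVED '+'
    #   INT '1'
    #   RESERVED ';'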