34 lines
949 B
Python
34 lines
949 B
Python
"""The lexer"""
|
|
import sys
|
|
import re
|
|
|
|
def lex(characters, token_exprs):
    """
    A somewhat generic lexer.

    At each position the rules in token_exprs are tried in order and the
    first one that matches a non-empty piece of text wins.

    characters -- the string to be lexed
    token_exprs -- the tokens that constitute our grammar: an iterable of
                   (pattern, tag) pairs, where pattern is a regular
                   expression and tag labels the token. A falsy tag
                   (e.g. None) means the matched text is consumed but no
                   token is emitted for it.

    returns -- a list of tokens of the form (contents, tag)

    raises -- ValueError carrying the offending character when no rule
              matches at the current position (a diagnostic is also
              written to stderr).
    """
    # Compile every pattern once up front rather than once per rule per
    # scan position; this also lets token_exprs be a one-shot iterable.
    compiled_exprs = [(re.compile(pattern), tag) for pattern, tag in token_exprs]

    pos = 0
    end = len(characters)
    tokens = []
    while pos < end:
        for regex, tag in compiled_exprs:
            match = regex.match(characters, pos)
            # A zero-length match consumes nothing, so accepting it would
            # leave pos unchanged and loop forever; treat it as no match.
            if match and match.end(0) > pos:
                break
        else:
            # No rule consumed any input at this position.
            sys.stderr.write('[Parser] Illegal character: %s\n' % characters[pos])
            raise ValueError(characters[pos])

        if tag:
            tokens.append((match.group(0), tag))
        pos = match.end(0)
    return tokens
|