"""The lexer""" import sys import re def lex(characters, token_exprs): """ A somewhat generic lexer. characters -- the string to be lexed token_exprs -- the tokens that consitute our grammar returns -- a list of tokens of the form (contents, tag) """ pos = 0 tokens = [] while pos < len(characters): match = None for token_expr in token_exprs: pattern, tag = token_expr regex = re.compile(pattern) match = regex.match(characters, pos) if match: text = match.group(0) if tag: token = (text, tag) tokens.append(token) break if not match: sys.stderr.write('[Parser] Illegal character: %s\\n' % characters[pos]) raise ValueError(characters[pos]) else: pos = match.end(0) return tokens