diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1c0cd5d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+### IMProved
+
+IMProved aims to build an interpreter for IMP (based on
+[this](http://jayconrod.com/posts/37/a-simple-interpreter-from-scratch-in-python-part-1))
+that is fully documented, occasionally improved and
+embeddable into Python.
diff --git a/improved/__init__.py b/improved/__init__.py
new file mode 100644
index 0000000..26311d3
--- /dev/null
+++ b/improved/__init__.py
@@ -0,0 +1,12 @@
+from .interpreter import callIMP
+
+__author__ = 'Veit Heller'
+__author_mail__ = 'veit@veitheller.de'
+__version__ = '0.0.1'
+__url__ = 'http://github.com/hellerver/IMProved'
+__longdescr__ = """
+    An embeddable IMP interpreter for and in
+    Python.
+    """
+__classifiers__ = ['Topic :: Software Development :: Interpreters']
+__keywords__ = ['IMP', 'interpreter', 'embeddable']
diff --git a/improved/interpreter.py b/improved/interpreter.py
new file mode 100644
index 0000000..8fc8061
--- /dev/null
+++ b/improved/interpreter.py
@@ -0,0 +1,3 @@
+def callIMP(chars):
+    """Interprets the given IMP source string. Not implemented yet."""
+    pass
diff --git a/improved/lex.py b/improved/lex.py
new file mode 100644
index 0000000..2f2c284
--- /dev/null
+++ b/improved/lex.py
@@ -0,0 +1,33 @@
+"""The lexer"""
+import sys
+import re
+
+def lex(characters, token_exprs):
+    """
+    A somewhat generic lexer.
+
+    characters -- the string to be lexed
+    token_exprs -- the tokens that constitute our grammar
+
+    returns -- a list of tokens of the form (contents, tag)
+    """
+    pos = 0
+    tokens = []
+    while pos < len(characters):
+        match = None
+        for token_expr in token_exprs:
+            pattern, tag = token_expr
+            regex = re.compile(pattern)
+            match = regex.match(characters, pos)
+            if match:
+                text = match.group(0)
+                if tag:
+                    token = (text, tag)
+                    tokens.append(token)
+                break
+        if not match:
+            sys.stderr.write('[Lexer] Illegal character: %s\n' % characters[pos])
+            raise ValueError(characters[pos])
+        else:
+            pos = match.end(0)
+    return tokens
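Note (review commentary, not part of the patch): a minimal smoke test of `lex`, assuming the package is importable and borrowing the `TOKENS` table that `improved/tokenize.py` defines further down in this diff:

```python
# Illustrative only: patterns tagged None (whitespace, comments) are
# matched but dropped from the token list.
from improved.lex import lex
from improved.tokenize import TOKENS

print(lex('x := 42', TOKENS))
# [('x', 'ID'), (':=', 'RESERVED'), ('42', 'INT')]
```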
diff --git a/improved/parsecomb.py b/improved/parsecomb.py
new file mode 100644
index 0000000..becb4a4
--- /dev/null
+++ b/improved/parsecomb.py
@@ -0,0 +1,369 @@
+"""A minimal parser combinator library."""
+
+class Result(object):
+    """The result class. Returned by every parser."""
+    def __init__(self, value, pos):
+        """
+        The result initializer.
+
+        value -- the value of the result node
+        pos -- the position of the result node
+        """
+        self.value = value
+        self.pos = pos
+
+    def __repr__(self):
+        """
+        A representation to make debugging easier.
+
+        returns -- a string of the form 'Result(value, position)'
+        """
+        return 'Result(%s, %d)' % (self.value, self.pos)
+
+class Parser(object):
+    """
+    The parser superclass.
+    All parsers have to inherit from it.
+    """
+    def __call__(self, tokens, pos):
+        """
+        Makes the parser callable.
+        All subclasses need to override this method.
+
+        tokens -- the tokens with which the Parser is called
+        pos -- the token position
+
+        returns -- None
+        """
+        return None
+
+    def __add__(self, other):
+        """
+        Concatenates this parser with another parser.
+
+        returns -- a Concat object of both parsers
+        """
+        return Concat(self, other)
+
+    def __mul__(self, other):
+        """
+        "Multiplies" this parser with another parser.
+
+        returns -- an Exp object of both parsers
+        """
+        return Exp(self, other)
+
+    def __or__(self, other):
+        """
+        Alternates between this parser and another.
+
+        returns -- an Alternate object of both parsers
+        """
+        return Alternate(self, other)
+
+    def __xor__(self, function):
+        """
+        Applies a function to the parser's results.
+
+        returns -- a Process object of the parser and the function
+        """
+        return Process(self, function)
+
+class Reserved(Parser):
+    """The parser for reserved words."""
+    def __init__(self, value, tag):
+        """
+        The initialization method.
+
+        value -- the reserved word
+        tag -- the RESERVED tag
+        """
+        self.value = value
+        self.tag = tag
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. Returns a result if the token at the position
+        matches the reserved word. Otherwise returns None.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result object | None
+        """
+        if pos < len(tokens) and tokens[pos] == (self.value, self.tag):
+            return Result(tokens[pos][0], pos + 1)
+        else:
+            return None
+
+class Tag(Parser):
+    """
+    The parser for tags. Matches anything if the tag matches,
+    regardless of its value.
+    """
+    def __init__(self, tag):
+        """
+        The initialization method.
+
+        tag -- the tag that should match
+        """
+        self.tag = tag
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. Returns a result if the tag of the
+        token at the specified position matches, regardless
+        of its value. Otherwise returns None.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result object | None
+        """
+        if pos < len(tokens) and tokens[pos][1] is self.tag:
+            return Result(tokens[pos][0], pos + 1)
+        else:
+            return None
+
+class Concat(Parser):
+    """The concat combinator. Parses sequences of two tokens."""
+    def __init__(self, left, right):
+        """
+        The initialization method.
+
+        left -- the first parser
+        right -- the second parser
+        """
+        self.left = left
+        self.right = right
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. Returns a result if both
+        parsers match, otherwise returns None.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result whose value is a tuple of both values | None
+        """
+        left_result = self.left(tokens, pos)
+        if left_result:
+            right_result = self.right(tokens, left_result.pos)
+            if right_result:
+                combined_value = (left_result.value, right_result.value)
+                return Result(combined_value, right_result.pos)
+        return None
+
+class Alternate(Parser):
+    """The alternate combinator. Parses using either of two parsers."""
+    def __init__(self, left, right):
+        """
+        The initialization method.
+
+        left -- the first parser
+        right -- the second parser
+        """
+        self.left = left
+        self.right = right
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. Returns a result if either
+        parser matches, otherwise returns None.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- the Result object of either parser | None
+        """
+        left_result = self.left(tokens, pos)
+        if left_result:
+            return left_result
+        else:
+            right_result = self.right(tokens, pos)
+            return right_result
+
+class Opt(Parser):
+    """The optional combinator. Always returns a result."""
+    def __init__(self, parser):
+        """
+        The initialization method.
+
+        parser -- the parser to wrap
+        """
+        self.parser = parser
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. Returns either the
+        parser's result or an empty result.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result object
+        """
+        result = self.parser(tokens, pos)
+        if result:
+            return result
+        else:
+            return Result(None, pos)
+
+class Rep(Parser):
+    """The repetition combinator. Applies a parser until it fails."""
+    def __init__(self, parser):
+        """
+        The initialization method.
+
+        parser -- the parser to wrap
+        """
+        self.parser = parser
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. Returns a Result
+        that contains a list of values (one
+        for each successful application).
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result object
+        """
+        results = []
+        result = self.parser(tokens, pos)
+        while result:
+            results.append(result.value)
+            pos = result.pos
+            result = self.parser(tokens, pos)
+        return Result(results, pos)
+
+class Process(Parser):
+    """
+    The process combinator. Applies a function that
+    manipulates the parser's result.
+    """
+    def __init__(self, parser, function):
+        """
+        The initialization method.
+
+        parser -- the parser to wrap
+        function -- the manipulation function
+        """
+        self.parser = parser
+        self.function = function
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. Returns the Result
+        object whose value was manipulated by the
+        given function.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result object | None
+        """
+        result = self.parser(tokens, pos)
+        if result:
+            result.value = self.function(result.value)
+            return result
+
+class Lazy(Parser):
+    """
+    The lazy combinator. Builds a parser only if needed.
+    This makes recursive parsers possible.
+    """
+    def __init__(self, parser_func):
+        """
+        The initialization method.
+
+        parser_func -- a function building a parser
+        """
+        self.parser = None
+        self.parser_func = parser_func
+
+    def __call__(self, tokens, pos):
+        """
+        Builds the parser and returns its result.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result object | None
+        """
+        if not self.parser:
+            self.parser = self.parser_func()
+        return self.parser(tokens, pos)
+
+class Phrase(Parser):
+    """
+    The phrase combinator. Applies a parser and
+    only succeeds if it consumed all remaining input.
+    """
+    def __init__(self, parser):
+        """
+        The initialization method.
+
+        parser -- the parser to wrap
+        """
+        self.parser = parser
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. If there is a result
+        and nothing is left to parse, it returns
+        the result. Otherwise it returns None.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result object | None
+        """
+        result = self.parser(tokens, pos)
+        if result and result.pos == len(tokens):
+            return result
+        else:
+            return None
+
+class Exp(Parser):
+    """
+    The compound statement parser.
+    Workaround for problems with left recursion.
+    """
+    def __init__(self, parser, separator):
+        """
+        The initialization method.
+
+        parser -- the parser to wrap
+        separator -- the parser that parses the compound
+                     separators
+        """
+        self.parser = parser
+        self.separator = separator
+
+    def __call__(self, tokens, pos):
+        """
+        Calls the parser. Like Rep, it applies
+        a parser multiple times, but it also
+        keeps track of the separators.
+
+        tokens -- the token list
+        pos -- the position to check
+
+        returns -- a Result object | None
+        """
+        result = self.parser(tokens, pos)
+
+        def process_next(parsed):
+            (sepfunc, right) = parsed
+            return sepfunc(result.value, right)
+        next_parser = self.separator + self.parser ^ process_next
+
+        next_result = result
+        while next_result:
+            next_result = next_parser(tokens, result.pos)
+            if next_result:
+                result = next_result
+        return result
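Note (review commentary, not part of the patch): a sketch of how the overloaded operators compose, assuming the package is importable. `tokenize`, `RESERVED` and `INT` come from `improved/tokenize.py` below, and the snippet relies on the tuple comparison in `Reserved` above:

```python
from improved.parsecomb import Reserved, Tag, Phrase
from improved.tokenize import tokenize, RESERVED, INT

# Concat (+) chains parsers; Process (^) folds the nested value tuples.
addition = (Tag(INT) + Reserved('+', RESERVED) + Tag(INT)
            ^ (lambda parsed: int(parsed[0][0]) + int(parsed[1])))
print(addition(tokenize('1 + 2'), 0))          # Result(3, 3)

# Exp (*) repeats a parser around a separator whose processed value is
# a function combining the left and right results, left-associatively.
plus = Reserved('+', RESERVED) ^ (lambda _: lambda l, r: l + r)
sums = (Tag(INT) ^ int) * plus
print(Phrase(sums)(tokenize('1 + 2 + 3'), 0))  # Result(6, 5)
```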
diff --git a/improved/tokenize.py b/improved/tokenize.py
new file mode 100644
index 0000000..d1a27af
--- /dev/null
+++ b/improved/tokenize.py
@@ -0,0 +1,47 @@
+"""The tokenizer"""
+from .lex import lex
+
+RESERVED = 'RESERVED'
+INT = 'INT'
+ID = 'ID'
+
+TOKENS = [
+    (r'[ \n\t]+', None),
+    (r'#[^\n]*', None),
+    (r':=', RESERVED),
+    (r'\(', RESERVED),
+    (r'\)', RESERVED),
+    (r';', RESERVED),
+    (r'\+', RESERVED),
+    (r'-', RESERVED),
+    (r'\*\*', RESERVED),
+    (r'\*', RESERVED),
+    (r'/', RESERVED),
+    (r'<=', RESERVED),
+    (r'<', RESERVED),
+    (r'>=', RESERVED),
+    (r'>', RESERVED),
+    (r'=', RESERVED),
+    (r'!=', RESERVED),
+    (r'and', RESERVED),
+    (r'or', RESERVED),
+    (r'not', RESERVED),
+    (r'if', RESERVED),
+    (r'then', RESERVED),
+    (r'else', RESERVED),
+    (r'while', RESERVED),
+    (r'do', RESERVED),
+    (r'end', RESERVED),
+    (r'[0-9]+', INT),
+    (r'[A-Za-z][A-Za-z0-9_]*', ID),
+]
+
+def tokenize(characters):
+    """
+    Tokenizes the input.
+
+    characters -- the string to be tokenized
+
+    returns -- a list of tuples of the form (contents, tag)
+    """
+    return lex(characters, TOKENS)
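Note (review commentary, not part of the patch): the whitespace and comment patterns carry the tag None, so the lexer matches but discards them:

```python
from improved.tokenize import tokenize

print(tokenize('n := 5 # a comment'))
# [('n', 'ID'), (':=', 'RESERVED'), ('5', 'INT')]
```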
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a32cb6c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+"""An installation script for IMProved."""
+
+from distutils.core import setup
+import improved
+
+setup(
+    name='IMProved',
+    description='An embeddable IMP interpreter',
+    author=improved.__author__,
+    author_email=improved.__author_mail__,
+    version=improved.__version__,
+    url=improved.__url__,
+    long_description=improved.__longdescr__,
+    classifiers=improved.__classifiers__,
+    keywords=improved.__keywords__,
+    packages=['improved'],
+    license="GPLv2",
+    platforms=['Linux', 'OS X', 'Windows']
+)
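Note (review commentary, not part of the patch): `callIMP` is still a stub. A hypothetical sketch of the eventual wiring, to make the intended pipeline concrete; `parse` and `eval` do not exist anywhere in this diff, so everything past the lexing step is an assumption rather than the author's design:

```python
from improved.tokenize import tokenize

def call_imp_sketch(chars):
    tokens = tokenize(chars)  # lexing is the only step this diff implements
    # ast = parse(tokens)     # hypothetical: a parser built from parsecomb
    # env = {}                # hypothetical: the variable environment
    # ast.eval(env)           # hypothetical: evaluation of the AST
    return tokens             # for now, only the token stream is real
```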