initial; lexer and parser

hellerve
2015-07-07 13:51:15 +02:00
parent 0bc4bc1616
commit f9c1f2a45d
7 changed files with 490 additions and 0 deletions

README.md Normal file

@@ -0,0 +1,6 @@
### IMProved
IMProved aims to build an interpreter for IMP (based on
[this](http://jayconrod.com/posts/37/a-simple-interpreter-from-scratch-in-python-part-1))
that is fully documented, occasionally improved, and
embeddable into Python.
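
As a usage sketch, assuming `callIMP` (exported from `improved/__init__.py`, still a stub in this commit) will eventually run a program string, embedding could look like this:

    # Hypothetical once callIMP is implemented; today it is a stub returning None.
    from improved import callIMP

    callIMP("""
    n := 5;
    fact := 1;
    while n > 0 do
        fact := fact * n;
        n := n - 1
    end
    """)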

improved/__init__.py Normal file

@@ -0,0 +1,12 @@
from .interpreter import callIMP
__author__ = 'Veit Heller'
__author_mail__ = 'veit@veitheller.de'
__version__ = '0.0.1'
__url__ = 'http://github.com/hellerver/IMProved'
__longdescr__ = """
An embeddable IMP interpreter for and in
Python.
"""
__classifiers__ = ['Topic :: Software Development :: Interpreters']
__keywords__ = ['IMP', 'interpreter', 'embeddable']

improved/interpreter.py Normal file

@@ -0,0 +1,2 @@
def callIMP(chars):
    """Interpret the given IMP source string. Not implemented yet."""
    pass

improved/lex.py Normal file

@@ -0,0 +1,33 @@
"""The lexer"""
import sys
import re
def lex(characters, token_exprs):
"""
A somewhat generic lexer.
characters -- the string to be lexed
token_exprs -- the tokens that consitute our grammar
returns -- a list of tokens of the form (contents, tag)
"""
pos = 0
tokens = []
while pos < len(characters):
match = None
for token_expr in token_exprs:
pattern, tag = token_expr
regex = re.compile(pattern)
match = regex.match(characters, pos)
if match:
text = match.group(0)
if tag:
token = (text, tag)
tokens.append(token)
break
if not match:
sys.stderr.write('[Parser] Illegal character: %s\\n' % characters[pos])
raise ValueError(characters[pos])
else:
pos = match.end(0)
return tokens
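
To make the contract of `lex` concrete, here is a small sketch driving it with a toy token table (the table below is illustrative only; the real one lives in `improved/tokenize.py` further down):

    from improved.lex import lex

    # Patterns are tried in order; a None tag means "match but discard".
    SAMPLE_TOKENS = [
        (r'[ \n\t]+', None),
        (r':=', 'RESERVED'),
        (r'[0-9]+', 'INT'),
        (r'[A-Za-z][A-Za-z0-9_]*', 'ID'),
    ]

    print(lex('x := 42', SAMPLE_TOKENS))
    # [('x', 'ID'), (':=', 'RESERVED'), ('42', 'INT')]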

improved/parsecomb.py Normal file

@@ -0,0 +1,370 @@
"""A minimal parser combinator library."""
class Result(object):
"""The result class. Returned by every parser."""
def __init__(self, value, pos):
"""
The result initializer.
value -- the value of the result node
pos -- the position of the result node
"""
self.value = value
self.pos = pos
def __repr__(self):
"""
A representation to make debugging easier.
returns -- A string of the form 'Result(value, position)'
"""
return 'Result(%s, %d)' % (self.value, self.pos)
class Parser(object):
"""
The parser superclass.
All parsers have to inherit from it.
"""
def __call__(self, tokens, pos):
"""
Makes the parser callable.
All subclasses need to override this method.
tokens -- the tokens with which the Parser is called
pos -- the token position
returns -- None
"""
return None
def __add__(self, other):
"""
Concatenates a parser to another parser.
returns -- a Concat object of both parsers
"""
return Concat(self, other)
def __mul__(self, other):
"""
"Multiplies" the parser to another parser.
returns -- an Exp object of both parsers
"""
return Exp(self, other)
def __or__(self, other):
"""
Alternates between this parser and another.
returns -- an Alternate object of both parsers
"""
return Alternate(self, other)
def __xor__(self, function):
"""
Applies a function to the parser's results.
returns -- a Process object of the parser and the function
"""
return Process(self, function)
class Reserved(Parser):
    """The parser for reserved words"""

    def __init__(self, value, tag):
        """
        The initialization method.
        value -- the reserved word
        tag -- the RESERVED tag
        """
        self.value = value
        self.tag = tag

    def __call__(self, tokens, pos):
        """
        Call parser. Returns a result if the token at the position
        matches the reserved word. Otherwise returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        # lex builds tokens as (text, tag) tuples, so compare against a tuple
        if pos < len(tokens) and tokens[pos] == (self.value, self.tag):
            return Result(tokens[pos][0], pos + 1)
        else:
            return None

class Tag(Parser):
    """
    The parser for tags. Matches anything if the tag matches,
    regardless of its value.
    """

    def __init__(self, tag):
        """
        The initialization method.
        tag -- the tag that should match
        """
        self.tag = tag

    def __call__(self, tokens, pos):
        """
        Call parser. Returns a result if the tag of the
        token at the specified position matches, regardless
        of its value. Otherwise returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        if pos < len(tokens) and tokens[pos][1] is self.tag:
            return Result(tokens[pos][0], pos + 1)
        else:
            return None

class Concat(Parser):
    """The concat combinator. Applies two parsers in sequence."""

    def __init__(self, left, right):
        """
        The initialization method.
        left -- the first parser
        right -- the second parser
        """
        self.left = left
        self.right = right

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns a result if both
        parsers match, otherwise returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result whose value is a tuple of both parsers' values | None
        """
        left_result = self.left(tokens, pos)
        if left_result:
            right_result = self.right(tokens, left_result.pos)
            if right_result:
                combined_value = (left_result.value, right_result.value)
                return Result(combined_value, right_result.pos)
        return None

class Alternate(Parser):
    """The alternate combinator. Parses using either of two parsers."""

    def __init__(self, left, right):
        """
        The initialization method.
        left -- the first parser
        right -- the second parser
        """
        self.left = left
        self.right = right

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns a result if either
        parser matches, otherwise returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- the Result object of either parser | None
        """
        left_result = self.left(tokens, pos)
        if left_result:
            return left_result
        else:
            right_result = self.right(tokens, pos)
            return right_result

class Opt(Parser):
    """The optional combinator. Always returns a result."""

    def __init__(self, parser):
        """
        The initialization method.
        parser -- the parser to wrap
        """
        self.parser = parser

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns either the
        parser's result or an empty result.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object
        """
        result = self.parser(tokens, pos)
        if result:
            return result
        else:
            return Result(None, pos)


class Rep(Parser):
    """The repetition combinator. Applies a parser until it fails."""

    def __init__(self, parser):
        """
        The initialization method.
        parser -- the parser to wrap
        """
        self.parser = parser

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns a Result whose value
        is a list of the parsed values (one for each
        successful application).
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object
        """
        results = []
        result = self.parser(tokens, pos)
        while result:
            results.append(result.value)
            pos = result.pos
            result = self.parser(tokens, pos)
        return Result(results, pos)

class Process(Parser):
    """
    The process combinator. Applies a function that
    manipulates the parser's result.
    """

    def __init__(self, parser, function):
        """
        The initialization method.
        parser -- the parser to wrap
        function -- the manipulation function
        """
        self.parser = parser
        self.function = function

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns the Result
        object whose value was manipulated by the
        given function.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        result = self.parser(tokens, pos)
        if result:
            result.value = self.function(result.value)
        return result


class Lazy(Parser):
    """
    The lazy combinator. Builds a parser only if needed.
    This makes recursive parsers possible.
    """

    def __init__(self, parser_func):
        """
        The initialization method.
        parser_func -- a function building a parser
        """
        self.parser = None
        self.parser_func = parser_func

    def __call__(self, tokens, pos):
        """
        Builds the parser (if necessary) and returns its result.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        if not self.parser:
            self.parser = self.parser_func()
        return self.parser(tokens, pos)

class Phrase(Parser):
    """
    The phrase combinator. Applies a parser and
    only succeeds if it consumed all remaining input.
    """

    def __init__(self, parser):
        """
        The initialization method.
        parser -- the parser to wrap
        """
        self.parser = parser

    def __call__(self, tokens, pos):
        """
        Calls the parser. If there is a result
        and nothing is left to parse, it returns
        the result. Otherwise it returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        result = self.parser(tokens, pos)
        if result and result.pos == len(tokens):
            return result
        else:
            return None


class Exp(Parser):
    """
    The compound statement parser.
    Workaround for problems with left recursion.
    """

    def __init__(self, parser, separator):
        """
        The initialization method.
        parser -- the parser to wrap
        separator -- the parser that parses the compound separators
        """
        self.parser = parser
        self.separator = separator

    def __call__(self, tokens, pos):
        """
        Calls the parser. Like Rep, it applies
        a parser multiple times, but it also
        keeps track of the separators.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        result = self.parser(tokens, pos)

        def process_next(parsed):
            (sepfunc, right) = parsed
            return sepfunc(result.value, right)

        next_parser = self.separator + self.parser ^ process_next
        next_result = result
        while next_result:
            next_result = next_parser(tokens, result.pos)
            if next_result:
                result = next_result
        return result
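
As a hedged sketch of how these combinators compose (assuming the tokenizer defined below; `num` and `addition` are hypothetical names for this example, not part of the library):

    from improved.parsecomb import Tag, Reserved, Phrase
    from improved.tokenize import tokenize, RESERVED, INT

    # Tag(INT) accepts any integer token; ^ converts its text into an int.
    num = Tag(INT) ^ int
    # Concatenation nests values as ((left, '+'), right); ^ folds them into a sum.
    addition = num + Reserved('+', RESERVED) + num ^ (lambda parsed: parsed[0][0] + parsed[1])

    print(Phrase(addition)(tokenize('1 + 2'), 0))
    # Result(3, 3) -- the value 3, with all three tokens consumed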

improved/tokenize.py Normal file

@@ -0,0 +1,47 @@
"""The tokenizer"""
from .lex import lex
RESERVED = 'RESERVED'
INT = 'INT'
ID = 'ID'
TOKENS = [
(r'[ \\n\\t]+', None),
(r'#[^\\n]*', None),
(r'\:=', RESERVED),
(r'\(', RESERVED),
(r'\)', RESERVED),
(r';', RESERVED),
(r'\+', RESERVED),
(r'-', RESERVED),
(r'\*\*', RESERVED),
(r'\*', RESERVED),
(r'/', RESERVED),
(r'<=', RESERVED),
(r'<', RESERVED),
(r'>=', RESERVED),
(r'>', RESERVED),
(r'=', RESERVED),
(r'!=', RESERVED),
(r'and', RESERVED),
(r'or', RESERVED),
(r'not', RESERVED),
(r'if', RESERVED),
(r'then', RESERVED),
(r'else', RESERVED),
(r'while', RESERVED),
(r'do', RESERVED),
(r'end', RESERVED),
(r'[0-9]+', INT),
(r'[A-Za-z][A-Za-z0-9_]*', ID),
]
def tokenize(characters):
"""
Tokenizes the input.
characters -- the string to be tokenized
returns -- a list of tuples of the form (contents, tag)
"""
return lex(characters, TOKENS)
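
For reference, a quick sketch of the token stream this table produces (whitespace and comments are dropped because their tag is None):

    from improved.tokenize import tokenize

    print(tokenize('n := 2 ** 3  # comments are dropped'))
    # [('n', 'ID'), (':=', 'RESERVED'), ('2', 'INT'), ('**', 'RESERVED'), ('3', 'INT')]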

setup.py Normal file

@@ -0,0 +1,20 @@
#!/usr/bin/env python
"""An installation script for IMProved."""
from distutils.core import setup

import improved

setup(
    name='IMProved',
    description='An embeddable IMP interpreter',
    author=improved.__author__,
    author_email=improved.__author_mail__,
    version=improved.__version__,
    url=improved.__url__,
    long_description=improved.__longdescr__,
    classifiers=improved.__classifiers__,
    keywords=improved.__keywords__,
    packages=['improved'],
    license="GPLv2",
    platforms=['Linux', 'OS X', 'Windows']
)