initial; lexer and parser

2015-07-07 13:51:15 +02:00
parent 0bc4bc1616
commit f9c1f2a45d
7 changed files with 490 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,6 @@
 ### IMProved
 IMProved aims to build an interpreter for IMP (based on
 [this](http://jayconrod.com/posts/37/a-simple-interpreter-from-scratch-in-python-part-1))
 that is fully documented, occasionally improved and
 embeddable into Python.
--- a/improved/init.py
+++ b/improved/init.py
@@ -0,0 +1,12 @@
 from .interpreter import callIMP
 # Package metadata, kept as module-level attributes.
 # Author name and contact address.
 __author__ = 'Veit Heller'
 __author_mail__ = 'veit@veitheller.de'
 # Version string of the package.
 __version__ = '0.0.1'
 # Project home page.
 __url__ = 'http://github.com/hellerver/IMProved'
 # Long description; the text keeps the newlines and indentation of
 # the literal below.
 __longdescr__ = """
                    An embeddable IMP interpreter for and in
                    Python.
                """
 # Classifier strings and search keywords describing the package.
 __classifiers__ = ['Topic :: Software Development :: Interpreters']
 __keywords__ = ['IMP', 'interpreter', 'embeddable']
--- a/improved/interpreter.py
+++ b/improved/interpreter.py
@@ -0,0 +1,2 @@
 def callIMP(chars):
    """
        Placeholder entry point for running IMP source code.
        chars   -- the program text (currently ignored)
        returns -- None, nothing is implemented yet
    """
    return None
--- a/improved/lex.py
+++ b/improved/lex.py
@@ -0,0 +1,33 @@
 """The lexer"""
 import sys
 import re
 def lex(characters, token_exprs):
    """
        A somewhat generic lexer.
        The expressions are tried in order at the current position and
        the first one that consumes at least one character wins.
        characters  -- the string to be lexed
        token_exprs -- the tokens that constitute our grammar, given as
                       (pattern, tag) pairs; matches of a pair whose tag
                       is falsy (e.g. None) are consumed but not emitted
        returns     -- a list of tokens of the form (contents, tag)
        raises      -- ValueError carrying the offending character if no
                       expression matches at some position
    """
    # Compile every pattern once instead of once per input position.
    compiled = [(re.compile(pattern), tag) for pattern, tag in token_exprs]
    pos = 0
    tokens = []
    while pos < len(characters):
        match = None
        for regex, tag in compiled:
            candidate = regex.match(characters, pos)
            # A zero-width match would never advance pos and the loop
            # would spin forever, so it does not count as a match.
            if candidate and candidate.end(0) > pos:
                match = candidate
                break
        if not match:
            sys.stderr.write('[Parser] Illegal character: %s\n' % characters[pos])
            raise ValueError(characters[pos])
        # tag still belongs to the expression that produced the match
        if tag:
            tokens.append((match.group(0), tag))
        pos = match.end(0)
    return tokens
--- a/improved/parsecomb.py
+++ b/improved/parsecomb.py
@@ -0,0 +1,370 @@
 """A minimal parser combinator library."""
 class Result(object):
    """A value/position pair. Every parser hands one back on success."""
    def __init__(self, value, pos):
        """
            Stores the outcome of a parse step.
            value -- the value produced by the parser
            pos   -- the position that belongs to this result
        """
        self.value, self.pos = value, pos
    def __repr__(self):
        """
            Debugging helper.
            returns -- a string of the form 'Result(value, position)'
        """
        fields = (self.value, self.pos)
        return 'Result(%s, %d)' % fields
 class Parser(object):
    """
        Base class of all parsers and combinators.
        It supplies the operator shortcuts (+, *, |, ^) that build
        combinators; subclasses provide the actual parsing in __call__.
    """
    def __call__(self, tokens, pos):
        """
            Default parse step, meant to be overridden by subclasses.
            tokens -- the token list to parse
            pos    -- the index of the token to look at
            returns -- None
        """
        return
    def __add__(self, other):
        """
            parser + other: sequence two parsers.
            returns -- Concat(self, other)
        """
        sequence = Concat(self, other)
        return sequence
    def __mul__(self, other):
        """
            parser * other: wrap this parser and a separator parser.
            returns -- Exp(self, other)
        """
        compound = Exp(self, other)
        return compound
    def __or__(self, other):
        """
            parser | other: choose between two parsers.
            returns -- Alternate(self, other)
        """
        choice = Alternate(self, other)
        return choice
    def __xor__(self, function):
        """
            parser ^ function: post-process the parser's result value.
            returns -- Process(self, function)
        """
        processed = Process(self, function)
        return processed
 class Reserved(Parser):
    """The parser for reserved words"""
    def __init__(self, value, tag):
        """
            The initialization method.
            value -- the reserved word
            tag   -- the RESERVED tag
        """
        self.value = value
        self.tag = tag
    def __call__(self, tokens, pos):
        """
            Call parser. Returns a result if the token at the position
            matches the reserved word. Otherwise returns None.
            tokens -- the token list; each token is indexable as
                      (contents, tag)
            pos    -- the position to check
            returns -- a Result object | None
        """
        if pos >= len(tokens):
            return None
        token = tokens[pos]
        # Compare field by field: comparing the whole token against a
        # list literal is always False for tuple tokens.
        if token[0] == self.value and token[1] == self.tag:
            return Result(token[0], pos + 1)
        return None
 class Tag(Parser):
    """
        The parser for tags. Matches anything if the tag matches,
        regardless of its value.
    """
    def __init__(self, tag):
        """
            The initialization method.
            tag -- the tag that should match
        """
        self.tag = tag
    def __call__(self, tokens, pos):
        """
            Call parser. Returns a result if the tag of the
            token at the specified position matches, regardless
            of its value. Otherwise returns None.
            tokens -- the token list; each token is indexable as
                      (contents, tag)
            pos    -- the position to check
            returns -- a Result object | None
        """
        # Tags are compared by value; identity (is) only works by
        # accident when both strings happen to be the same object.
        if pos < len(tokens) and tokens[pos][1] == self.tag:
            return Result(tokens[pos][0], pos + 1)
        return None
 class Concat(Parser):
    """The sequencing combinator. Runs one parser, then another."""
    def __init__(self, left, right):
        """
            Stores both parsers.
            left  -- the parser that runs first
            right -- the parser that runs where left stopped
        """
        self.left, self.right = left, right
    def __call__(self, tokens, pos):
        """
            Applies left at pos and right at the position left returned.
            tokens -- the token list
            pos    -- the position to start at
            returns -- a Result whose value is the tuple
                       (left value, right value) | None if either fails
        """
        first = self.left(tokens, pos)
        if not first:
            return None
        second = self.right(tokens, first.pos)
        if not second:
            return None
        return Result((first.value, second.value), second.pos)
 class Alternate(Parser):
    """The alternate combinator. Parses using either of two parsers."""
    def __init__(self, left, right):
        """
            The initialization method.
            left  -- the first parser
            right -- the second parser
        """
        self.left = left
        self.right = right
    def __call__(self, tokens, pos):
        """
            Calls the parser. Returns a result if either
            parser matches, otherwise returns None.
            The right parser is only tried if the left one fails.
            tokens -- the token list
            pos    -- the position to check
            returns -- the Result object of either parser | None
        """
        # Both parsers start at the same position; the right parser's
        # outcome (possibly None) is the final answer if left fails.
        return self.left(tokens, pos) or self.right(tokens, pos)
 class Opt(Parser):
    """The optional combinator. Never fails."""
    def __init__(self, parser):
        """
            Stores the wrapped parser.
            parser -- the parser whose match is optional
        """
        self.parser = parser
    def __call__(self, tokens, pos):
        """
            Tries the wrapped parser. If it fails, an empty
            Result (value None, position unchanged) is returned.
            tokens -- the token list
            pos    -- the position to check
            returns -- a Result object
        """
        outcome = self.parser(tokens, pos)
        return outcome if outcome else Result(None, pos)
 class Rep(Parser):
    """The repetition combinator. Applies a parser until it fails."""
    def __init__(self, parser):
        """
            The initialization method.
            parser -- the parser to wrap
        """
        self.parser = parser
    def __call__(self, tokens, pos):
        """
            Calls the parser repeatedly. Returns a Result
            that contains a list of values (one
            for each successful application). The list is
            empty if the parser never matches.
            tokens -- the token list
            pos    -- the position to check
            returns -- a Result object
        """
        results = []
        while True:
            result = self.parser(tokens, pos)
            # Stop on failure, and also on a success that consumed
            # nothing (e.g. a wrapped Opt): repeating it could never
            # make progress and would loop forever.
            if not result or result.pos == pos:
                break
            results.append(result.value)
            pos = result.pos
        return Result(results, pos)
 class Process(Parser):
    """
        The process combinator. Post-processes the value of
        a successful parse with a function.
    """
    def __init__(self, parser, function):
        """
            Stores the parser and the function.
            parser   -- the parser to wrap
            function -- called with the result value; its return
                        value replaces the result value
        """
        self.parser, self.function = parser, function
    def __call__(self, tokens, pos):
        """
            Runs the wrapped parser and, on success, rewrites the
            value of the returned Result in place.
            tokens -- the token list
            pos    -- the position to check
            returns -- a Result object | None
        """
        outcome = self.parser(tokens, pos)
        if not outcome:
            return outcome
        outcome.value = self.function(outcome.value)
        return outcome
 class Lazy(Parser):
    """
        The lazy combinator. Builds a parser only if needed.
        This makes recursive parsers possible.
    """
    def __init__(self, parser_func):
        """
            The initialization method.
            parser_func -- a function building a parser
        """
        self.parser = None
        self.parser_func = parser_func
    def __call__(self, tokens, pos):
        """
            Builds the parser on first use and returns its result.
            The built parser is kept for later calls.
            tokens -- the token list
            pos    -- the position to check
            returns -- a Result object | None
        """
        # Test for "not built yet" explicitly; a truthiness test would
        # rebuild a parser object that happens to evaluate as false.
        if self.parser is None:
            self.parser = self.parser_func()
        return self.parser(tokens, pos)
 class Phrase(Parser):
    """
        The phrase combinator. Succeeds only if the wrapped
        parser consumes every remaining token.
    """
    def __init__(self, parser):
        """
            Stores the wrapped parser.
            parser -- the parser that has to consume all input
        """
        self.parser = parser
    def __call__(self, tokens, pos):
        """
            Runs the wrapped parser and checks that its result
            position equals the length of the token list.
            tokens -- the token list
            pos    -- the position to check
            returns -- a Result object | None
        """
        outcome = self.parser(tokens, pos)
        consumed_all = bool(outcome) and outcome.pos == len(tokens)
        return outcome if consumed_all else None
 class Exp(Parser):
    """
        The compound statement parser. Parses a list of items
        divided by separators without using left recursion.
    """
    def __init__(self, parser, separator):
        """
            Stores both parsers.
            parser    -- the parser for a single item
            separator -- the parser for the separators; its result
                         value is called with (left, right) to
                         combine two neighbouring values
        """
        self.parser, self.separator = parser, separator
    def __call__(self, tokens, pos):
        """
            Parses one item, then keeps parsing separator/item pairs
            and folds each pair into the value accumulated so far.
            tokens -- the token list
            pos    -- the position to check
            returns -- a Result object | None
        """
        accumulated = self.parser(tokens, pos)
        def fold(pair):
            # pair is the (separator value, item value) tuple of Concat
            combine, rhs = pair
            return combine(accumulated.value, rhs)
        step = self.separator + self.parser ^ fold
        if not accumulated:
            return accumulated
        while True:
            following = step(tokens, accumulated.pos)
            if not following:
                return accumulated
            accumulated = following
--- a/improved/tokenize.py
+++ b/improved/tokenize.py
@@ -0,0 +1,47 @@
 """The tokenizer"""
 from .lex import lex
 # Tags attached to the tokens produced from TOKENS.
 RESERVED = 'RESERVED'
 INT      = 'INT'
 ID       = 'ID'
 # (pattern, tag) pairs. They are tried from top to bottom and the first
 # pattern that matches wins, so longer operators come before their
 # prefixes ('**' before '*', '<=' before '<'). Pairs tagged None
 # (whitespace, comments) describe input that produces no token.
 TOKENS = [
    (r'[ \n\t]+',        None),
    (r'#[^\n]*',         None),
    (r':=',              RESERVED),
    (r'\(',              RESERVED),
    (r'\)',              RESERVED),
    (r';',               RESERVED),
    (r'\+',              RESERVED),
    (r'-',               RESERVED),
    (r'\*\*',            RESERVED),
    (r'\*',              RESERVED),
    (r'/',               RESERVED),
    (r'<=',              RESERVED),
    (r'<',               RESERVED),
    (r'>=',              RESERVED),
    (r'>',               RESERVED),
    (r'=',               RESERVED),
    (r'!=',              RESERVED),
    # \b keeps keywords from matching the start of an identifier
    # such as 'ifx' or 'order'.
    (r'and\b',           RESERVED),
    (r'or\b',            RESERVED),
    (r'not\b',           RESERVED),
    (r'if\b',            RESERVED),
    (r'then\b',          RESERVED),
    (r'else\b',          RESERVED),
    (r'while\b',         RESERVED),
    (r'do\b',            RESERVED),
    (r'end\b',           RESERVED),
    (r'[0-9]+',               INT),
    (r'[A-Za-z][A-Za-z0-9_]*', ID),
 ]
 def tokenize(characters):
    """
        Runs the generic lexer with the IMP token table.
        characters -- the program text to split into tokens
        returns -- the list of (contents, tag) tuples built by lex
    """
    token_list = lex(characters, TOKENS)
    return token_list
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,20 @@
 #!/usr/bin/env python
 """An installation script for IMProved."""
 from distutils.core import setup
 import improved
 # The metadata is read from the package itself. The module is imported
 # as 'improved', so that is the name used here and for the package
 # directory.
 setup(
    name='IMProved',
    description='An embeddable IMP interpreter',
    author=improved.__author__,
    author_email=improved.__author_mail__,
    version=improved.__version__,
    url=improved.__url__,
    long_description=improved.__longdescr__,
    classifiers=improved.__classifiers__,
    keywords=improved.__keywords__,
    packages=['improved'],
    license="GPLv2",
    platforms=['Linux', 'OS X', 'Windows']
 )