initial; lexer and parser

hellerve
2015-07-07 13:51:15 +02:00
parent 0bc4bc1616
commit f9c1f2a45d
7 changed files with 490 additions and 0 deletions

README.md Normal file

@@ -0,0 +1,6 @@
### IMProved
IMProved aims to build an interpreter for IMP (based on
[this](http://jayconrod.com/posts/37/a-simple-interpreter-from-scratch-in-python-part-1))
that is fully documented, occasionally improved, and
embeddable into Python.
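
As a usage sketch, assuming `callIMP` (exported from `improved/__init__.py`, still a stub in this commit) will eventually run a program string, embedding could look like this:

    # Hypothetical once callIMP is implemented; today it is a stub returning None.
    from improved import callIMP

    callIMP("""
    n := 5;
    fact := 1;
    while n > 0 do
        fact := fact * n;
        n := n - 1
    end
    """)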

improved/__init__.py Normal file

@@ -0,0 +1,12 @@
from .interpreter import callIMP
__author__ = 'Veit Heller'
__author_mail__ = 'veit@veitheller.de'
__version__ = '0.0.1'
__url__ = 'http://github.com/hellerver/IMProved'
__longdescr__ = """
An embeddable IMP interpreter for and in
Python.
"""
__classifiers__ = ['Topic :: Software Development :: Interpreters']
__keywords__ = ['IMP', 'interpreter', 'embeddable']

improved/interpreter.py Normal file

@@ -0,0 +1,2 @@
def callIMP(chars):
    """Interpret the given IMP source string. Not implemented yet."""
    pass

improved/lex.py Normal file

@@ -0,0 +1,33 @@
"""The lexer"""
import sys
import re
def lex(characters, token_exprs):
"""
A somewhat generic lexer.
characters -- the string to be lexed
token_exprs -- the tokens that consitute our grammar
returns -- a list of tokens of the form (contents, tag)
"""
pos = 0
tokens = []
while pos < len(characters):
match = None
for token_expr in token_exprs:
pattern, tag = token_expr
regex = re.compile(pattern)
match = regex.match(characters, pos)
if match:
text = match.group(0)
if tag:
token = (text, tag)
tokens.append(token)
break
if not match:
sys.stderr.write('[Parser] Illegal character: %s\\n' % characters[pos])
raise ValueError(characters[pos])
else:
pos = match.end(0)
return tokens
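
To make the contract of `lex` concrete, here is a small sketch driving it with a toy token table (the table below is illustrative only; the real one lives in `improved/tokenize.py` further down):

    from improved.lex import lex

    # Patterns are tried in order; a None tag means "match but discard".
    SAMPLE_TOKENS = [
        (r'[ \n\t]+', None),
        (r':=', 'RESERVED'),
        (r'[0-9]+', 'INT'),
        (r'[A-Za-z][A-Za-z0-9_]*', 'ID'),
    ]

    print(lex('x := 42', SAMPLE_TOKENS))
    # [('x', 'ID'), (':=', 'RESERVED'), ('42', 'INT')]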

improved/parsecomb.py Normal file

@@ -0,0 +1,370 @@
"""A minimal parser combinator library."""
class Result(object):
"""The result class. Returned by every parser."""
def __init__(self, value, pos):
"""
The result initializer.
value -- the value of the result node
pos -- the position of the result node
"""
self.value = value
self.pos = pos
def __repr__(self):
"""
A representation to make debugging easier.
returns -- A string of the form 'Result(value, position)'
"""
return 'Result(%s, %d)' % (self.value, self.pos)
class Parser(object):
"""
The parser superclass.
All parsers have to inherit from it.
"""
def __call__(self, tokens, pos):
"""
Makes the parser callable.
All subclasses need to override this method.
tokens -- the tokens with which the Parser is called
pos -- the token position
returns -- None
"""
return None
def __add__(self, other):
"""
Concatenates a parser to another parser.
returns -- a Concat object of both parsers
"""
return Concat(self, other)
def __mul__(self, other):
"""
"Multiplies" the parser to another parser.
returns -- an Exp object of both parsers
"""
return Exp(self, other)
def __or__(self, other):
"""
Alternates between this parser and another.
returns -- an Alternate object of both parsers
"""
return Alternate(self, other)
def __xor__(self, function):
"""
Applies a function to the parser's results.
returns -- a Process object of the parser and the function
"""
return Process(self, function)
class Reserved(Parser):
    """The parser for reserved words"""

    def __init__(self, value, tag):
        """
        The initialization method.
        value -- the reserved word
        tag -- the RESERVED tag
        """
        self.value = value
        self.tag = tag

    def __call__(self, tokens, pos):
        """
        Call parser. Returns a result if the token at the position
        matches the reserved word. Otherwise returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        # lex builds tokens as (text, tag) tuples, so compare against a tuple
        if pos < len(tokens) and tokens[pos] == (self.value, self.tag):
            return Result(tokens[pos][0], pos + 1)
        else:
            return None

class Tag(Parser):
    """
    The parser for tags. Matches anything if the tag matches,
    regardless of its value.
    """

    def __init__(self, tag):
        """
        The initialization method.
        tag -- the tag that should match
        """
        self.tag = tag

    def __call__(self, tokens, pos):
        """
        Call parser. Returns a result if the tag of the
        token at the specified position matches, regardless
        of its value. Otherwise returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        if pos < len(tokens) and tokens[pos][1] is self.tag:
            return Result(tokens[pos][0], pos + 1)
        else:
            return None

class Concat(Parser):
    """The concat combinator. Applies two parsers in sequence."""

    def __init__(self, left, right):
        """
        The initialization method.
        left -- the first parser
        right -- the second parser
        """
        self.left = left
        self.right = right

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns a result if both
        parsers match, otherwise returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result whose value is a tuple of both parsers' values | None
        """
        left_result = self.left(tokens, pos)
        if left_result:
            right_result = self.right(tokens, left_result.pos)
            if right_result:
                combined_value = (left_result.value, right_result.value)
                return Result(combined_value, right_result.pos)
        return None

class Alternate(Parser):
    """The alternate combinator. Parses using either of two parsers."""

    def __init__(self, left, right):
        """
        The initialization method.
        left -- the first parser
        right -- the second parser
        """
        self.left = left
        self.right = right

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns a result if either
        parser matches, otherwise returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- the Result object of either parser | None
        """
        left_result = self.left(tokens, pos)
        if left_result:
            return left_result
        else:
            right_result = self.right(tokens, pos)
            return right_result

class Opt(Parser):
    """The optional combinator. Always returns a result."""

    def __init__(self, parser):
        """
        The initialization method.
        parser -- the parser to wrap
        """
        self.parser = parser

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns either the
        parser's result or an empty result.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object
        """
        result = self.parser(tokens, pos)
        if result:
            return result
        else:
            return Result(None, pos)


class Rep(Parser):
    """The repetition combinator. Applies a parser until it fails."""

    def __init__(self, parser):
        """
        The initialization method.
        parser -- the parser to wrap
        """
        self.parser = parser

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns a Result whose value
        is a list of the parsed values (one for each
        successful application).
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object
        """
        results = []
        result = self.parser(tokens, pos)
        while result:
            results.append(result.value)
            pos = result.pos
            result = self.parser(tokens, pos)
        return Result(results, pos)

class Process(Parser):
    """
    The process combinator. Applies a function that
    manipulates the parser's result.
    """

    def __init__(self, parser, function):
        """
        The initialization method.
        parser -- the parser to wrap
        function -- the manipulation function
        """
        self.parser = parser
        self.function = function

    def __call__(self, tokens, pos):
        """
        Calls the parser. Returns the Result
        object whose value was manipulated by the
        given function.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        result = self.parser(tokens, pos)
        if result:
            result.value = self.function(result.value)
        return result


class Lazy(Parser):
    """
    The lazy combinator. Builds a parser only if needed.
    This makes recursive parsers possible.
    """

    def __init__(self, parser_func):
        """
        The initialization method.
        parser_func -- a function building a parser
        """
        self.parser = None
        self.parser_func = parser_func

    def __call__(self, tokens, pos):
        """
        Builds the parser (if necessary) and returns its result.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        if not self.parser:
            self.parser = self.parser_func()
        return self.parser(tokens, pos)

class Phrase(Parser):
    """
    The phrase combinator. Applies a parser and
    only succeeds if it consumed all remaining input.
    """

    def __init__(self, parser):
        """
        The initialization method.
        parser -- the parser to wrap
        """
        self.parser = parser

    def __call__(self, tokens, pos):
        """
        Calls the parser. If there is a result
        and nothing is left to parse, it returns
        the result. Otherwise it returns None.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        result = self.parser(tokens, pos)
        if result and result.pos == len(tokens):
            return result
        else:
            return None


class Exp(Parser):
    """
    The compound statement parser.
    Workaround for problems with left recursion.
    """

    def __init__(self, parser, separator):
        """
        The initialization method.
        parser -- the parser to wrap
        separator -- the parser that parses the compound separators
        """
        self.parser = parser
        self.separator = separator

    def __call__(self, tokens, pos):
        """
        Calls the parser. Like Rep, it applies
        a parser multiple times, but it also
        keeps track of the separators.
        tokens -- the token list
        pos -- the position to check
        returns -- a Result object | None
        """
        result = self.parser(tokens, pos)

        def process_next(parsed):
            (sepfunc, right) = parsed
            return sepfunc(result.value, right)

        next_parser = self.separator + self.parser ^ process_next
        next_result = result
        while next_result:
            next_result = next_parser(tokens, result.pos)
            if next_result:
                result = next_result
        return result
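
As a hedged sketch of how these combinators compose (assuming the tokenizer defined below; `num` and `addition` are hypothetical names for this example, not part of the library):

    from improved.parsecomb import Tag, Reserved, Phrase
    from improved.tokenize import tokenize, RESERVED, INT

    # Tag(INT) accepts any integer token; ^ converts its text into an int.
    num = Tag(INT) ^ int
    # Concatenation nests values as ((left, '+'), right); ^ folds them into a sum.
    addition = num + Reserved('+', RESERVED) + num ^ (lambda parsed: parsed[0][0] + parsed[1])

    print(Phrase(addition)(tokenize('1 + 2'), 0))
    # Result(3, 3) -- the value 3, with all three tokens consumed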

improved/tokenize.py Normal file

@@ -0,0 +1,47 @@
"""The tokenizer"""
from .lex import lex
RESERVED = 'RESERVED'
INT = 'INT'
ID = 'ID'
TOKENS = [
(r'[ \\n\\t]+', None),
(r'#[^\\n]*', None),
(r'\:=', RESERVED),
(r'\(', RESERVED),
(r'\)', RESERVED),
(r';', RESERVED),
(r'\+', RESERVED),
(r'-', RESERVED),
(r'\*\*', RESERVED),
(r'\*', RESERVED),
(r'/', RESERVED),
(r'<=', RESERVED),
(r'<', RESERVED),
(r'>=', RESERVED),
(r'>', RESERVED),
(r'=', RESERVED),
(r'!=', RESERVED),
(r'and', RESERVED),
(r'or', RESERVED),
(r'not', RESERVED),
(r'if', RESERVED),
(r'then', RESERVED),
(r'else', RESERVED),
(r'while', RESERVED),
(r'do', RESERVED),
(r'end', RESERVED),
(r'[0-9]+', INT),
(r'[A-Za-z][A-Za-z0-9_]*', ID),
]
def tokenize(characters):
"""
Tokenizes the input.
characters -- the string to be tokenized
returns -- a list of tuples of the form (contents, tag)
"""
return lex(characters, TOKENS)
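
For reference, a quick sketch of the token stream this table produces (whitespace and comments are dropped because their tag is None):

    from improved.tokenize import tokenize

    print(tokenize('n := 2 ** 3  # comments are dropped'))
    # [('n', 'ID'), (':=', 'RESERVED'), ('2', 'INT'), ('**', 'RESERVED'), ('3', 'INT')]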

setup.py Normal file

@@ -0,0 +1,20 @@
#!/usr/bin/env python
"""An installation script for IMProved."""
from distutils.core import setup

import improved

setup(
    name='IMProved',
    description='An embeddable IMP interpreter',
    author=improved.__author__,
    author_email=improved.__author_mail__,
    version=improved.__version__,
    url=improved.__url__,
    long_description=improved.__longdescr__,
    classifiers=improved.__classifiers__,
    keywords=improved.__keywords__,
    packages=['improved'],
    license="GPLv2",
    platforms=['Linux', 'OS X', 'Windows']
)