This commit is contained in:
2018-05-05 00:50:40 +02:00
commit a5ce351f91
3 changed files with 267 additions and 0 deletions

7
README.md Normal file
View File

@@ -0,0 +1,7 @@
# c
An incredibly simple compiler from a Kaleidoscope-like language to C.
Inspired by Gary Bernhardt's [screencast on building a compiler](https://www.destroyallsoftware.com/screencasts/catalog/a-compiler-from-scratch),
but compiles to C and (kind of) supports strings and the inclusion of other
source files. Refer to the [example file](/examples/thing.src) to see how it works.

247
c.py Executable file
View File

@@ -0,0 +1,247 @@
#!/usr/bin/env python
import re
import sys
class Token:
    """A lexical token: a token-type name paired with the matched source text."""

    def __init__(self, name, value):
        self.name, self.value = name, value

    def __str__(self):
        template = "Token(type='{}', value='{}')"
        return template.format(self.name, self.value)

    def __unicode__(self):
        # Same rendering as __str__.
        return str(self)
class DefNode:
    """AST node for a function definition: name, argument names and body."""

    def __init__(self, name, args, body):
        self.name, self.args, self.body = name, args, body

    def __str__(self):
        fields = (self.name, self.args, self.body)
        return "Def(name='{}', args='{}', body='{}')".format(*fields)

    def __unicode__(self):
        # Same rendering as __str__.
        return str(self)
class CallNode:
    """AST node for a function call: callee name and argument expressions."""

    def __init__(self, name, exprs):
        self.name, self.exprs = name, exprs

    def __str__(self):
        # Each argument is rendered through str() and comma-separated.
        rendered_args = ", ".join(map(str, self.exprs))
        return "Call(name='{}', exprs=[{}])".format(self.name, rendered_args)

    def __unicode__(self):
        # Same rendering as __str__.
        return str(self)
class IntegerNode:
    """AST node holding an integer literal's value."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        template = "Integer(value={})"
        return template.format(self.value)

    def __unicode__(self):
        # Same rendering as __str__.
        return str(self)
class StringNode:
    """AST node holding a string literal's text as stored by the caller."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        template = "String(value={})"
        return template.format(self.value)

    def __unicode__(self):
        # Same rendering as __str__.
        return str(self)
class VarNode:
    """AST node for a reference to a variable by name."""

    def __init__(self, name):
        self.name = name

    def __str__(self):
        template = "Var(name={})"
        return template.format(self.name)

    def __unicode__(self):
        # Same rendering as __str__.
        return str(self)
class ImportNode:
    """AST node for an ``import`` statement naming a single identifier."""

    def __init__(self, name):
        self.name = name

    def __str__(self):
        template = "Import(name={})"
        return template.format(self.name)

    def __unicode__(self):
        # Same rendering as __str__.
        return str(self)
class Tokenizer:
    """Splits source text into a list of ``Token`` objects.

    ``token_types`` maps each token-type name to its regular expression.
    ``token_order`` fixes the order in which those expressions are tried, so
    that the keywords ``def``/``end``/``import`` always win over the generic
    ``identifier`` pattern regardless of dict iteration order.
    """

    token_types = {
        "def": r"\bdef\b",
        "end": r"\bend\b",
        "import": r"\bimport\b",
        "identifier": r"\b[a-zA-Z]+\b",
        # Backslash escapes (e.g. \" or \n) are kept inside the literal.
        "string": r'(?s)"(?:\\.|[^"\\])*"',
        "integer": r"\b[0-9]+\b",
        "open_paren": r"\(",
        "comma": r",",
        "close_paren": r"\)",
    }

    # Matching priority: keywords must be tried before "identifier".
    token_order = (
        "def",
        "end",
        "import",
        "identifier",
        "string",
        "integer",
        "open_paren",
        "comma",
        "close_paren",
    )

    def __init__(self, code):
        """Store ``code`` and compile every token pattern once."""
        self.code = code
        # Instance attribute shadows the class-level dict with compiled patterns.
        self.token_types = {
            token_name: re.compile(token_re)
            for token_name, token_re in Tokenizer.token_types.items()
        }

    def tokenize(self):
        """Consume ``self.code`` and return the list of tokens found.

        Whitespace between tokens (and around the whole input) is skipped.

        Raises:
            RuntimeError: if the remaining input matches no token pattern.
        """
        res = []
        # Strip up front too, so leading whitespace does not block the first match.
        self.code = self.code.strip()
        while self.code:
            token = self.tokenize_once()
            if token is None:
                raise RuntimeError(
                    "Could not tokenize input starting at: {!r}".format(self.code[:20])
                )
            res.append(token)
            self.code = self.code.strip()
        return res

    def tokenize_once(self):
        """Match one token at the start of ``self.code`` and remove it.

        Returns the ``Token``, or ``None`` when no pattern matches.
        """
        for token_name in Tokenizer.token_order:
            match = self.token_types[token_name].match(self.code)
            if match:
                value = match.group()
                self.code = self.code[len(value):]
                return Token(token_name, value)
        return None
class Parser:
    """Recursive-descent parser turning a token sequence into a list of nodes.

    Grammar, as implemented by the methods below::

        program := (import | def)*
        import  := "import" identifier
        def     := "def" identifier "(" [identifier ("," identifier)*] ")" expr "end"
        expr    := integer | string | call | identifier
        call    := identifier "(" [expr ("," expr)*] ")"

    Tokens only need ``name`` and ``value`` attributes.
    """

    def __init__(self, tokens):
        # Work on a private copy: parsing pops tokens off the front, and the
        # caller's sequence should not be emptied as a side effect.
        self.tokens = list(tokens)

    def parse(self):
        """Parse all remaining tokens and return the list of top-level nodes.

        Raises:
            RuntimeError: on any unexpected token or premature end of input.
        """
        res = []
        while self.tokens:
            if self.peek("import"):
                res.append(self.parse_import())
            else:
                res.append(self.parse_def())
        return res

    def parse_def(self):
        """Parse ``def name(args) expr end`` into a ``DefNode``."""
        self.consume("def")
        name = self.consume("identifier").value
        args = self.parse_arg_names()
        body = self.parse_expr()
        self.consume("end")
        return DefNode(name, args, body)

    def parse_expr(self):
        """Parse one expression: integer, string, call or variable reference."""
        if self.peek("integer"):
            return self.parse_integer()
        if self.peek("string"):
            return self.parse_string()
        # A call is recognised by an opening parenthesis in second position.
        if self.peek("open_paren", 1):
            return self.parse_call()
        return self.parse_variable()

    def parse_import(self):
        """Parse ``import name`` into an ``ImportNode``."""
        self.consume("import")
        return ImportNode(self.consume("identifier").value)

    def parse_variable(self):
        """Parse a single identifier into a ``VarNode``."""
        return VarNode(self.consume("identifier").value)

    def parse_call(self):
        """Parse ``name(expr, ...)`` into a ``CallNode``."""
        name = self.consume("identifier").value
        exprs = self.parse_arg_exprs()
        return CallNode(name, exprs)

    def parse_arg_exprs(self):
        """Parse a parenthesised, comma-separated list of expressions."""
        res = []
        self.consume("open_paren")
        if not self.peek("close_paren"):
            res.append(self.parse_expr())
            while self.peek("comma"):
                self.consume("comma")
                res.append(self.parse_expr())
        self.consume("close_paren")
        return res

    def parse_integer(self):
        """Parse an integer token into an ``IntegerNode`` holding an ``int``."""
        return IntegerNode(int(self.consume("integer").value))

    def parse_string(self):
        """Parse a string token into a ``StringNode`` (token text kept verbatim)."""
        return StringNode(self.consume("string").value)

    def parse_arg_names(self):
        """Parse a parenthesised, comma-separated list of identifiers.

        Returns the identifier texts as a list of strings.
        """
        res = []
        self.consume("open_paren")
        if self.peek("identifier"):
            res.append(self.consume("identifier").value)
            while self.peek("comma"):
                self.consume("comma")
                res.append(self.consume("identifier").value)
        self.consume("close_paren")
        return res

    def consume(self, expected):
        """Remove and return the next token, which must be of type ``expected``.

        Raises:
            RuntimeError: if the input is exhausted or the next token has a
                different type (the offending token is still removed).
        """
        if not self.tokens:
            # Report truncated input like any other syntax error, not IndexError.
            raise RuntimeError(
                "Expected token type {} but reached end of input".format(expected)
            )
        token = self.tokens.pop(0)
        if token.name == expected:
            return token
        raise RuntimeError("Expected token type {} but got {}".format(expected, token.name))

    def peek(self, expected, idx=0):
        """Return whether the token ``idx`` positions ahead has type ``expected``."""
        return len(self.tokens) > idx and self.tokens[idx].name == expected
class CodeGen:
    """Emits C source text for the node classes defined in this file."""

    def generate_all(self, nodes):
        """Return the C text for ``nodes``.

        All prototypes are emitted before any definition. Every piece is
        preceded by a newline, so non-empty output starts with one.
        """
        output = ""
        for emit in (self.generate_proto, self.generate):
            for node in nodes:
                output = "{}\n{}".format(output, emit(node))
        return output

    def generate_proto(self, node):
        """Return a C prototype for a ``DefNode``; an empty string otherwise."""
        if node.__class__ != DefNode:
            return ""
        # Every parameter is declared as int, as is the return type.
        params = ", ".join("int {}".format(arg) for arg in node.args)
        return "int {}({});".format(node.name, params)

    def generate(self, node):
        """Return the C text for a single node.

        Raises:
            RuntimeError: if the node's class is not one of the known kinds.
        """
        kind = node.__class__
        if kind == IntegerNode:
            return str(node.value)
        if kind == StringNode:
            # The stored text is emitted unchanged.
            return node.value
        if kind == VarNode:
            return node.name
        if kind == ImportNode:
            return "#include <{}.h>".format(node.name)
        if kind == CallNode:
            rendered_args = ", ".join(self.generate(expr) for expr in node.exprs)
            return "{}({})".format(node.name, rendered_args)
        if kind == DefNode:
            params = ", ".join("int {}".format(arg) for arg in node.args)
            body_text = self.generate(node.body)
            # The body is a single expression, returned from the C function.
            return "int {}({}) {{ return {}; }}".format(node.name, params, body_text)
        raise RuntimeError("Unexpected node type: {}".format(kind.__name__))
def main(argv=None):
    """Compile the source file named on the command line and print the C output.

    Args:
        argv: argument list, program name first; defaults to ``sys.argv``.

    Returns:
        0 on success, 1 when no source file was given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        # Print a usage line instead of an IndexError traceback.
        program = argv[0] if argv else "c.py"
        sys.stderr.write("usage: {} <source-file>\n".format(program))
        return 1
    with open(argv[1]) as f:
        contents = f.read()
    tokens = Tokenizer(contents).tokenize()
    ast = Parser(tokens).parse()
    generated = CodeGen().generate_all(ast)
    print(generated)
    return 0


# Guarded so that importing this module does not trigger a compile.
if __name__ == "__main__":
    sys.exit(main())

13
examples/thing.src Normal file
View File

@@ -0,0 +1,13 @@
import stdio
def f(x, y, z)
g(x, 2)
end
def g(x, y)
x
end
def main()
printf("hi %d!\n", f(1, 2, 3))
end