#!/usr/bin/env python
r"""c -- an incredibly simple compiler from a Kaleidoscope-like language to C.

Inspired by Gary Bernhardt's screencast on building a compiler
(https://www.destroyallsoftware.com/screencasts/catalog/a-compiler-from-scratch),
but compiles to C and (kind of) supports strings, and including other source
files. Refer to the example file (/examples/thing.src) to see how it works.

Usage::

    c.py SOURCE_FILE

The generated C code is printed to standard output.

Language summary (as implemented below):

* ``import NAME`` is emitted as ``#include <NAME.h>``.
* ``def NAME(ARG, ...) EXPR end`` is emitted as an ``int`` function that
  returns ``EXPR``; every argument is declared as ``int``.
* An expression is an integer literal, a double-quoted string literal, a
  variable name, or a call ``NAME(EXPR, ...)``.

Example input (examples/thing.src)::

    import stdio

    def f(x, y, z)
      g(x, 2)
    end

    def g(x, y)
      x
    end

    def main()
      printf("hi %d!\n", f(1, 2, 3))
    end
"""

# Provenance: commit a5ce351f91f02139ec7d3c90ef520a47572178c8 ("initial"),
# author hellerve, Sat May 5 00:50:40 2018 +0200. The commit added README.md,
# c.py (mode 100755) and examples/thing.src.
#
# The code is kept compatible with both Python 2.7 and Python 3, hence
# str.format instead of f-strings and the retained __unicode__ methods.

import collections
import re
import sys


class Token:
    """A lexical token: the token type name and the source text it matched."""

    def __init__(self, name, value):
        self.name = name
        self.value = value

    def __str__(self):
        return "Token(type='{}', value='{}')".format(self.name, self.value)

    def __unicode__(self):
        return self.__str__()


class DefNode:
    """Function definition: name, list of argument names, body expression."""

    def __init__(self, name, args, body):
        self.name = name
        self.args = args
        self.body = body

    def __str__(self):
        return "Def(name='{}', args='{}', body='{}')".format(self.name, self.args, self.body)

    def __unicode__(self):
        return self.__str__()


class CallNode:
    """Function call: callee name and the list of argument expression nodes."""

    def __init__(self, name, exprs):
        self.name = name
        self.exprs = exprs

    def __str__(self):
        return "Call(name='{}', exprs=[{}])".format(self.name, ", ".join(str(e) for e in self.exprs))

    def __unicode__(self):
        return self.__str__()


class IntegerNode:
    """Integer literal; ``value`` is a Python int."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return "Integer(value={})".format(self.value)

    def __unicode__(self):
        return self.__str__()


class StringNode:
    """String literal; ``value`` is the source text including its quotes."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return "String(value={})".format(self.value)

    def __unicode__(self):
        return self.__str__()


class VarNode:
    """Reference to a variable by name."""

    def __init__(self, name):
        self.name = name

    def __str__(self):
        return "Var(name={})".format(self.name)

    def __unicode__(self):
        return self.__str__()


class ImportNode:
    """``import NAME`` statement."""

    def __init__(self, name):
        self.name = name

    def __str__(self):
        return "Import(name={})".format(self.name)

    def __unicode__(self):
        return self.__str__()


class Tokenizer:
    """Split source text into a list of Token objects.

    The patterns in ``token_types`` are tried in order against the start of
    the remaining text, and the first one that matches wins.
    """

    # Order matters: the keywords must be tried before "identifier", which
    # would otherwise match them too. An OrderedDict guarantees that order on
    # every interpreter version (plain dicts are unordered before Python 3.7).
    token_types = collections.OrderedDict([
        ("def", r"\bdef\b"),
        ("end", r"\bend\b"),
        ("import", r"\bimport\b"),
        ("identifier", r"\b[a-zA-Z]+\b"),
        ("string", r'"[^"]*"'),
        ("integer", r"\b[0-9]+\b"),
        ("open_paren", r"\("),
        ("comma", r","),
        ("close_paren", r"\)"),
    ])

    def __init__(self, code):
        self.code = code
        # The instance attribute shadows the class-level table with compiled
        # patterns, in the same order.
        self.token_types = collections.OrderedDict(
            (token_name, re.compile(token_re))
            for token_name, token_re in Tokenizer.token_types.items()
        )

    def tokenize(self):
        """Return all tokens of ``self.code``, consuming it in the process.

        Whitespace between tokens (and around the whole source) is skipped.

        Raises:
            RuntimeError: if the remaining text matches no token pattern.
        """
        res = []
        # Strip up front as well, so that a source starting with blank lines
        # or indentation is tokenized instead of being rejected.
        self.code = self.code.strip()
        while self.code:
            token = self.tokenize_once()
            if token is None:
                # Fail loudly: silently returning no tokens would make the
                # compiler emit an empty program for any typo in the source.
                raise RuntimeError("Couldn't match token on {!r}".format(self.code[:20]))
            res.append(token)
            self.code = self.code.strip()
        return res

    def tokenize_once(self):
        """Match one token at the start of ``self.code`` and consume it.

        Returns the Token, or None if no pattern matches.
        """
        for token_name, token_re in self.token_types.items():
            match = token_re.match(self.code)
            if match:
                value = match.group()
                self.code = self.code[len(value):]
                return Token(token_name, value)
        return None


class Parser:
    """Recursive-descent parser turning a token sequence into AST nodes."""

    def __init__(self, tokens):
        # Work on a copy: parsing consumes tokens from the front and must not
        # empty the list owned by the caller.
        self.tokens = list(tokens)

    def parse(self):
        """Parse all tokens and return a list of ImportNode / DefNode."""
        res = []
        while self.tokens:
            if self.peek("import"):
                res.append(self.parse_import())
            else:
                res.append(self.parse_def())
        return res

    def parse_def(self):
        """Parse ``def NAME(ARGS) EXPR end`` into a DefNode."""
        self.consume("def")
        name = self.consume("identifier").value
        args = self.parse_arg_names()
        body = self.parse_expr()
        self.consume("end")
        return DefNode(name, args, body)

    def parse_expr(self):
        """Parse an integer, string, call or variable expression."""
        if self.peek("integer"):
            return self.parse_integer()
        if self.peek("string"):
            return self.parse_string()
        # A call is recognised by the parenthesis that follows its name.
        if self.peek("open_paren", 1):
            return self.parse_call()
        return self.parse_variable()

    def parse_import(self):
        """Parse ``import NAME`` into an ImportNode."""
        self.consume("import")
        return ImportNode(self.consume("identifier").value)

    def parse_variable(self):
        """Parse a single identifier into a VarNode."""
        return VarNode(self.consume("identifier").value)

    def parse_call(self):
        """Parse ``NAME(EXPR, ...)`` into a CallNode."""
        name = self.consume("identifier").value
        exprs = self.parse_arg_exprs()
        return CallNode(name, exprs)

    def parse_arg_exprs(self):
        """Parse a parenthesised, comma-separated list of expressions."""
        res = []
        self.consume("open_paren")
        if not self.peek("close_paren"):
            res.append(self.parse_expr())
            while self.peek("comma"):
                self.consume("comma")
                res.append(self.parse_expr())
        self.consume("close_paren")
        return res

    def parse_integer(self):
        """Parse an integer literal into an IntegerNode."""
        return IntegerNode(int(self.consume("integer").value))

    def parse_string(self):
        """Parse a string literal into a StringNode (quotes are kept)."""
        return StringNode(self.consume("string").value)

    def parse_arg_names(self):
        """Parse a parenthesised, comma-separated list of identifiers."""
        res = []
        self.consume("open_paren")
        if self.peek("identifier"):
            res.append(self.consume("identifier").value)
            while self.peek("comma"):
                self.consume("comma")
                res.append(self.consume("identifier").value)
        self.consume("close_paren")
        return res

    def consume(self, expected):
        """Remove and return the next token, which must be of type ``expected``.

        Raises:
            RuntimeError: if the next token has another type, or if the input
                ended (for example a ``def`` without its ``end``).
        """
        if not self.tokens:
            raise RuntimeError("Expected token type {} but got end of input".format(expected))
        token = self.tokens.pop(0)
        if token.name == expected:
            return token
        raise RuntimeError("Expected token type {} but got {}".format(expected, token.name))

    def peek(self, expected, idx=0):
        """Return True if the token ``idx`` positions ahead is ``expected``."""
        return len(self.tokens) > idx and self.tokens[idx].name == expected


class CodeGen:
    """Generate C source text from AST nodes."""

    def generate_all(self, nodes):
        """Return the C code for all top-level ``nodes``.

        Prototypes for every function come first, so that functions may call
        each other regardless of definition order; the definitions (and the
        includes) follow. Every piece is preceded by a newline.
        """
        chunks = [self.generate_proto(node) for node in nodes]
        chunks.extend(self.generate(node) for node in nodes)
        return "".join("\n{}".format(chunk) for chunk in chunks)

    def generate_proto(self, node):
        """Return the C prototype for a DefNode, or "" for any other node."""
        if isinstance(node, DefNode):
            return "{};".format(self._signature(node))
        return ""

    def generate(self, node):
        """Return the C code for ``node``.

        Raises:
            RuntimeError: if ``node`` is not one of the known node classes.
        """
        if isinstance(node, DefNode):
            signature = self._signature(node)
            body = self.generate(node.body)
            return "{} {{ return {}; }}".format(signature, body)
        if isinstance(node, IntegerNode):
            return str(node.value)
        if isinstance(node, CallNode):
            return "{}({})".format(node.name, ", ".join(self.generate(e) for e in node.exprs))
        if isinstance(node, VarNode):
            return node.name
        if isinstance(node, ImportNode):
            return "#include <{}.h>".format(node.name)
        if isinstance(node, StringNode):
            # The token text already carries its double quotes.
            return node.value
        raise RuntimeError("Unexpected node type: {}".format(node.__class__.__name__))

    @staticmethod
    def _signature(node):
        """Return ``int NAME(int ARG, ...)`` for a DefNode."""
        args = ", ".join("int {}".format(n) for n in node.args)
        return "int {}({})".format(node.name, args)


def main(argv=None):
    """Compile the file named by ``argv[1]`` and print the C code.

    ``argv`` defaults to ``sys.argv``. Returns the process exit status.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("usage: {} SOURCE_FILE\n".format(argv[0] if argv else "c.py"))
        return 2
    with open(argv[1]) as f:
        contents = f.read()
    tokens = Tokenizer(contents).tokenize()
    ast = Parser(tokens).parse()
    generated = CodeGen().generate_all(ast)
    print(generated)
    return 0


if __name__ == "__main__":
    sys.exit(main())