commit fdeeb209b17b46588ed479a57b8e5f981c42b4d4
Author: hellerve
Date:   Tue May 30 07:00:27 2017 -0400

    initial

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..41226d5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+_build/
+*.byte
+*.native
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f61bb9d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+# micro
+
+A minimal compiler in ML, as seen in [this tutorial](http://troydm.github.io/blog/2014/03/29/writing-micro-compiler-in-ocaml/).
+
+Work in progress.
diff --git a/src/codegen.ml b/src/codegen.ml
new file mode 100644
index 0000000..6f6cd23
--- /dev/null
+++ b/src/codegen.ml
@@ -0,0 +1,68 @@
+(* Assembly code generator: tracks stack slots for variables and writes
+   instructions to the output .s file. *)
+
+(* [vars] maps a variable name to its offset from esp, [file] is the path
+   of the assembly file being written and [chan] is its output channel. *)
+type generator = { vars: (string, int) Hashtbl.t; file: string; chan: out_channel }
+
+(* Create a generator that writes to [file] with its extension replaced by
+   ".s". Raises Invalid_argument if [file] has no extension and Sys_error if
+   the output file cannot be opened. *)
+let new_generator file =
+  let fs = (Filename.chop_extension file) ^ ".s" in
+  { vars=Hashtbl.create 100; file=fs; chan=open_out fs }
+
+(* Close the generator's output channel. *)
+let close_generator g = close_out g.chan
+
+(* Emit one line of assembly text. *)
+let gen g v = output_string g.chan v; output_string g.chan "\n"
+
+(* First offset past every allocated variable (slots are 4 apart);
+   0 when nothing is allocated yet. The first argument is unused. *)
+let bottom_var _ g =
+  Hashtbl.fold (fun _ v c -> if v >= c then (v+4) else c) g.vars 0
+
+(* Offset of the [i]-th (1-based) free slot above the allocated variables. *)
+let empty_var s g i = (bottom_var s g) + 4 * (i - 1)
+
+(* Operand text "[esp+N]" for variable [v]. Names starting with "__temp"
+   carry their offset in the name itself; any other name is looked up in
+   [g.vars]. Raises Token.Syntax_error if [v] was never allocated. *)
+let var_addr s g v =
+  if String.length v > 6 && String.sub v 0 6 = "__temp"
+  then
+    let i = String.sub v 6 ((String.length v) - 6) in "[esp+" ^ i ^ "]"
+  else
+    try "[esp+" ^ string_of_int (Hashtbl.find g.vars v) ^ "]"
+    with Not_found -> Token.syntax_error s ("identifier " ^ v ^ " not defined")
+
+(* Like [var_addr], prefixed with the "dword" size specifier. *)
+let var s g v = "dword " ^ (var_addr s g v)
+
+(* Identifier token naming the [i]-th free temporary slot. *)
+let temp_var s g i =
+  Token.Identifier ("__temp" ^ (string_of_int (empty_var s g i)))
+
+(* Whether [v] already has a stack slot. The first argument is unused. *)
+let is_alloc_var _ g v = Hashtbl.mem g.vars v
+
+(* Operand for [v], assigning it the next free slot first if needed. *)
+let alloc_var s g v =
+  if not (is_alloc_var s g v) then Hashtbl.replace g.vars v (empty_var s g 1);
+  var s g v
+
+(* Operand for an identifier token; any other token is a syntax error. *)
+let token_var s g v =
+  match v with
+  | Token.Identifier i -> var s g i
+  | _ -> Token.syntax_error s "identifier expected"
+
+(* Emit a one-operand instruction. *)
+let unop g opcode a = gen g (" " ^ opcode ^ " " ^ a)
+
+(* Emit a two-operand instruction. *)
+let binop g opcode a b = gen g (" " ^ opcode ^ " " ^ a ^ ", " ^ b)
+
+(* Emit a push of operand [a]. *)
+let push g a = unop g "push" a
diff --git a/src/compile.ml b/src/compile.ml
new file mode 100644
index 0000000..43f4d99
--- /dev/null
+++ b/src/compile.ml
@@ -0,0 +1,45 @@
+(* Command-line driver: compile a source file to assembly, then assemble
+   and link it with nasm and gcc. *)
+
+(* Compile [file]: write "<base>.s", assemble it with nasm and link the
+   "<base>.o" nasm is expected to produce into the executable "<base>".
+   Syntax errors and Sys_error failures are reported on stdout. *)
+let compile file =
+  try
+    let out = Filename.chop_extension file in
+    (* Open the input first so a missing source does not leave an empty
+       assembly file behind. *)
+    let s = Stream.open_stream file in
+    let g =
+      try Codegen.new_generator file
+      with e -> (Stream.close_stream s; raise e) in
+    let close_all () = (Stream.close_stream s; Codegen.close_generator g) in
+    (* Release both channels even when parsing fails. *)
+    (try Parse.parse s g with e -> (close_all (); raise e));
+    close_all ();
+    (* Quote paths so names with spaces or shell metacharacters are safe. *)
+    let asm = Filename.quote g.Codegen.file in
+    let obj = Filename.quote (out ^ ".o") in
+    let exe = Filename.quote out in
+    (* Only link when the assembler succeeded. *)
+    if Sys.command ("nasm -f macho " ^ asm) = 0
+    then ignore (Sys.command ("gcc -o " ^ exe ^ " " ^ obj))
+  with
+  | Token.Syntax_error e ->
+      Printf.printf "syntax error: %s\n" e
+  | Sys_error _ ->
+      print_string "no file found\n"
+
+(* Print the program name as a (minimal) usage line. *)
+let help name = Printf.printf "%s \n" name
+
+(* Entry point: with no argument print help, otherwise compile argv[1]. *)
+let () =
+  if Array.length Sys.argv = 1
+  then help Sys.argv.(0)
+  else begin
+    let file = Sys.argv.(1) in
+    (* %! flushes so the message precedes output from nasm and gcc. *)
+    Printf.printf "compiling %s\n%!" file;
+    compile file
+  end
diff --git a/src/parse.ml b/src/parse.ml
new file mode 100644
index 0000000..8be0fa4
--- /dev/null
+++ b/src/parse.ml
@@ -0,0 +1,44 @@
+(* Recursive-descent parser for the micro language. *)
+
+(* NOTE(review): [next_token], [read], [write], [assignment],
+   [generate_begin] and [generate_end] are not defined anywhere in this
+   commit, and neither are [Token.new_scanner] and [Token.match_token];
+   this module cannot build until they are added. *)
+
+(* Parse a single statement. Returns [true] when the statement parser
+   succeeded and a semicolon followed, [false] when no statement was
+   parsed. Raises Token.Syntax_error on a missing semicolon. *)
+let statement s g =
+  let parsed =
+    match next_token s with
+    | Token.Read -> read s g
+    | Token.Write -> write s g
+    | Token.Identifier _ -> assignment s g
+    | _ -> false
+  in
+  if not parsed then false
+  else if Token.match_token s Token.Semicolon then true
+  else Token.syntax_error s "statement must end with semicolon"
+
+(* Keep parsing statements until [statement] returns [false]. *)
+let rec statements s g = if statement s g then statements s g
+
+(* Parse a whole program: "begin", a statement list, then "end". *)
+let program s g =
+  if not (Token.match_token s Token.Begin) then
+    Token.syntax_error s "program should start with begin keyword";
+  let _ = generate_begin s g in
+  statements s g;
+  if not (Token.match_token s Token.End) then
+    Token.syntax_error s "program should end with end keyword";
+  let _ = generate_end s g in
+  ()
+
+(* Entry point: scan tokens from [stm] and emit code through [g]. Running
+   out of input before the program is complete is reported as a
+   Token.Syntax_error. *)
+let parse stm g =
+  let s = Token.new_scanner stm in
+  try program s g
+  with End_of_file ->
+    Token.syntax_error s "program reached end of file before end keyword"
diff --git a/src/stream.ml b/src/stream.ml
new file mode 100644
index 0000000..453c044
--- /dev/null
+++ b/src/stream.ml
@@ -0,0 +1,38 @@
+(* Character stream with one character of pushback and line counting. *)
+
+(* [chr] holds a pushed-back character, if any; [line_num] is the line
+   counter, starting at 1; [chan] is the underlying input channel. *)
+type stream = { mutable chr: char option; mutable line_num: int; chan: in_channel }
+
+(* Open [file] for reading. Raises Sys_error if it cannot be opened. *)
+let open_stream file = { chr=None; line_num=1; chan=open_in file }
+
+(* Close the underlying channel. *)
+let close_stream stm = close_in stm.chan
+
+(* Return the next character, preferring a pushed-back one. Newlines read
+   from the channel advance [line_num]; a pushed-back newline is not counted
+   twice. Raises End_of_file when the channel is exhausted. *)
+let read_char stm =
+  match stm.chr with
+  | Some c -> stm.chr <- None; c
+  | None ->
+      let c = input_char stm.chan in
+      if c = '\n' then stm.line_num <- stm.line_num + 1;
+      c
+
+(* Push [c] back so the next [read_char] returns it. Only one character is
+   remembered. *)
+let unread_char stm c = stm.chr <- Some c
+
+(* Whether [c] is an ASCII decimal digit. *)
+let is_digit c =
+  match c with
+  | '0' .. '9' -> true
+  | _ -> false
+
+(* Whether [c] is an ASCII letter. *)
+let is_alpha c =
+  match c with
+  | 'A' .. 'Z' | 'a' .. 'z' -> true
+  | _ -> false
diff --git a/src/token.ml b/src/token.ml
new file mode 100644
index 0000000..16cfec3
--- /dev/null
+++ b/src/token.ml
@@ -0,0 +1,87 @@
+(* Tokens of the micro language and the scanner that produces them. *)
+
+type token = Begin
+           | End
+           | Identifier of string
+           | Read
+           | Write
+           | Literal of int
+           | Assign
+           | LeftParen
+           | RightParen
+           | Add
+           | Sub
+           | Comma
+           | Semicolon
+
+(* [stm] is the character source. [last_token] is presumably a one-token
+   lookahead cache; nothing in this file reads or writes it yet. *)
+type scanner = { mutable last_token: token option; stm: Stream.stream }
+
+(* Raised for every lexical or syntactic error; the payload is the message
+   including the line number. *)
+exception Syntax_error of string
+
+(* Raise Syntax_error with [msg] followed by the scanner's current line. *)
+let syntax_error s msg =
+  raise (Syntax_error (msg ^" on line " ^ (string_of_int s.stm.Stream.line_num)))
+
+(* Consume spaces, tabs, carriage returns and newlines, leaving the first
+   other character pushed back on [stm]. Raises End_of_file if the input
+   ends first. *)
+let rec skip_blank_chars stm =
+  match Stream.read_char stm with
+  | ' ' | '\t' | '\r' | '\n' -> skip_blank_chars stm
+  | c -> Stream.unread_char stm c
+
+(* Scan one token starting at the current character. Keywords are matched
+   case-insensitively. Leading blanks are not skipped here; presumably
+   callers run [skip_blank_chars] first. Raises Syntax_error on an unknown
+   character or an out-of-range integer literal, and End_of_file when no
+   character is left to start a token. *)
+let scan s =
+  let stm = s.stm in
+  (* Append characters to [buf] while [accept] holds. The first rejected
+     character is pushed back; end of input simply ends the lexeme so a
+     token on the very last byte of the file is still returned. *)
+  let rec take_while accept buf =
+    match Stream.read_char stm with
+    | nc when accept nc -> Buffer.add_char buf nc; take_while accept buf
+    | nc -> Stream.unread_char stm nc
+    | exception End_of_file -> ()
+  in
+  let lexeme first accept =
+    let buf = Buffer.create 16 in
+    Buffer.add_char buf first;
+    take_while accept buf;
+    Buffer.contents buf
+  in
+  let is_iden_char ch = Stream.is_alpha ch || Stream.is_digit ch || ch = '_' in
+  let keyword_or_identifier name =
+    match String.lowercase_ascii name with
+    | "begin" -> Begin
+    | "end" -> End
+    | "read" -> Read
+    | "write" -> Write
+    | _ -> Identifier name
+  in
+  let literal digits =
+    (* int_of_string fails on literals that do not fit in an int. *)
+    match int_of_string digits with
+    | n -> Literal n
+    | exception Failure _ ->
+        syntax_error s ("integer literal " ^ digits ^ " is out of range")
+  in
+  let c = Stream.read_char stm in
+  if Stream.is_alpha c then keyword_or_identifier (lexeme c is_iden_char)
+  else if Stream.is_digit c then literal (lexeme c Stream.is_digit)
+  else
+    match c with
+    | '+' -> Add
+    | '-' -> Sub
+    | ',' -> Comma
+    | ';' -> Semicolon
+    | '(' -> LeftParen
+    | ')' -> RightParen
+    | ':' when Stream.read_char stm = '=' -> Assign
+    | _ -> syntax_error s "Could not identify token"