diff options
Diffstat (limited to 'lib/lexing')
-rw-r--r-- | lib/lexing/headers.ml | 23 | ||||
-rw-r--r-- | lib/lexing/lexer.ml | 97 | ||||
-rw-r--r-- | lib/lexing/utils.ml | 36 |
3 files changed, 156 insertions, 0 deletions
diff --git a/lib/lexing/headers.ml b/lib/lexing/headers.ml new file mode 100644 index 0000000..2452eec --- /dev/null +++ b/lib/lexing/headers.ml @@ -0,0 +1,23 @@ +(* Types for lexing *) + +open Types + +(* Exceptions to aid control flow for lexer *) +exception Not_In_Alphabet of char +exception No_Transition of int * char + +(* Tagged Deterministice Finite Automaton *) +type tdfa = { + alphabet : char list; + start_state: int; + register_init : tok; (* initial contents of the single register for this TDFA *) + delta : int * char -> int * (tok -> tok) +} + +(* Description of a possible state that the TDFA could be in *) +type tdfa_state = { + state_id : int; (* current state *) + register : tok; (* contents of register *) + tokens : tok list; (* tokens emitted so far *) + to_parse : char list; (* characters left to parse *) +} diff --git a/lib/lexing/lexer.ml b/lib/lexing/lexer.ml new file mode 100644 index 0000000..af24e49 --- /dev/null +++ b/lib/lexing/lexer.ml @@ -0,0 +1,97 @@ +(* Lexer for transforming an input string into a list of tokens *) + +open Types +open Headers +open Utils + +(* + Ordered lexer rules: + + true => TRUE [1,2,3,4] + false => FALSE [5,6,7,8,9] + [a-zA-Z]+ as s => LIT s [10] + ¬ => NOT [11] + ∧ => AND [12] + ∨ => OR [13] + → => IMPLIES [14] + ↔ => IFF [15] + ( => LEFT_PAREN [16] + ) => RIGHT_PAREN [17] + [ \t\n] => SKIP [18] +*) + +(* Define a TDFA *) +let my_tdfa : tdfa = + let my_alphabet = + let lowercase = List.init 26 (fun i -> Char.chr (Char.code 'a' + i)) in + let uppercase = List.init 26 (fun i -> Char.chr (Char.code 'A' + i)) in + ['!'; '&'; '|'; '>'; '='; '('; ')'; ' '; '\t'; '\n'] @ lowercase @ uppercase + in + + let my_delta (i, c) = + if + not (List.mem c my_alphabet) + then + raise (Not_In_Alphabet c) + else + let append_lit = function + | LIT s -> LIT (s ^ Char.escaped c) + | _ -> LIT (Char.escaped c) + in + + let is_ltr c = + let ascii = Char.code c in + (65 <= ascii && ascii <= 90) || (97 <= ascii && ascii <= 122) + in + + match i, c with + | 0, '!' -> 11, (fun _ -> NOT) + | 0, '&' -> 12, (fun _ -> AND) + | 0, '|' -> 13, (fun _ -> OR) + | 0, '>' -> 14, (fun _ -> IMPLIES) + | 0, '=' -> 15, (fun _ -> IFF) + | 0, '(' -> 16, (fun _ -> LEFT_PAREN) + | 0, ')' -> 17, (fun _ -> RIGHT_PAREN) + | 0, ' ' -> 18, (fun _ -> SKIP) + | 0, '\t' -> 18, (fun _ -> SKIP) + | 0, '\n' -> 18, (fun _ -> SKIP) + | 0, 't' -> 1, append_lit + | 0, 'f' -> 5, append_lit + | 0, c when is_ltr c -> 10, append_lit + | 1, 'r' -> 2, append_lit + | 1, c when is_ltr c -> 10, append_lit + | 2, 'u' -> 3, append_lit + | 2, c when is_ltr c -> 10, append_lit + | 3, 'e' -> 4, (fun _ -> TRUE) + | 3, c when is_ltr c -> 10, append_lit + | 4, c when is_ltr c -> 10, (fun _ -> (LIT ("true" ^ Char.escaped c))) + | 5, 'a' -> 6, append_lit + | 5, c when is_ltr c -> 10, append_lit + | 6, 'l' -> 7, append_lit + | 6, c when is_ltr c -> 10, append_lit + | 7, 's' -> 8, append_lit + | 7, c when is_ltr c -> 10, append_lit + | 8, 'e' -> 9, (fun _ -> FALSE) + | 8, c when is_ltr c -> 10, append_lit + | 9, c when is_ltr c -> 10, (fun _ -> (LIT ("false" ^ Char.escaped c))) + | 10, c when is_ltr c -> 10, append_lit + | _ -> raise (No_Transition (i, c)) + in + + { + alphabet = my_alphabet; + start_state = 0; + register_init = SKIP; + delta = my_delta + } + + +let lex (s : string) : tok list = + let to_parse = s |> String.to_seq |> List.of_seq in + let initial_tdfa_state = { + state_id = my_tdfa.start_state; + register = my_tdfa.register_init; + tokens = []; + to_parse = to_parse + } in + driver my_tdfa initial_tdfa_state diff --git a/lib/lexing/utils.ml b/lib/lexing/utils.ml new file mode 100644 index 0000000..aa218f5 --- /dev/null +++ b/lib/lexing/utils.ml @@ -0,0 +1,36 @@ +open Types +open Headers + +(* Function for stepping through a TDFA computation *) +let step (automaton : tdfa) (state : tdfa_state) : tdfa_state = + match state.to_parse with + | [] -> state + | c :: cs -> + try + let next_state_id, f = automaton.delta (state.state_id, c) in + let next_tok = f state.register in + { + state_id = next_state_id; + register = next_tok; + tokens = state.tokens; + to_parse = cs + } + with (No_Transition (_, c)) -> + let token_to_add = state.register in + let next_state_id, f = automaton.delta (0, c) in + let next_tok = f SKIP in + { + state_id = next_state_id; + register = next_tok; + tokens = if token_to_add <> SKIP then token_to_add :: state.tokens else state.tokens; + to_parse = cs + } + +(* Driver to perform a TDFA computation *) +let rec driver (automaton : tdfa) (state: tdfa_state) : tok list = + if + state.to_parse = [] + then + List.rev (state.register :: state.tokens) (* ensure that final tag is emitted *) + else + driver automaton (step automaton state) |