path: root/lib/lexing
Diffstat (limited to 'lib/lexing')
-rw-r--r--  lib/lexing/headers.ml  23
-rw-r--r--  lib/lexing/lexer.ml    97
-rw-r--r--  lib/lexing/utils.ml    36
3 files changed, 156 insertions, 0 deletions
diff --git a/lib/lexing/headers.ml b/lib/lexing/headers.ml
new file mode 100644
index 0000000..2452eec
--- /dev/null
+++ b/lib/lexing/headers.ml
@@ -0,0 +1,23 @@
+(* Types for lexing *)
+
+open Types
+
+(* Exceptions to aid control flow for lexer *)
+exception Not_In_Alphabet of char
+exception No_Transition of int * char
+
+(* Tagged Deterministic Finite Automaton *)
+type tdfa = {
+  alphabet : char list;
+  start_state : int;
+  register_init : tok; (* initial contents of the single register for this TDFA *)
+  delta : int * char -> int * (tok -> tok)
+}
+
+(* Description of a possible state that the TDFA could be in *)
+type tdfa_state = {
+  state_id : int;       (* current state *)
+  register : tok;       (* contents of register *)
+  tokens : tok list;    (* tokens emitted so far *)
+  to_parse : char list; (* characters left to parse *)
+}
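The delta field returns both a successor state and a register-updating function; the single register accumulates the token for the lexeme currently being matched, which is what makes the automaton "tagged". A minimal sketch of an instance (not part of this commit; it assumes only the LIT and SKIP constructors of tok, which lexer.ml below also relies on):

    (* Toy TDFA that tags runs of 'a' as a single LIT token; 'b' is in the
       alphabet but has no transition, so a driver would flush the register there *)
    let toy_tdfa : tdfa = {
      alphabet = ['a'; 'b'];
      start_state = 0;
      register_init = SKIP;
      delta = (fun (i, c) ->
        match i, c with
        | 0, 'a' | 1, 'a' ->
          1, (function LIT s -> LIT (s ^ "a") | _ -> LIT "a")
        | _ -> raise (No_Transition (i, c)))
    }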
diff --git a/lib/lexing/lexer.ml b/lib/lexing/lexer.ml
new file mode 100644
index 0000000..af24e49
--- /dev/null
+++ b/lib/lexing/lexer.ml
@@ -0,0 +1,97 @@
+(* Lexer for transforming an input string into a list of tokens *)
+
+open Types
+open Headers
+open Utils
+
+(*
+  Ordered lexer rules (bracketed numbers are the TDFA states that recognise each rule):
+
+  true            => TRUE        [1,2,3,4]
+  false           => FALSE       [5,6,7,8,9]
+  [a-zA-Z]+ as s  => LIT s       [10]
+  !  (¬)          => NOT         [11]
+  &  (∧)          => AND         [12]
+  |  (∨)          => OR          [13]
+  >  (→)          => IMPLIES     [14]
+  =  (↔)          => IFF         [15]
+  (               => LEFT_PAREN  [16]
+  )               => RIGHT_PAREN [17]
+  [ \t\n]         => SKIP        [18]
+*)
+
+(* Define a TDFA *)
+let my_tdfa : tdfa =
+  let my_alphabet =
+    let lowercase = List.init 26 (fun i -> Char.chr (Char.code 'a' + i)) in
+    let uppercase = List.init 26 (fun i -> Char.chr (Char.code 'A' + i)) in
+    ['!'; '&'; '|'; '>'; '='; '('; ')'; ' '; '\t'; '\n'] @ lowercase @ uppercase
+  in
+
+  let my_delta (i, c) =
+    if
+      not (List.mem c my_alphabet)
+    then
+      raise (Not_In_Alphabet c)
+    else
+      let append_lit = function
+        | LIT s -> LIT (s ^ Char.escaped c)
+        | _ -> LIT (Char.escaped c)
+      in
+
+      let is_ltr c =
+        let ascii = Char.code c in
+        (65 <= ascii && ascii <= 90) || (97 <= ascii && ascii <= 122)
+      in
+
+      match i, c with
+      | 0, '!' -> 11, (fun _ -> NOT)
+      | 0, '&' -> 12, (fun _ -> AND)
+      | 0, '|' -> 13, (fun _ -> OR)
+      | 0, '>' -> 14, (fun _ -> IMPLIES)
+      | 0, '=' -> 15, (fun _ -> IFF)
+      | 0, '(' -> 16, (fun _ -> LEFT_PAREN)
+      | 0, ')' -> 17, (fun _ -> RIGHT_PAREN)
+      | 0, ' ' -> 18, (fun _ -> SKIP)
+      | 0, '\t' -> 18, (fun _ -> SKIP)
+      | 0, '\n' -> 18, (fun _ -> SKIP)
+      | 0, 't' -> 1, append_lit
+      | 0, 'f' -> 5, append_lit
+      | 0, c when is_ltr c -> 10, append_lit
+      | 1, 'r' -> 2, append_lit
+      | 1, c when is_ltr c -> 10, append_lit
+      | 2, 'u' -> 3, append_lit
+      | 2, c when is_ltr c -> 10, append_lit
+      | 3, 'e' -> 4, (fun _ -> TRUE)
+      | 3, c when is_ltr c -> 10, append_lit
+      | 4, c when is_ltr c -> 10, (fun _ -> LIT ("true" ^ Char.escaped c))
+      | 5, 'a' -> 6, append_lit
+      | 5, c when is_ltr c -> 10, append_lit
+      | 6, 'l' -> 7, append_lit
+      | 6, c when is_ltr c -> 10, append_lit
+      | 7, 's' -> 8, append_lit
+      | 7, c when is_ltr c -> 10, append_lit
+      | 8, 'e' -> 9, (fun _ -> FALSE)
+      | 8, c when is_ltr c -> 10, append_lit
+      | 9, c when is_ltr c -> 10, (fun _ -> LIT ("false" ^ Char.escaped c))
+      | 10, c when is_ltr c -> 10, append_lit
+      | _ -> raise (No_Transition (i, c))
+  in
+
+  {
+    alphabet = my_alphabet;
+    start_state = 0;
+    register_init = SKIP;
+    delta = my_delta
+  }
+
+
+let lex (s : string) : tok list =
+  let to_parse = s |> String.to_seq |> List.of_seq in
+  let initial_tdfa_state = {
+    state_id = my_tdfa.start_state;
+    register = my_tdfa.register_init;
+    tokens = [];
+    to_parse = to_parse
+  } in
+  driver my_tdfa initial_tdfa_state
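A usage sketch for lex; the expected output below is traced by hand from the transition table above, not taken from the commit's tests:

    (* A propositional formula using the ASCII encodings of the connectives;
       whitespace is lexed to SKIP, which utils.ml filters out of the stream *)
    let tokens = lex "p & !q > true"
    (* tokens = [LIT "p"; AND; NOT; LIT "q"; IMPLIES; TRUE] *)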
diff --git a/lib/lexing/utils.ml b/lib/lexing/utils.ml
new file mode 100644
index 0000000..aa218f5
--- /dev/null
+++ b/lib/lexing/utils.ml
@@ -0,0 +1,36 @@
+open Types
+open Headers
+
+(* Function for stepping through a TDFA computation *)
+let step (automaton : tdfa) (state : tdfa_state) : tdfa_state =
+  match state.to_parse with
+  | [] -> state
+  | c :: cs ->
+    try
+      let next_state_id, f = automaton.delta (state.state_id, c) in
+      let next_tok = f state.register in
+      {
+        state_id = next_state_id;
+        register = next_tok;
+        tokens = state.tokens;
+        to_parse = cs
+      }
+    with No_Transition (_, c) -> (* lexeme ended: flush the register (unless SKIP) and restart from state 0 on c *)
+      let token_to_add = state.register in
+      let next_state_id, f = automaton.delta (0, c) in
+      let next_tok = f SKIP in
+      {
+        state_id = next_state_id;
+        register = next_tok;
+        tokens = if token_to_add <> SKIP then token_to_add :: state.tokens else state.tokens;
+        to_parse = cs
+      }
+
+(* Driver to perform a TDFA computation *)
+let rec driver (automaton : tdfa) (state : tdfa_state) : tok list =
+  if
+    state.to_parse = []
+  then (* emit the final tag, dropping a trailing SKIP just as step does *)
+    List.rev (if state.register = SKIP then state.tokens else state.register :: state.tokens)
+  else
+    driver automaton (step automaton state)
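The exception handler in step is what turns the single register into a token stream: while delta has a transition, the current lexeme keeps growing inside the register; the first character with no transition ends the lexeme, flushes the register onto tokens (SKIP excepted), and restarts the automaton from state 0 on that same character. A hand-traced sketch of one such flush, using my_tdfa from lexer.ml above (the commented value is inferred from step's definition, not program output):

    (* In state 10 holding LIT "p", the next character '&' has no transition,
       so LIT "p" is flushed and '&' restarts the automaton into state 12 (AND) *)
    let s0 = { state_id = 10; register = LIT "p"; tokens = []; to_parse = ['&'] }
    let s1 = step my_tdfa s0
    (* s1 = { state_id = 12; register = AND; tokens = [LIT "p"]; to_parse = [] } *)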