summaryrefslogtreecommitdiff
path: root/lib/lexing/lexer.ml
blob: af24e491ab8560a4e9f7f2ee12e7b5145915ce38 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
(* Lexer for transforming an input string into a list of tokens *)

open Types
open Headers
open Utils

(*
  Ordered lexer rules:

  true            => TRUE         [1,2,3,4]
  false           => FALSE        [5,6,7,8,9]
  [a-zA-Z]+ as s  => LIT s        [10]
  ¬               => NOT          [11]
  ∧               => AND          [12]
  ∨               => OR           [13]
  →               => IMPLIES      [14]
  ↔               => IFF          [15]
  (               => LEFT_PAREN   [16]
  )               => RIGHT_PAREN  [17]
  [ \t\n]         => SKIP         [18]
*)

(* Define a TDFA *)
let my_tdfa : tdfa =
  let my_alphabet =
    let lowercase = List.init 26 (fun i -> Char.chr (Char.code 'a' + i)) in
    let uppercase = List.init 26 (fun i -> Char.chr (Char.code 'A' + i)) in
    ['!'; '&'; '|'; '>'; '='; '('; ')'; ' '; '\t'; '\n'] @ lowercase @ uppercase
  in

  let my_delta (i, c) =
    if
      not (List.mem c my_alphabet)
    then
      raise (Not_In_Alphabet c)
    else
      let append_lit = function
          | LIT s -> LIT (s ^ Char.escaped c)
          | _ -> LIT (Char.escaped c)
      in

      let is_ltr c =
        let ascii = Char.code c in
        (65 <= ascii && ascii <= 90) || (97 <= ascii && ascii <= 122)
      in

      match i, c with
      | 0, '!'                  -> 11, (fun _ -> NOT)
      | 0, '&'                  -> 12, (fun _ -> AND)
      | 0, '|'                  -> 13, (fun _ -> OR)
      | 0, '>'                  -> 14, (fun _ -> IMPLIES)
      | 0, '='                  -> 15, (fun _ -> IFF)
      | 0, '('                  -> 16, (fun _ -> LEFT_PAREN)
      | 0, ')'                  -> 17, (fun _ -> RIGHT_PAREN)
      | 0, ' '                  -> 18, (fun _ -> SKIP)
      | 0, '\t'                 -> 18, (fun _ -> SKIP)
      | 0, '\n'                 -> 18, (fun _ -> SKIP)
      | 0, 't'                  -> 1, append_lit
      | 0, 'f'                  -> 5, append_lit
      | 0, c when is_ltr c      -> 10, append_lit
      | 1, 'r'                  -> 2, append_lit
      | 1, c when is_ltr c      -> 10, append_lit
      | 2, 'u'                  -> 3, append_lit
      | 2, c when is_ltr c      -> 10, append_lit
      | 3, 'e'                  -> 4, (fun _ -> TRUE)
      | 3, c when is_ltr c      -> 10, append_lit
      | 4, c when is_ltr c      -> 10, (fun _ -> (LIT ("true" ^ Char.escaped c)))
      | 5, 'a'                  -> 6, append_lit
      | 5, c when is_ltr c      -> 10, append_lit
      | 6, 'l'                  -> 7, append_lit
      | 6, c when is_ltr c      -> 10, append_lit
      | 7, 's'                  -> 8, append_lit
      | 7, c when is_ltr c      -> 10, append_lit
      | 8, 'e'                  -> 9, (fun _ -> FALSE)
      | 8, c when is_ltr c      -> 10, append_lit
      | 9, c when is_ltr c      -> 10, (fun _ -> (LIT ("false" ^ Char.escaped c)))
      | 10, c when is_ltr c     -> 10, append_lit
      | _                       -> raise (No_Transition (i, c))
  in

  {
    alphabet = my_alphabet;
    start_state = 0;
    register_init = SKIP;
    delta = my_delta
  }


let lex (s : string) : tok list =
  let to_parse = s |> String.to_seq |> List.of_seq in
  let initial_tdfa_state = {
    state_id = my_tdfa.start_state;
    register = my_tdfa.register_init;
    tokens = [];
    to_parse = to_parse
  } in
  driver my_tdfa initial_tdfa_state