Project

General

Profile

Statistics
| Branch: | Revision:

cool / src / lib / altGenlex.ml @ 7c4d2eb4

History | View | Annotate | Download (9.45 KB)

1
(** An alternative for the module Genlex 
2
    because Genlex cannot deal with key words properly.
3
    @author Florian Widmann
4
 *)
5

    
6

    
7
(** The three different kinds of tokens
8
    that the lexer can recognise (cf. Genlex)
9
 *)
10
type altToken =
11
  | Kwd of string
12
  | Ident of string
13
  | Int of int
14

    
15
(** A tree branching on characters that contains all key words.
16
    The boolean variable at every node indicates whether the
17
    corresponding string is a key word itself (if it is not a key word
18
    then it must be a strict prefix of a key word).
19
 *)
20
type kwSTree = (char * kwTree) list
21
and kwTree = KWTree of bool * kwSTree
22

    
23

    
24
(** An instantiation of a Set (of the standard library) for characters.
25
 *)
26
module CSet = Set.Make(Char)
27

    
28
(** Creates a key word tree from a list of key words.
29
    @param keywords A list of strings representing key words.
30
    @return A key word tree stripped by the uppermost node
31
    (see description of type kwSTree).
32
 *)
33
let make_kwSTree keywords =
34
  let rec mk_kwTree i kwl =
35
    let fldList (iskw, cset, kwl1) s =
36
      if String.length s = i then (true, cset, kwl1)
37
      else (iskw, CSet.add s.[i] cset, s::kwl1)
38
    in
39
    let (isKw, cset, kwl1) = List.fold_left fldList (false, CSet.empty, []) kwl in
40
    let fldCSet c acc =
41
      let kwl2 = List.filter (fun s -> s.[i] = c) kwl1 in
42
      let subtree = mk_kwTree (succ i) kwl2 in
43
      (c, subtree)::acc
44
    in
45
    let reslst = CSet.fold fldCSet cset [] in
46
    KWTree (isKw, reslst)
47
  in
48
  match mk_kwTree 0 keywords with
49
  | KWTree (_, subtree) -> subtree
50

    
51

    
52
(** Produces a lexer that can recognise identifiers, integers,
53
    and a given list of key words.
54
    Identifiers are sequences containing "A..Z", "a..z", "0..9", "_", and "'"
55
    that do not start with a digit.
56
    Integers are sequences of "0".."9" and "-"
57
    that contain at least one digit and
58
    have "-" at most at the very beginning of the sequence.
59
    Furthermore the integers should not exceed
60
    the range of integers representable in type int.
61
    @param keywords A list of strings representing key words.
62
    All key words are accepted, but the empty string,
63
    key words starting with a whitespace,
64
    key words starting with a digit, and
65
    key words starting with a "-" followed by a digit
66
    are never matched (see below).
67
    @return A function f that accepts a string
68
    and returns a stream of tokens (of type altToken).
69
    Intuitively, when a new token is requested,
70
    f works as follows on the suffix of the string
71
    that has not yet been tokenised:
72
    It first strips all leading whitespaces.
73
    If the result is the empty string then it returns None;
74
    otherwise if it can match an integer it returns
75
    the int value corresponding to the longest match (in the number of characters);
76
    otherwise it returns the longest identifier or key word that it can match
77
    (if the result is a key word and a identifier, it is returned as key word).
78
    If none of the above applies then f raises a Stream.Error exception.
79
 *)
80
let make_lexer keywords =
81
  let kwstree = make_kwSTree keywords in
82
  let resfkt input =
83
    let lpos = ref 0 in
84
    let len = String.length input in
85
    let retNumber hpos =
86
      let pos = !lpos in
87
      let s = String.sub input pos (hpos - pos) in
88
      lpos := hpos;
89
      Some (Int (int_of_string s))
90
    in
91
    let rec next_number hpos =
92
      if hpos < len then
93
        let c = input.[hpos] in
94
        match c with
95
        | '0'..'9' -> next_number (succ hpos)
96
        | _ -> retNumber hpos  
97
      else retNumber hpos
98
    in
99
    let retToken idpos kwpos hpos =
100
      let pos = !lpos in
101
      if idpos > kwpos then
102
        let s = String.sub input pos (idpos - pos) in
103
        lpos := idpos;
104
        Some (Ident s)
105
      else
106
        if kwpos > pos then
107
          let s = String.sub input pos (kwpos - pos) in
108
          lpos := kwpos;
109
          Some (Kwd s)
110
        else
111
          let s1 = String.sub input pos (hpos - pos) in
112
          raise (Stream.Error ("Illegal token " ^ s1))
113
    in        
114
    let rec next_token hpos kwstr idpos kwpos =
115
      if hpos < len then
116
        let c = input.[hpos] in
117
        let idpos1 =
118
          if idpos = hpos then
119
            match c with
120
            | 'A'..'Z' | 'a'..'z' | '0'..'9' | '_' | '\'' -> succ idpos
121
            | _ -> idpos
122
          else idpos
123
        in
124
        let hpos1 = succ hpos in
125
        let (maybeKw, kwstr1, kwpos1) =
126
          try
127
            match List.assoc c kwstr with
128
            | KWTree (newKw, lst) -> (true, lst, if newKw then hpos1 else kwpos)
129
          with Not_found -> (false, [], kwpos)
130
        in
131
        if idpos1 = hpos1 || maybeKw then next_token hpos1 kwstr1 idpos1 kwpos1
132
        else retToken idpos1 kwpos1 hpos1
133
      else retToken idpos kwpos hpos
134
    in
135
    let rec streamfkt (_ : int) =
136
      let pos = !lpos in
137
      if pos < len then 
138
        match input.[pos] with
139
        | ' ' | '\010' | '\013' | '\009' | '\026' | '\012' -> incr lpos; streamfkt 0
140
        | '0'..'9' -> next_number (succ pos)
141
        | '-' -> 
142
            let npos = succ pos in
143
            if npos < len then
144
              match input.[npos] with
145
              | '0'..'9' -> next_number (succ npos)
146
              | _ -> next_token pos kwstree pos pos
147
            else next_token pos kwstree pos pos
148
        | _ -> next_token pos kwstree pos pos
149
      else None
150
    in
151
    Stream.from streamfkt
152
  in
153
  resfkt
154

    
155

    
156
(** Produces a lexer that can recognise identifiers, integers,
157
    and a given list of key words.
158
    Identifiers are sequences containing "A..Z", "a..z", "0..9", "_", and "'"
159
    that do not start with a digit.
160
    Integers are sequences of "0".."9" and "-"
161
    that contain at least one digit and
162
    have "-" at most at the very beginning of the sequence.
163
    Furthermore the integers should not exceed
164
    the range of integers representable in type int.
165
    @param comment An optional string representing a pseudo-keyword
166
    which indicates that the rest of the line is a comment.
167
    It will be treated as a keyword during matching,
168
    but when it is matched then it and the rest of the line will be ignored,
169
    and the next token is returned.
170
    Note that if there exists a real keyword which is equal to comment
171
    then the keyword will never be matched.
172
    @param keywords A list of strings representing key words.
173
    All key words are accepted, but the empty string,
174
    key words starting with a whitespace,
175
    key words containing a new line character,
176
    key words starting with a digit, and
177
    key words starting with a "-" followed by a digit
178
    are never matched (see below).
179
    @return A function f that accepts a file handler
180
    and returns a stream of tokens (of type altToken).
181
    Intuitively, when a new token is requested,
182
    f works as follows on the remainder of the file
183
    that has not yet been tokenised:
184
    It first strips all leading whitespaces.
185
    If the end of the file is reached then it returns None;
186
    otherwise if it can match an integer it returns
187
    the int value corresponding to the longest match (in the number of characters);
188
    otherwise it returns the longest identifier or key word that it can match
189
    (if the result is a key word and a identifier, it is returned as key word).
190
    If the matched key word is the comment string
191
    then it is not returned but the rest of the line is discarded
192
    and f is (recursively) invoked on the next line of the file.
193
    If none of the above applies then f raises a Stream.Error exception.
194
 *)
195
let make_lexer_file ?comment keywords =
196
  let (excmt, cmt, keywordsAndComment) =
197
    match comment with
198
    | None -> (false, "", keywords)
199
    | Some s -> (true, s, s::keywords)
200
  in
201
  let lexer = make_lexer keywordsAndComment in
202
  let resfkt file =
203
    let ts = ref (lexer "") in
204
    let rec streamfkt (_ : int) =
205
      match Stream.peek !ts with
206
      | None -> begin
207
          try
208
            ts := lexer (input_line file);
209
            streamfkt 0
210
          with End_of_file -> None
211
        end
212
      | Some _ ->
213
          match Stream.next !ts with
214
          | Kwd s when excmt && s = cmt -> begin
215
              try
216
                ts := lexer (input_line file);
217
                streamfkt 0
218
              with End_of_file -> None
219
            end
220
          | t -> Some t
221
    in
222
    Stream.from streamfkt
223
  in
224
  resfkt
225

    
226

    
227
(** Prints a token.
228
    @param t A token.
229
 *)
230
let printToken t =
231
  match t with
232
  | Kwd s -> print_string s
233
  | Ident s -> print_string s
234
  | Int n -> print_int n
235

    
236
(** Prints an error message and the rest of a token stream (if given)
237
    and then raises an exception.
238
    @param exc A function which accepts a string and constructs an exception.
239
    @param topt An optional token.
240
    @param ts An optional token stream.
241
    @param msg An error message
242
    @raise exc(.) This exception is always raised.
243
 *)
244
let printError exc ?t ?ts msg =
245
  let printTkn t =
246
    printToken t;
247
    print_string " "
248
  in
249
  print_string msg;
250
  let _ =
251
    match t with
252
    | None -> ()
253
    | Some t -> printTkn t
254
  in
255
  let _ =
256
    match ts with
257
    | None -> ()
258
    | Some ts -> Stream.iter printTkn ts
259
  in
260
  print_newline ();
261
  raise (exc "Parsing error")
262

    
263

    
264
(** Tries to read a given list of key words from a token stream.
265
    @param exc A function which accepts a string and constructs an exception.
266
    @param ts A token stream.
267
    @param kwl A list of key words.
268
    @raise exc(.) if the next tokens
269
    are not the key words in kwl (in the right order).
270
 *)
271
let rec getKws exc ts kwl =
272
  match kwl with
273
  | [] -> ()
274
  | kw::tl ->
275
      match Stream.next ts with
276
      | Kwd s when s = kw -> getKws exc ts tl
277
      | t -> printError exc ~t ("key word \"" ^ kw ^ "\" expected: ")