KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > java_cup > lexer


1 package java_cup;
2
3 import java_cup.runtime.Symbol;
4 import java.util.Hashtable JavaDoc;
5 import java.io.InputStream JavaDoc;
6
7 /** This class implements a small scanner (aka lexical analyzer or lexer) for
8  * the JavaCup specification. This scanner reads characters from the input
9  * stream and returns integers corresponding to the terminal
10  * number of the next Symbol. Once end of input is reached the EOF Symbol is
11  * returned on every subsequent call.<p>
12  * Symbols currently returned include: <pre>
13  * Symbol Constant Returned Symbol Constant Returned
14  * ------ ----------------- ------ -----------------
15  * "package" PACKAGE "import" IMPORT
16  * "code" CODE "action" ACTION
17  * "parser" PARSER "terminal" TERMINAL
18  * "non" NON "init" INIT
19  * "scan" SCAN "with" WITH
20  * "start" START "precedence" PRECEDENCE
21  * "left" LEFT "right" RIGHT
22  * "nonassoc" NONASSOC "%prec PRECENT_PREC
23  * [ LBRACK ] RBRACK
24  * ; SEMI
25  * , COMMA * STAR
26  * . DOT : COLON
27  * ::= COLON_COLON_EQUALS | BAR
28  * identifier ID {:...:} CODE_STRING
29  * "nonterminal" NONTERMINAL
30  * </pre>
31  * All symbol constants are defined in sym.java which is generated by
32  * JavaCup from parser.cup.<p>
33  *
34  * In addition to the scanner proper (called first via init() then with
35  * next_token() to get each Symbol) this class provides simple error and
36  * warning routines and keeps a count of errors and warnings that is
37  * publicly accessible.<p>
38  *
39  * This class is "static" (i.e., it has only static members and methods).
40  *
41  * @version last updated: 7/3/96
42  * @author Frank Flannery
43  */

44 public class lexer {
45
46   /*-----------------------------------------------------------*/
47   /*--- Constructor(s) ----------------------------------------*/
48   /*-----------------------------------------------------------*/
49
50   /** The only constructor is private, so no instances can be created. */
51   private lexer() { }
52
53   /*-----------------------------------------------------------*/
54   /*--- Static (Class) Variables ------------------------------*/
55   /*-----------------------------------------------------------*/
56
57   /** First character of lookahead. */
58   protected static int next_char;
59
60   /** Second character of lookahead. */
61   protected static int next_char2;
62
63   /** Second character of lookahead. */
64   protected static int next_char3;
65
66   /** Second character of lookahead. */
67   protected static int next_char4;
68
69   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
70
71   /** EOF constant. */
72   protected static final int EOF_CHAR = -1;
73
74   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
75
76   /** Table of keywords. Keywords are initially treated as identifiers.
77    * Just before they are returned we look them up in this table to see if
78    * they match one of the keywords. The string of the name is the key here,
79    * which indexes Integer objects holding the symbol number.
80    */

81   protected static Hashtable JavaDoc keywords = new Hashtable JavaDoc(23);
82
83   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
84
85   /** Table of single character symbols. For ease of implementation, we
86    * store all unambiguous single character Symbols in this table of Integer
87    * objects keyed by Integer objects with the numerical value of the
88    * appropriate char (currently Character objects have a bug which precludes
89    * their use in tables).
90    */

91   protected static Hashtable JavaDoc char_symbols = new Hashtable JavaDoc(11);
92
93   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
94
95   /** Current line number for use in error messages. */
96   protected static int current_line = 1;
97
98   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
99
100   /** Character position in current line. */
101   protected static int current_position = 1;
102
103   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
104
105   /** Character position in current line. */
106   protected static int absolute_position = 1;
107
108   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
109
110   /** Count of total errors detected so far. */
111   public static int error_count = 0;
112
113   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
114
115   /** Count of warnings issued so far */
116   public static int warning_count = 0;
117
118   /*-----------------------------------------------------------*/
119   /*--- Static Methods ----------------------------------------*/
120   /*-----------------------------------------------------------*/
121
122   /** Initialize the scanner. This sets up the keywords and char_symbols
123     * tables and reads the first two characters of lookahead.
124     */

125   public static void init() throws java.io.IOException JavaDoc
126     {
127       /* set up the keyword table */
128       keywords.put("package", new Integer JavaDoc(sym.PACKAGE));
129       keywords.put("import", new Integer JavaDoc(sym.IMPORT));
130       keywords.put("code", new Integer JavaDoc(sym.CODE));
131       keywords.put("action", new Integer JavaDoc(sym.ACTION));
132       keywords.put("parser", new Integer JavaDoc(sym.PARSER));
133       keywords.put("terminal", new Integer JavaDoc(sym.TERMINAL));
134       keywords.put("non", new Integer JavaDoc(sym.NON));
135       keywords.put("nonterminal",new Integer JavaDoc(sym.NONTERMINAL));// [CSA]
136
keywords.put("init", new Integer JavaDoc(sym.INIT));
137       keywords.put("scan", new Integer JavaDoc(sym.SCAN));
138       keywords.put("with", new Integer JavaDoc(sym.WITH));
139       keywords.put("start", new Integer JavaDoc(sym.START));
140       keywords.put("precedence", new Integer JavaDoc(sym.PRECEDENCE));
141       keywords.put("left", new Integer JavaDoc(sym.LEFT));
142       keywords.put("right", new Integer JavaDoc(sym.RIGHT));
143       keywords.put("nonassoc", new Integer JavaDoc(sym.NONASSOC));
144
145       /* set up the table of single character symbols */
146       char_symbols.put(new Integer JavaDoc(';'), new Integer JavaDoc(sym.SEMI));
147       char_symbols.put(new Integer JavaDoc(','), new Integer JavaDoc(sym.COMMA));
148       char_symbols.put(new Integer JavaDoc('*'), new Integer JavaDoc(sym.STAR));
149       char_symbols.put(new Integer JavaDoc('.'), new Integer JavaDoc(sym.DOT));
150       char_symbols.put(new Integer JavaDoc('|'), new Integer JavaDoc(sym.BAR));
151       char_symbols.put(new Integer JavaDoc('['), new Integer JavaDoc(sym.LBRACK));
152       char_symbols.put(new Integer JavaDoc(']'), new Integer JavaDoc(sym.RBRACK));
153
154       /* read two characters of lookahead */
155       next_char = Main.in.read();
156       if (next_char == EOF_CHAR) {
157     next_char2 = EOF_CHAR;
158         next_char3 = EOF_CHAR;
159         next_char4 = EOF_CHAR;
160       } else {
161     next_char2 = Main.in.read();
162     if (next_char2 == EOF_CHAR) {
163       next_char3 = EOF_CHAR;
164       next_char4 = EOF_CHAR;
165     } else {
166       next_char3 = Main.in.read();
167       if (next_char3 == EOF_CHAR) {
168         next_char4 = EOF_CHAR;
169       } else {
170         next_char4 = Main.in.read();
171       }
172     }
173       }
174     }
175
176   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
177
178   /** Advance the scanner one character in the input stream. This moves
179    * next_char2 to next_char and then reads a new next_char2.
180    */

181   protected static void advance() throws java.io.IOException JavaDoc
182     {
183       int old_char;
184
185       old_char = next_char;
186       next_char = next_char2;
187       if (next_char == EOF_CHAR) {
188     next_char2 = EOF_CHAR;
189         next_char3 = EOF_CHAR;
190     next_char4 = EOF_CHAR;
191       } else {
192     next_char2 = next_char3;
193     if (next_char2 == EOF_CHAR) {
194       next_char3 = EOF_CHAR;
195       next_char4 = EOF_CHAR;
196     } else {
197       next_char3 = next_char4;
198       if (next_char3 == EOF_CHAR) {
199         next_char4 = EOF_CHAR;
200       } else {
201         next_char4 = Main.in.read();
202       }
203     }
204       }
205
206       /* count this */
207       absolute_position++;
208       current_position++;
209       if (old_char == '\n' || (old_char == '\r' && next_char!='\n'))
210     {
211       current_line++;
212       current_position = 1;
213     }
214     }
215
216   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
217
218   /** Emit an error message. The message will be marked with both the
219    * current line number and the position in the line. Error messages
220    * are printed on standard error (System.err).
221    * @param message the message to print.
222    */

223   public static void emit_error(String JavaDoc message)
224     {
225       System.err.println(Main.inFileName + ":" + current_line + ":" + current_position +
226              ": Error: JavaCUP: " + message);
227       error_count++;
228     }
229
230   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
231
232   /** Emit a warning message. The message will be marked with both the
233    * current line number and the position in the line. Messages are
234    * printed on standard error (System.err).
235    * @param message the message to print.
236    */

237   public static void emit_warn(String JavaDoc message)
238     {
239       System.err.println(Main.inFileName + ":" + current_line + ":" + current_position +
240              ": Warning: JavaCUP: " + message);
241       warning_count++;
242     }
243
244   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
245
246   /** Determine if a character is ok to start an id.
247    * @param ch the character in question.
248    */

249   protected static boolean id_start_char(int ch)
250     {
251       /* allow for % in identifiers. a hack to allow my
252      %prec in. Should eventually make lex spec for this
253      frankf */

254       return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
255          (ch == '_');
256
257       // later need to deal with non-8-bit chars here
258
}
259
260   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
261
262   /** Determine if a character is ok for the middle of an id.
263    * @param ch the character in question.
264    */

265   protected static boolean id_char(int ch)
266     {
267       return id_start_char(ch) || (ch >= '0' && ch <= '9');
268     }
269
270   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
271
272   /** Try to look up a single character symbol, returns -1 for not found.
273    * @param ch the character in question.
274    */

275   protected static int find_single_char(int ch)
276     {
277       Integer JavaDoc result;
278
279       result = (Integer JavaDoc)char_symbols.get(new Integer JavaDoc((char)ch));
280       if (result == null)
281     return -1;
282       else
283     return result.intValue();
284     }
285
286   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
287
288   /** Handle swallowing up a comment. Both old style C and new style C++
289    * comments are handled.
290    */

291   protected static void swallow_comment() throws java.io.IOException JavaDoc
292     {
293       /* next_char == '/' at this point */
294
295       /* is it a traditional comment */
296       if (next_char2 == '*')
297     {
298       /* swallow the opener */
299       advance(); advance();
300
301       /* swallow the comment until end of comment or EOF */
302       for (;;)
303         {
304           /* if its EOF we have an error */
305           if (next_char == EOF_CHAR)
306         {
307           emit_error("Specification file ends inside a comment");
308           return;
309         }
310
311           /* if we can see the closer we are done */
312           if (next_char == '*' && next_char2 == '/')
313         {
314           advance();
315           advance();
316           return;
317         }
318
319           /* otherwise swallow char and move on */
320           advance();
321         }
322     }
323
324       /* is its a new style comment */
325       if (next_char2 == '/')
326     {
327       /* swallow the opener */
328       advance(); advance();
329
330       /* swallow to '\n', '\r', '\f', or EOF */
331       while (next_char != '\n' && next_char != '\r' &&
332          next_char != '\f' && next_char!=EOF_CHAR)
333         advance();
334
335       return;
336
337     }
338
339       /* shouldn't get here, but... if we get here we have an error */
340       emit_error("Malformed comment in specification -- ignored");
341       advance();
342     }
343
344   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
345
346   /** Swallow up a code string. Code strings begin with "{:" and include
347       all characters up to the first occurrence of ":}" (there is no way to
348       include ":}" inside a code string). The routine returns a String
349       object suitable for return by the scanner.
350    */

351   protected static Symbol do_code_string() throws java.io.IOException JavaDoc
352     {
353       StringBuffer JavaDoc result = new StringBuffer JavaDoc();
354
355       /* at this point we have lookahead of "{:" -- swallow that */
356       advance(); advance();
357
358       /* save chars until we see ":}" */
359       while (!(next_char == ':' && next_char2 == '}'))
360     {
361       /* if we have run off the end issue a message and break out of loop */
362       if (next_char == EOF_CHAR)
363         {
364           emit_error("Specification file ends inside a code string");
365           break;
366         }
367
368       /* otherwise record the char and move on */
369       result.append(new Character JavaDoc((char)next_char));
370       advance();
371     }
372
373       /* advance past the closer and build a return Symbol */
374       advance(); advance();
375       return new Symbol(sym.CODE_STRING, result.toString());
376     }
377
378   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
379
380   /** Process an identifier. Identifiers begin with a letter, underscore,
381    * or dollar sign, which is followed by zero or more letters, numbers,
382    * underscores or dollar signs. This routine returns a String suitable
383    * for return by the scanner.
384    */

385   protected static Symbol do_id() throws java.io.IOException JavaDoc
386     {
387       StringBuffer JavaDoc result = new StringBuffer JavaDoc();
388       String JavaDoc result_str;
389       Integer JavaDoc keyword_num;
390       char buffer[] = new char[1];
391
392       /* next_char holds first character of id */
393       buffer[0] = (char)next_char;
394       result.append(buffer,0,1);
395       advance();
396
397       /* collect up characters while they fit in id */
398       while(id_char(next_char))
399     {
400           buffer[0] = (char)next_char;
401       result.append(buffer,0,1);
402       advance();
403     }
404
405       /* extract a string and try to look it up as a keyword */
406       result_str = result.toString();
407       keyword_num = (Integer JavaDoc)keywords.get(result_str);
408
409       /* if we found something, return that keyword */
410       if (keyword_num != null)
411     return new Symbol(keyword_num.intValue());
412
413       /* otherwise build and return an id Symbol with an attached string */
414       return new Symbol(sym.ID, result_str);
415     }
416
417   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
418
419   /** Return one Symbol. This is the main external interface to the scanner.
420    * It consumes sufficient characters to determine the next input Symbol
421    * and returns it. To help with debugging, this routine actually calls
422    * real_next_token() which does the work. If you need to debug the
423    * parser, this can be changed to call debug_next_token() which prints
424    * a debugging message before returning the Symbol.
425    */

426   public static Symbol next_token() throws java.io.IOException JavaDoc
427     {
428       return real_next_token();
429     }
430
431   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
432
433   /** Debugging version of next_token(). This routine calls the real scanning
434    * routine, prints a message on System.out indicating what the Symbol is,
435    * then returns it.
436    */

437   public static Symbol debug_next_token() throws java.io.IOException JavaDoc
438     {
439       Symbol result = real_next_token();
440       System.out.println("# next_Symbol() => " + result.sym);
441       return result;
442     }
443
444   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
445
446   /** The actual routine to return one Symbol. This is normally called from
447    * next_token(), but for debugging purposes can be called indirectly from
448    * debug_next_token().
449    */

450   protected static Symbol real_next_token() throws java.io.IOException JavaDoc
451     {
452       int sym_num;
453
454       for (;;)
455     {
456       /* look for white space */
457       if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
458           next_char == '\f' || next_char == '\r')
459         {
460           /* advance past it and try the next character */
461           advance();
462           continue;
463         }
464
465       /* look for a single character symbol */
466       sym_num = find_single_char(next_char);
467       if (sym_num != -1)
468         {
469           /* found one -- advance past it and return a Symbol for it */
470           advance();
471           return new Symbol(sym_num);
472         }
473
474       /* look for : or ::= */
475       if (next_char == ':')
476         {
477           /* if we don't have a second ':' return COLON */
478           if (next_char2 != ':')
479         {
480           advance();
481           return new Symbol(sym.COLON);
482         }
483
484           /* move forward and look for the '=' */
485           advance();
486           if (next_char2 == '=')
487         {
488           advance(); advance();
489           return new Symbol(sym.COLON_COLON_EQUALS);
490         }
491           else
492         {
493           /* return just the colon (already consumed) */
494           return new Symbol(sym.COLON);
495         }
496         }
497
498       /* find a "%prec" string and return it. otherwise, a '%' was found,
499          which has no right being in the specification otherwise */

500       if (next_char == '%') {
501         advance();
502         if ((next_char == 'p') && (next_char2 == 'r') && (next_char3 == 'e') &&
503         (next_char4 == 'c')) {
504           advance();
505           advance();
506           advance();
507           advance();
508           return new Symbol(sym.PERCENT_PREC);
509         } else {
510           emit_error("Found extraneous percent sign");
511         }
512       }
513
514       /* look for a comment */
515       if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
516         {
517           /* swallow then continue the scan */
518           swallow_comment();
519           continue;
520         }
521
522       /* look for start of code string */
523       if (next_char == '{' && next_char2 == ':')
524         return do_code_string();
525
526       /* look for an id or keyword */
527       if (id_start_char(next_char)) return do_id();
528
529       /* look for EOF */
530       if (next_char == EOF_CHAR) return new Symbol(sym.EOF);
531
532       /* if we get here, we have an unrecognized character */
533       emit_warn("Unrecognized character '" +
534         new Character JavaDoc((char)next_char) + "'(" + next_char +
535         ") -- ignored");
536
537       /* advance past it */
538       advance();
539     }
540     }
541
542   /*-----------------------------------------------------------*/
543 }
544
545
Popular Tags