KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > beaver > Parser


1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  * This file is part of Beaver Parser Generator. *
3  * Copyright (C) 2003,2004 Alexander Demenchuk <alder@softanvil.com>. *
4  * All rights reserved. *
5  * See the file "LICENSE" for the terms and conditions for copying, *
6  * distribution and modification of Beaver. *
7  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

8
9 package beaver;
10
11 import java.io.IOException JavaDoc;
12
13 /**
14  * Almost complete implementation of a LALR parser. Two components that it lacks to parse a concrete
15  * grammar -- rule actions and parsing tables -- are provided by a generated subclass.
16  */

17 public abstract class Parser
18 {
19     static public class Exception extends java.lang.Exception JavaDoc
20     {
21         Exception(String JavaDoc msg)
22         {
23             super(msg);
24         }
25     }
26     
27     /**
28      * This class "lists" reportable events that might happen during parsing.
29      */

30     static public class Events
31     {
32         public void scannerError(Scanner.Exception e)
33         {
34             System.err.print("Scanner Error:");
35             if (e.line > 0)
36             {
37                 System.err.print(e.line);
38                 System.err.print(',');
39                 System.err.print(e.column);
40                 System.err.print(':');
41             }
42             System.err.print(' ');
43             System.err.println(e.getMessage());
44         }
45         public void syntaxError(Symbol token)
46         {
47             System.err.print(':');
48             System.err.print(Symbol.getLine(token.start));
49             System.err.print(',');
50             System.err.print(Symbol.getColumn(token.start));
51             System.err.print('-');
52             System.err.print(Symbol.getLine(token.end));
53             System.err.print(',');
54             System.err.print(Symbol.getColumn(token.end));
55             System.err.print(": Syntax Error: unexpected token ");
56             if (token.value != null)
57             {
58                 System.err.print('"');
59                 System.err.print(token.value);
60                 System.err.println('"');
61             }
62             else
63             {
64                 System.err.print('#');
65                 System.err.println(token.id);
66             }
67         }
68         public void unexpectedTokenRemoved(Symbol token)
69         {
70             System.err.print(':');
71             System.err.print(Symbol.getLine(token.start));
72             System.err.print(',');
73             System.err.print(Symbol.getColumn(token.start));
74             System.err.print('-');
75             System.err.print(Symbol.getLine(token.end));
76             System.err.print(',');
77             System.err.print(Symbol.getColumn(token.end));
78             System.err.print(": Recovered: removed unexpected token ");
79             if (token.value != null)
80             {
81                 System.err.print('"');
82                 System.err.print(token.value);
83                 System.err.println('"');
84             }
85             else
86             {
87                 System.err.print('#');
88                 System.err.println(token.id);
89             }
90         }
91         public void missingTokenInserted(Symbol token)
92         {
93             System.err.print(':');
94             System.err.print(Symbol.getLine(token.start));
95             System.err.print(',');
96             System.err.print(Symbol.getColumn(token.start));
97             System.err.print('-');
98             System.err.print(Symbol.getLine(token.end));
99             System.err.print(',');
100             System.err.print(Symbol.getColumn(token.end));
101             System.err.print(": Recovered: inserted missing token ");
102             if (token.value != null)
103             {
104                 System.err.print('"');
105                 System.err.print(token.value);
106                 System.err.println('"');
107             }
108             else
109             {
110                 System.err.print('#');
111                 System.err.println(token.id);
112             }
113         }
114         public void misspelledTokenReplaced(Symbol token)
115         {
116             System.err.print(':');
117             System.err.print(Symbol.getLine(token.start));
118             System.err.print(',');
119             System.err.print(Symbol.getColumn(token.start));
120             System.err.print('-');
121             System.err.print(Symbol.getLine(token.end));
122             System.err.print(',');
123             System.err.print(Symbol.getColumn(token.end));
124             System.err.print(": Recovered: replaced unexpected token with ");
125             if (token.value != null)
126             {
127                 System.err.print('"');
128                 System.err.print(token.value);
129                 System.err.println('"');
130             }
131             else
132             {
133                 System.err.print('#');
134                 System.err.println(token.id);
135             }
136         }
137         public void errorPhraseRemoved(Symbol error)
138         {
139             System.err.print(':');
140             System.err.print(Symbol.getLine(error.start));
141             System.err.print(',');
142             System.err.print(Symbol.getColumn(error.start));
143             System.err.print('-');
144             System.err.print(Symbol.getLine(error.end));
145             System.err.print(',');
146             System.err.print(Symbol.getColumn(error.end));
147             System.err.println(": Recovered: removed error phrase");
148         }
149     }
150     
151     /**
152      * This class wrapps a Scanner and provides a token "accumulator" for a parsing simulation.
153      * <p>If a source we parse does not have syntax errors the only way this warpper affects a
154      * parser is via an extra indirection when it delivers next token. When though parser needs
155      * to recover from a syntax error this wrapper accumulates tokens shifted by a forward parsing
156      * simulation and later feeds them to a recovered parser</p>
157      */

158     private class TokenStream
159     {
160         private Scanner scanner;
161         private Symbol[] buffer;
162         private int n_marked;
163         private int n_read;
164         private int n_written;
165         
166         TokenStream(Scanner scanner)
167         {
168             this.scanner = scanner;
169         }
170
171         TokenStream(Scanner scanner, Symbol first_symbol)
172         {
173             this(scanner);
174             mark(1);
175             buffer[0] = first_symbol;
176             n_written++;
177         }
178         
179         Symbol nextToken() throws IOException JavaDoc
180         {
181             if (buffer != null)
182             {
183                 if (n_read < n_written)
184                     return buffer[n_read++];
185                 
186                 if (n_written < n_marked)
187                 {
188                     n_read++;
189                     return buffer[n_written++] = readToken();
190                 }
191                 buffer = null;
192             }
193             return readToken();
194         }
195
196         /**
197          * Prepare a stream to accumulate tokens.
198          *
199          * @param size number of shifted tokens to accumulate
200          */

201         void mark(int size)
202         {
203             buffer = new Symbol[(n_marked = size) + 1];
204             n_read = n_written = 0;
205         }
206         
207         /**
208          * Prepare accumulated tokens to be reread by a next simulation run
209          * or by a recovered parser.
210          */

211         void reset()
212         {
213             n_read = 0;
214         }
215
216         /**
217          * Checks whether a simulation filled the token accumulator.
218          *
219          * @return true if accumulator is full
220          */

221         boolean isFull()
222         {
223             return n_read == n_marked;
224         }
225         
226         /**
227          * Insert two tokens at the beginning of a stream
228          *
229          * @param token to be inserted
230          */

231         void insert(Symbol t0, Symbol t1)
232         {
233             System.arraycopy(buffer, 0, buffer, 2, n_written);
234             buffer[0] = t0;
235             buffer[1] = t1;
236             n_written += 2;
237         }
238         
239         /**
240          * Removes a token from the accumulator.
241          *
242          * @param i index of a token in the accumulator.
243          * @return removed token
244          */

245         Symbol remove(int i)
246         {
247             Symbol token = buffer[i];
248             int last = n_written - 1;
249             while (i < last)
250             {
251                 buffer[i] = buffer[++i];
252             }
253             n_written = last;
254             return token;
255         }
256         
257         /**
258          * Reads next recognized token from the scanner. If scanner fails to recognize a token and
259          * throws an exception it will be reported via Parser.scannerError().
260          * <p>It is expected that scanner is capable of returning at least an EOF token after the
261          * exception.</p>
262          *
263          * @param src scanner
264          * @return next recognized token
265          * @throws IOException
266          * as thrown by a scanner
267          */

268         private Symbol readToken() throws IOException JavaDoc
269         {
270             while (true)
271             {
272                 try
273                 {
274                     return scanner.nextToken();
275                 }
276                 catch (Scanner.Exception e)
277                 {
278                     report.scannerError(e);
279                 }
280             }
281         }
282     }
283
284     /**
285      * Simulator is a stripped (of action code) version of a parser that will try to parse ahead
286      * token stream after a syntax error. The simulation is considered successful if 3 tokens were
287      * shifted successfully. If during simulation this parser enconters an error it drops the first
288      * token it tried to use and restarts the simulated parsing.
289      * <p>
290      * Note: Without a special "error" rule present in a grammar, which a parser will try to shift
291      * at the beginning of an error recovery, simulation continues without removing anything from
292      * the original states stack. This often will lead to cases when no parsing ahead will recover
293      * the parser from a syntax error.
294      * </p>
295      */

296     private class Simulator
297     {
298         private short[] states;
299         private int top, min_top;
300
301         boolean parse(TokenStream in) throws IOException JavaDoc
302         {
303             initStack();
304             do {
305                 Symbol token = in.nextToken();
306                 while (true)
307                 {
308                     short act = tables.findParserAction(states[top], token.id);
309                     if (act > 0)
310                     {
311                         shift(act);
312                         break;
313                     }
314                     else if (act == accept_action_id)
315                     {
316                         return true;
317                     }
318                     else if (act < 0)
319                     {
320                         short nt_id = reduce(~act);
321
322                         act = tables.findNextState(states[top], nt_id);
323                         if (act > 0)
324                             shift(act);
325                         else
326                             return act == accept_action_id;
327                     }
328                     else // act == 0, i.e. this is an error
329
{
330                         return false;
331                     }
332                 }
333             }
334             while (!in.isFull());
335             return true;
336         }
337
338         private void initStack() throws IOException JavaDoc
339         {
340             if (states == null || states.length < Parser.this.states.length)
341             {
342                 states = new short[Parser.this.states.length];
343                 min_top = 0;
344             }
345             System.arraycopy(Parser.this.states, min_top, states, min_top, (top = Parser.this.top) + 1);
346         }
347
348         private void increaseStackCapacity()
349         {
350             short[] new_states = new short[states.length * 2];
351             System.arraycopy(states, 0, new_states, 0, states.length);
352             states = new_states;
353         }
354
355         private void shift(short state)
356         {
357             if (++top == states.length)
358                 increaseStackCapacity();
359             states[top] = state;
360         }
361
362         private short reduce(int rule_id)
363         {
364             int rule_info = tables.rule_infos[rule_id];
365             int rhs_size = rule_info & 0xFFFF;
366             top -= rhs_size;
367             min_top = Math.min(min_top, top);
368             return (short) (rule_info >>> 16);
369         }
370     }
371
372     /** The automaton tables. */
373     private final ParsingTables tables;
374
375     /** Cached ID of the ACCEPT action. */
376     private final short accept_action_id;
377
378     /** The parser's stack. */
379     private short[] states;
380
381     /** Index of the stack's top element, i.e. it's = -1 when the stack is empty; */
382     private int top;
383
384     /** The stack of shifted symbols. */
385     protected Symbol[] _symbols;
386
387     /** Parsing events notification "gateway" */
388     protected Events report;
389     
390
391     protected Parser(ParsingTables tables)
392     {
393         this.tables = tables;
394         this.accept_action_id = (short) ~tables.rule_infos.length;
395         this.states = new short[256];
396     }
397
398     /**
399      * Parses a source and returns a semantic value of the accepted nonterminal
400      *
401      * @param source of tokens - a Scanner
402      * @return semantic value of the accepted nonterminal
403      */

404     public Object JavaDoc parse(Scanner source) throws IOException JavaDoc, Parser.Exception
405     {
406         init();
407         return parse(new TokenStream(source));
408     }
409     
410     /**
411      * Parses a source and returns a semantic value of the accepted nonterminal.
412      * Before parsing starts injects alternative goal marker into the source to
413      * indicate that an alternative goal should be matched.
414      *
415      * @param source of tokens - a Scanner
416      * @param alt_goal_marker_id ID of a token like symbol that will be used as a marker
417      * @return semantic value of the accepted nonterminal
418      */

419     public Object JavaDoc parse(Scanner source, short alt_goal_marker_id) throws IOException JavaDoc, Parser.Exception
420     {
421         init();
422         TokenStream in = new TokenStream(source, new Symbol(alt_goal_marker_id));
423         return parse(in);
424     }
425     
426     private Object JavaDoc parse(TokenStream in) throws IOException JavaDoc, Parser.Exception
427     {
428         while (true)
429         {
430             Symbol token = in.nextToken();
431             while (true)
432             {
433                 short act = tables.findParserAction(states[top], token.id);
434                 if (act > 0)
435                 {
436                     shift(token, act);
437                     break;
438                 }
439                 else if (act == accept_action_id)
440                 {
441                     Symbol goal = _symbols[top];
442                     _symbols = null; // drop this stack to prevent loitering
443
return goal.value;
444                 }
445                 else if (act < 0)
446                 {
447                     Symbol nt = reduce(~act);
448                     act = tables.findNextState(states[top], nt.id);
449                     if (act > 0)
450                     {
451                         shift(nt, act);
452                     }
453                     else if (act == accept_action_id)
454                     {
455                         _symbols = null; // no loitering
456
return nt.value;
457                     }
458                     else
459                     {
460                         throw new IllegalStateException JavaDoc("Cannot shift a nonterminal");
461                     }
462                 }
463                 else // act == 0, i.e. this is an error
464
{
465                     report.syntaxError(token);
466                     recoverFromError(token, in);
467                     break; // because error recovery altered token stream - parser needs to refetch the next token
468
}
469             }
470         }
471     }
472
473     /**
474      * Invoke actual reduce action routine.
475      * Method must be implemented by a generated parser
476      *
477      * @param rule_num ID of a reduce action routine to invoke
478      * @param offset to the symbol before first action routine argument
479      * @return reduced nonterminal
480      */

481     protected abstract Symbol invokeReduceAction(int rule_num, int offset);
482
483     /**
484      * Performs stacks and, if not initialized yet, reduce actions array initialization.
485      */

486     private void init()
487     {
488         if (report == null) report = new Events();
489         
490         _symbols = new Symbol[states.length];
491         top = 0; // i.e. it's not empty
492
_symbols[top] = new Symbol("none"); // need a symbol here for a default reduce on the very first erroneous token
493
states[top] = 1; // initial/first state
494
}
495
496     /**
497      * Increases the stack capacity if it has no room for new entries.
498      */

499     private void increaseStackCapacity()
500     {
501         short[] new_states = new short[states.length * 2];
502         System.arraycopy(states, 0, new_states, 0, states.length);
503         states = new_states;
504
505         Symbol[] new_stack = new Symbol[states.length];
506         System.arraycopy(_symbols, 0, new_stack, 0, _symbols.length);
507         _symbols = new_stack;
508     }
509
510     /**
511      * Shift a symbol to stack and go to a new state
512      *
513      * @param sym
514      * symbol that will be shifted
515      * @param goto_state
516      * to switch to
517      */

518     private void shift(Symbol sym, short goto_state)
519     {
520         if (++top == states.length)
521             increaseStackCapacity();
522         _symbols[top] = sym;
523         states[top] = goto_state;
524     }
525
526     /**
527      * Perform a reduce action.
528      *
529      * @param rule_id
530      * Number of the production by which to reduce
531      * @return nonterminal created by a reduction
532      */

533     private Symbol reduce(int rule_id)
534     {
535         int rule_info = tables.rule_infos[rule_id];
536         int rhs_size = rule_info & 0xFFFF;
537
538         top -= rhs_size;
539         Symbol lhs_sym = invokeReduceAction(rule_id, top);
540         lhs_sym.id = (short) (rule_info >>> 16);
541         if (rhs_size == 0)
542         {
543             lhs_sym.start = lhs_sym.end = _symbols[top].end;
544         }
545         else
546         {
547             lhs_sym.start = _symbols[top + 1].start;
548             lhs_sym.end = _symbols[top + rhs_size].end;
549         }
550         return lhs_sym;
551     }
552
553     /**
554      * Implements parsing error recovery. Tries several simple approches first, like deleting "bad" token
555      * or replacing the latter with one of the expected in his state (if possible). If simple methods did
556      * not work tries to do error phrase recovery.
557      *
558      * It is expected that normally descendand parsers do not need to alter this method. In same cases though
559      * they may want to override it if they need a different error recovery strategy.
560      *
561      * @param token a lookahead terminal symbol that messed parsing
562      * @param in token stream
563      * @throws IOException propagated from a scanner if it has issues with the source
564      * @throws Parser.Exception if Parser cannot recover
565      */

566     protected void recoverFromError(Symbol token, TokenStream in) throws IOException JavaDoc, Parser.Exception
567     {
568         if (token.id == 0) // end of input
569
throw new Parser.Exception("Cannot recover from the syntax error");
570         
571         Simulator sim = new Simulator();
572         in.mark(3);
573         if (sim.parse(in)) // just delete "token" from the stream
574
{
575             in.reset();
576             report.unexpectedTokenRemoved(token);
577             return;
578         }
579         short current_state = states[top];
580         if (!tables.compressed) // then try other simple recoveries
581
{
582             short first_term_id = tables.findFirstTerminal(current_state);
583             if (first_term_id >= 0)
584             {
585                 Symbol term = new Symbol(first_term_id, _symbols[top].end, token.start);
586                 in.insert(term, token); // insert expected terminal before unexpected one
587
in.reset();
588                 if (sim.parse(in))
589                 {
590                     in.reset();
591                     report.missingTokenInserted(term);
592                     return;
593                 }
594                 
595                 int offset = tables.actn_offsets[current_state];
596                 
597                 for (short term_id = (short) (first_term_id + 1); term_id < tables.n_term; term_id++)
598                 {
599                     int index = offset + term_id;
600                     if (index >= tables.lookaheads.length)
601                         break;
602                     if (tables.lookaheads[index] == term_id)
603                     {
604                         term.id = term_id;
605                         in.reset();
606                         if (sim.parse(in))
607                         {
608                             in.reset();
609                             report.missingTokenInserted(term);
610                             return;
611                         }
612                     }
613                 }
614                 in.remove(1); // unexpected token, i.e. alter stream as if we replaced
615
// an unexpected token to an expected terminal
616
term.start = token.start;
617                 term.end = token.end;
618                 
619                 for (short term_id = first_term_id; term_id < tables.n_term; term_id++)
620                 {
621                     int index = offset + term_id;
622                     if (index >= tables.lookaheads.length)
623                         break;
624                     if (tables.lookaheads[index] == term_id)
625                     {
626                         term.id = term_id;
627                         in.reset();
628                         if (sim.parse(in))
629                         {
630                             in.reset();
631                             report.misspelledTokenReplaced(term);
632                             return;
633                         }
634                     }
635                 }
636                 in.remove(0); // simple recoveries failed - remove all stream changes
637
}
638         }
639         // Simple recoveries failed or are not applicable. Next step is an error phrase recovery.
640
/*
641          * Find a state where parser can shift "error" symbol. Discard already reduced (and shifted)
642          * productions, which are part of a phrase where unexpected terminal is found. (Note that if
643          * "error" symbol was not used by a grammar, in the end the entire input becomes an error phrase,
644          * and ... parser won't recover from it :)
645          */

646         Symbol first_sym = token, last_sym = token;
647         short goto_state;
648         while ((goto_state = tables.findNextState(states[top], tables.error_symbol_id)) <= 0)
649         {
650             // parser cannot shift "error" in this state, so use the top symbol
651
// as the leftmost symbol of an error phrase
652
first_sym = _symbols[top];
653             // and go to the previous state
654
if (--top < 0)
655                 throw new Parser.Exception("Cannot recover from the syntax error");
656         }
657         Symbol error = new Symbol(tables.error_symbol_id, first_sym.start, last_sym.end); // the end is temporary
658
shift(error, goto_state);
659
660         in.reset();
661         while (!sim.parse(in))
662         {
663             last_sym = in.remove(0);
664             if (last_sym.id == 0) // EOF
665
throw new Parser.Exception("Cannot recover from the syntax error");
666             in.reset();
667         }
668         error.end = last_sym.end;
669         in.reset();
670         report.errorPhraseRemoved(error);
671     }
672 }
Popular Tags