Parser


1   /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2    * This file is part of Beaver Parser Generator.                       *
3    * Copyright (C) 2003,2004 Alexander Demenchuk <alder@softanvil.com>.  *
4    * All rights reserved.                                                *
5    * See the file "LICENSE" for the terms and conditions for copying,    *
6    * distribution and modification of Beaver.                            *
7    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
8   
9   package beaver;
10  
11  import java.io.IOException  ;
12  
13  /**
14   * Almost complete implementation of a LALR parser. Two components that it lacks to parse a concrete
15   * grammar -- rule actions and parsing tables -- are provided by a generated subclass.
16   */
17  public abstract class Parser
18  {
19      static public class Exception extends java.lang.Exception  
20      {
21          Exception(String   msg)
22          {
23              super(msg);
24          }
25      }
26      
27      /**
28       * This class "lists" reportable events that might happen during parsing.
29       */
30      static public class Events
31      {
32          public void scannerError(Scanner.Exception e)
33          {
34              System.err.print("Scanner Error:");
35              if (e.line > 0)
36              {
37                  System.err.print(e.line);
38                  System.err.print(',');
39                  System.err.print(e.column);
40                  System.err.print(':');
41              }
42              System.err.print(' ');
43              System.err.println(e.getMessage());
44          }
45          public void syntaxError(Symbol token)
46          {
47              System.err.print(':');
48              System.err.print(Symbol.getLine(token.start));
49              System.err.print(',');
50              System.err.print(Symbol.getColumn(token.start));
51              System.err.print('-');
52              System.err.print(Symbol.getLine(token.end));
53              System.err.print(',');
54              System.err.print(Symbol.getColumn(token.end));
55              System.err.print(": Syntax Error: unexpected token ");
56              if (token.value != null)
57              {
58                  System.err.print('"');
59                  System.err.print(token.value);
60                  System.err.println('"');
61              }
62              else
63              {
64                  System.err.print('#');
65                  System.err.println(token.id);
66              }
67          }
68          public void unexpectedTokenRemoved(Symbol token)
69          {
70              System.err.print(':');
71              System.err.print(Symbol.getLine(token.start));
72              System.err.print(',');
73              System.err.print(Symbol.getColumn(token.start));
74              System.err.print('-');
75              System.err.print(Symbol.getLine(token.end));
76              System.err.print(',');
77              System.err.print(Symbol.getColumn(token.end));
78              System.err.print(": Recovered: removed unexpected token ");
79              if (token.value != null)
80              {
81                  System.err.print('"');
82                  System.err.print(token.value);
83                  System.err.println('"');
84              }
85              else
86              {
87                  System.err.print('#');
88                  System.err.println(token.id);
89              }
90          }
91          public void missingTokenInserted(Symbol token)
92          {
93              System.err.print(':');
94              System.err.print(Symbol.getLine(token.start));
95              System.err.print(',');
96              System.err.print(Symbol.getColumn(token.start));
97              System.err.print('-');
98              System.err.print(Symbol.getLine(token.end));
99              System.err.print(',');
100             System.err.print(Symbol.getColumn(token.end));
101             System.err.print(": Recovered: inserted missing token ");
102             if (token.value != null)
103             {
104                 System.err.print('"');
105                 System.err.print(token.value);
106                 System.err.println('"');
107             }
108             else
109             {
110                 System.err.print('#');
111                 System.err.println(token.id);
112             }
113         }
114         public void misspelledTokenReplaced(Symbol token)
115         {
116             System.err.print(':');
117             System.err.print(Symbol.getLine(token.start));
118             System.err.print(',');
119             System.err.print(Symbol.getColumn(token.start));
120             System.err.print('-');
121             System.err.print(Symbol.getLine(token.end));
122             System.err.print(',');
123             System.err.print(Symbol.getColumn(token.end));
124             System.err.print(": Recovered: replaced unexpected token with ");
125             if (token.value != null)
126             {
127                 System.err.print('"');
128                 System.err.print(token.value);
129                 System.err.println('"');
130             }
131             else
132             {
133                 System.err.print('#');
134                 System.err.println(token.id);
135             }
136         }
137         public void errorPhraseRemoved(Symbol error)
138         {
139             System.err.print(':');
140             System.err.print(Symbol.getLine(error.start));
141             System.err.print(',');
142             System.err.print(Symbol.getColumn(error.start));
143             System.err.print('-');
144             System.err.print(Symbol.getLine(error.end));
145             System.err.print(',');
146             System.err.print(Symbol.getColumn(error.end));
147             System.err.println(": Recovered: removed error phrase");
148         }
149     }
150     
151     /**
152      * This class wrapps a Scanner and provides a token "accumulator" for a parsing simulation.
153      * <p>If a source we parse does not have syntax errors the only way this warpper affects a 
154      * parser is via an extra indirection when it delivers next token. When though parser needs
155      * to recover from a syntax error this wrapper accumulates tokens shifted by a forward parsing
156      * simulation and later feeds them to a recovered parser</p>
157      */
158     private class TokenStream
159     {
160         private Scanner  scanner;
161         private Symbol[] buffer;
162         private int      n_marked;
163         private int      n_read;
164         private int      n_written;
165         
166         TokenStream(Scanner scanner)
167         {
168             this.scanner = scanner;
169         }
170 
171         TokenStream(Scanner scanner, Symbol first_symbol)
172         {
173             this(scanner);
174             mark(1);
175             buffer[0] = first_symbol;
176             n_written++;
177         }
178         
179         Symbol nextToken() throws IOException  
180         {
181             if (buffer != null)
182             {               
183                 if (n_read < n_written)
184                     return buffer[n_read++];
185                 
186                 if (n_written < n_marked)
187                 {
188                     n_read++;
189                     return buffer[n_written++] = readToken();
190                 }
191                 buffer = null;
192             }
193             return readToken();
194         }
195 
196         /**
197          * Prepare a stream to accumulate tokens.
198          * 
199          * @param size number of shifted tokens to accumulate
200          */
201         void mark(int size)
202         {
203             buffer = new Symbol[(n_marked = size) + 1];
204             n_read = n_written = 0;
205         }
206         
207         /**
208          * Prepare accumulated tokens to be reread by a next simulation run
209          * or by a recovered parser.
210          */
211         void reset()
212         {
213             n_read = 0;
214         }
215 
216         /**
217          * Checks whether a simulation filled the token accumulator. 
218          * 
219          * @return true if accumulator is full
220          */
221         boolean isFull()
222         {
223             return n_read == n_marked;
224         }
225         
226         /**
227          * Insert two tokens at the beginning of a stream 
228          * 
229          * @param token to be inserted
230          */
231         void insert(Symbol t0, Symbol t1)
232         {
233             System.arraycopy(buffer, 0, buffer, 2, n_written);
234             buffer[0] = t0;
235             buffer[1] = t1;
236             n_written += 2;
237         }
238         
239         /**
240          * Removes a token from the accumulator.
241          * 
242          * @param i index of a token in the accumulator.
243          * @return removed token
244          */
245         Symbol remove(int i)
246         {
247             Symbol token = buffer[i];
248             int last = n_written - 1;
249             while (i < last)
250             {
251                 buffer[i] = buffer[++i];
252             }
253             n_written = last;
254             return token;
255         }
256         
257         /**
258          * Reads next recognized token from the scanner. If scanner fails to recognize a token and
259          * throws an exception it will be reported via Parser.scannerError().
260          * <p>It is expected that scanner is capable of returning at least an EOF token after the
261          * exception.</p>
262          * 
263          * @param src scanner
264          * @return next recognized token
265          * @throws IOException
266          *             as thrown by a scanner
267          */
268         private Symbol readToken() throws IOException  
269         {
270             while (true)
271             {
272                 try
273                 {
274                     return scanner.nextToken();
275                 }
276                 catch (Scanner.Exception e)
277                 {
278                     report.scannerError(e);
279                 }
280             }
281         }
282     }
283 
284     /**
285      * Simulator is a stripped (of action code) version of a parser that will try to parse ahead
286      * token stream after a syntax error. The simulation is considered successful if 3 tokens were
287      * shifted successfully. If during simulation this parser enconters an error it drops the first
288      * token it tried to use and restarts the simulated parsing.
289      * <p>
290      * Note: Without a special "error" rule present in a grammar, which a parser will try to shift
291      * at the beginning of an error recovery, simulation continues without removing anything from
292      * the original states stack. This often will lead to cases when no parsing ahead will recover
293      * the parser from a syntax error.
294      * </p>
295      */
296     private class Simulator
297     {
298         private short[] states;
299         private int top, min_top;
300 
301         boolean parse(TokenStream in) throws IOException  
302         {
303             initStack();
304             do {
305                 Symbol token = in.nextToken();
306                 while (true)
307                 {
308                     short act = tables.findParserAction(states[top], token.id);
309                     if (act > 0)
310                     {
311                         shift(act);
312                         break;
313                     }
314                     else if (act == accept_action_id)
315                     {
316                         return true;
317                     }
318                     else if (act < 0)
319                     {
320                         short nt_id = reduce(~act);
321 
322                         act = tables.findNextState(states[top], nt_id);
323                         if (act > 0)
324                             shift(act);
325                         else
326                             return act == accept_action_id;
327                     }
328                     else // act == 0, i.e. this is an error
329                     {
330                         return false;
331                     }
332                 }
333             }
334             while (!in.isFull());
335             return true;
336         }
337 
338         private void initStack() throws IOException  
339         {
340             if (states == null || states.length < Parser.this.states.length)
341             {
342                 states = new short[Parser.this.states.length];
343                 min_top = 0;
344             }
345             System.arraycopy(Parser.this.states, min_top, states, min_top, (top = Parser.this.top) + 1);
346         }
347 
348         private void increaseStackCapacity()
349         {
350             short[] new_states = new short[states.length * 2];
351             System.arraycopy(states, 0, new_states, 0, states.length);
352             states = new_states;
353         }
354 
355         private void shift(short state)
356         {
357             if (++top == states.length)
358                 increaseStackCapacity();
359             states[top] = state;
360         }
361 
362         private short reduce(int rule_id)
363         {
364             int rule_info = tables.rule_infos[rule_id];
365             int rhs_size = rule_info & 0xFFFF;
366             top -= rhs_size;
367             min_top = Math.min(min_top, top);
368             return (short) (rule_info >>> 16);
369         }
370     }
371 
372     /** The automaton tables. */
373     private final ParsingTables tables;
374 
375     /** Cached ID of the ACCEPT action. */
376     private final short accept_action_id;
377 
378     /** The parser's stack. */
379     private short[] states;
380 
381     /** Index of the stack's top element, i.e. it's = -1 when the stack is empty; */
382     private int top;
383 
384     /** The stack of shifted symbols. */
385     protected Symbol[] _symbols;
386 
387     /** Parsing events notification "gateway" */
388     protected Events report;
389     
390 
391     protected Parser(ParsingTables tables)
392     {
393         this.tables = tables;
394         this.accept_action_id = (short) ~tables.rule_infos.length;
395         this.states = new short[256];
396     }
397 
398     /**
399      * Parses a source and returns a semantic value of the accepted nonterminal
400      * 
401      * @param source of tokens - a Scanner
402      * @return semantic value of the accepted nonterminal
403      */
404     public Object   parse(Scanner source) throws IOException  , Parser.Exception
405     {
406         init();
407         return parse(new TokenStream(source));
408     }
409     
410     /**
411      * Parses a source and returns a semantic value of the accepted nonterminal.
412      * Before parsing starts injects alternative goal marker into the source to
413      * indicate that an alternative goal should be matched.
414      * 
415      * @param source of tokens - a Scanner
416      * @param alt_goal_marker_id ID of a token like symbol that will be used as a marker
417      * @return semantic value of the accepted nonterminal
418      */
419     public Object   parse(Scanner source, short alt_goal_marker_id) throws IOException  , Parser.Exception
420     {
421         init();
422         TokenStream in = new TokenStream(source, new Symbol(alt_goal_marker_id));
423         return parse(in);
424     }
425     
426     private Object   parse(TokenStream in) throws IOException  , Parser.Exception
427     {
428         while (true)
429         {
430             Symbol token = in.nextToken();
431             while (true)
432             {
433                 short act = tables.findParserAction(states[top], token.id);
434                 if (act > 0)
435                 {
436                     shift(token, act);
437                     break;
438                 }
439                 else if (act == accept_action_id)
440                 {
441                     Symbol goal = _symbols[top];
442                     _symbols = null; // drop this stack to prevent loitering
443                     return goal.value;
444                 }
445                 else if (act < 0)
446                 {
447                     Symbol nt = reduce(~act);
448                     act = tables.findNextState(states[top], nt.id);
449                     if (act > 0)
450                     {
451                         shift(nt, act);
452                     }
453                     else if (act == accept_action_id)
454                     {
455                         _symbols = null; // no loitering
456                         return nt.value;
457                     }
458                     else
459                     {
460                         throw new IllegalStateException  ("Cannot shift a nonterminal");
461                     }
462                 }
463                 else // act == 0, i.e. this is an error
464                 {
465                     report.syntaxError(token);
466                     recoverFromError(token, in);
467                     break; // because error recovery altered token stream - parser needs to refetch the next token
468                 }
469             }
470         }
471     }
472 
473     /**
474      * Invoke actual reduce action routine.
475      * Method must be implemented by a generated parser
476      * 
477      * @param rule_num ID of a reduce action routine to invoke
478      * @param offset to the symbol before first action routine argument
479      * @return reduced nonterminal
480      */
481     protected abstract Symbol invokeReduceAction(int rule_num, int offset);
482 
483     /**
484      * Performs stacks and, if not initialized yet, reduce actions array initialization.
485      */
486     private void init()
487     {
488         if (report == null) report = new Events();
489         
490         _symbols = new Symbol[states.length];
491         top = 0; // i.e. it's not empty
492         _symbols[top] = new Symbol("none"); // need a symbol here for a default reduce on the very first erroneous token  
493         states[top] = 1; // initial/first state
494     }
495 
496     /**
497      * Increases the stack capacity if it has no room for new entries.
498      */
499     private void increaseStackCapacity()
500     {
501         short[] new_states = new short[states.length * 2];
502         System.arraycopy(states, 0, new_states, 0, states.length);
503         states = new_states;
504 
505         Symbol[] new_stack = new Symbol[states.length];
506         System.arraycopy(_symbols, 0, new_stack, 0, _symbols.length);
507         _symbols = new_stack;
508     }
509 
510     /**
511      * Shift a symbol to stack and go to a new state 
512      * 
513      * @param sym
514      *            symbol that will be shifted
515      * @param goto_state
516      *            to switch to
517      */
518     private void shift(Symbol sym, short goto_state)
519     {
520         if (++top == states.length)
521             increaseStackCapacity();
522         _symbols[top] = sym;
523         states[top] = goto_state;
524     }
525 
526     /**
527      * Perform a reduce action.
528      * 
529      * @param rule_id
530      *            Number of the production by which to reduce
531      * @return nonterminal created by a reduction
532      */
533     private Symbol reduce(int rule_id)
534     {
535         int rule_info = tables.rule_infos[rule_id];
536         int rhs_size = rule_info & 0xFFFF;
537 
538         top -= rhs_size;
539         Symbol lhs_sym = invokeReduceAction(rule_id, top);
540         lhs_sym.id = (short) (rule_info >>> 16);
541         if (rhs_size == 0)
542         {
543             lhs_sym.start = lhs_sym.end = _symbols[top].end;
544         }
545         else
546         {
547             lhs_sym.start = _symbols[top + 1].start;
548             lhs_sym.end = _symbols[top + rhs_size].end;
549         }
550         return lhs_sym;
551     }
552 
553     /**
554      * Implements parsing error recovery. Tries several simple approches first, like deleting "bad" token
555      * or replacing the latter with one of the expected in his state (if possible). If simple methods did
556      * not work tries to do error phrase recovery.
557      * 
558      * It is expected that normally descendand parsers do not need to alter this method. In same cases though
559      * they may want to override it if they need a different error recovery strategy. 
560      * 
561      * @param token a lookahead terminal symbol that messed parsing 
562      * @param in token stream
563      * @throws IOException propagated from a scanner if it has issues with the source
564      * @throws Parser.Exception if Parser cannot recover
565      */
566     protected void recoverFromError(Symbol token, TokenStream in) throws IOException  , Parser.Exception
567     {
568         if (token.id == 0) // end of input
569             throw new Parser.Exception("Cannot recover from the syntax error");
570         
571         Simulator sim = new Simulator();
572         in.mark(3);
573         if (sim.parse(in)) // just delete "token" from the stream
574         {
575             in.reset();
576             report.unexpectedTokenRemoved(token);
577             return;
578         }
579         short current_state = states[top];
580         if (!tables.compressed) // then try other simple recoveries
581         {
582             short first_term_id = tables.findFirstTerminal(current_state);
583             if (first_term_id >= 0)
584             {
585                 Symbol term = new Symbol(first_term_id, _symbols[top].end, token.start);
586                 in.insert(term, token); // insert expected terminal before unexpected one
587                 in.reset();
588                 if (sim.parse(in))
589                 {
590                     in.reset();
591                     report.missingTokenInserted(term);
592                     return;
593                 }
594                 
595                 int offset = tables.actn_offsets[current_state];
596                 
597                 for (short term_id = (short) (first_term_id + 1); term_id < tables.n_term; term_id++)
598                 {
599                     int index = offset + term_id;
600                     if (index >= tables.lookaheads.length)
601                         break;
602                     if (tables.lookaheads[index] == term_id)
603                     {
604                         term.id = term_id;
605                         in.reset();
606                         if (sim.parse(in))
607                         {
608                             in.reset();
609                             report.missingTokenInserted(term);
610                             return;
611                         }
612                     }
613                 }
614                 in.remove(1); // unexpected token, i.e. alter stream as if we replaced 
615                               // an unexpected token to an expected terminal
616                 term.start = token.start;
617                 term.end = token.end;
618                 
619                 for (short term_id = first_term_id; term_id < tables.n_term; term_id++)
620                 {
621                     int index = offset + term_id;
622                     if (index >= tables.lookaheads.length)
623                         break;
624                     if (tables.lookaheads[index] == term_id)
625                     {
626                         term.id = term_id;
627                         in.reset();
628                         if (sim.parse(in))
629                         {
630                             in.reset();
631                             report.misspelledTokenReplaced(term);
632                             return;
633                         }
634                     }
635                 }
636                 in.remove(0); // simple recoveries failed - remove all stream changes 
637             }
638         }
639         // Simple recoveries failed or are not applicable. Next step is an error phrase recovery.
640         /*
641          * Find a state where parser can shift "error" symbol. Discard already reduced (and shifted)
642          * productions, which are part of a phrase where unexpected terminal is found. (Note that if
643          * "error" symbol was not used by a grammar, in the end the entire input becomes an error phrase,
644          * and ... parser won't recover from it :)
645          */
646         Symbol first_sym = token, last_sym = token;
647         short goto_state;
648         while ((goto_state = tables.findNextState(states[top], tables.error_symbol_id)) <= 0)
649         {
650             // parser cannot shift "error" in this state, so use the top symbol
651             // as the leftmost symbol of an error phrase
652             first_sym = _symbols[top];
653             // and go to the previous state
654             if (--top < 0)
655                 throw new Parser.Exception("Cannot recover from the syntax error");
656         }
657         Symbol error = new Symbol(tables.error_symbol_id, first_sym.start, last_sym.end); // the end is temporary
658         shift(error, goto_state);
659 
660         in.reset();
661         while (!sim.parse(in))
662         {
663             last_sym = in.remove(0);
664             if (last_sym.id == 0) // EOF
665                 throw new Parser.Exception("Cannot recover from the syntax error");
666             in.reset();
667         }
668         error.end = last_sym.end;
669         in.reset();
670         report.errorPhraseRemoved(error);
671     }
672 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags