CodeGenerator


1   package persistence.antlr;
2   
3   /* ANTLR Translator Generator
4    * Project led by Terence Parr at http://www.jGuru.com
5    * Software rights: http://www.antlr.org/license.html
6    *
7    */
8   
9   import java.io.PrintWriter  ;
10  import java.io.IOException  ;
11  import java.io.FileWriter  ;
12  
13  import persistence.antlr.collections.impl.Vector;
14  import persistence.antlr.collections.impl.BitSet;
15  
16  /** A generic ANTLR code generator.  All code generators
17   * derive from this class.
18   *
19   * <p>
20   * A CodeGenerator knows about a Grammar data structure and
21   * a grammar analyzer.  The Grammar is walked to generate the
22   * appropriate code for both a parser and lexer (if present).
23   * This interface may change slightly so that the lexer is
24   * itself living inside of a Grammar object (in which case,
25   * this class generates only one recognizer).  The main method
26   * to call is <tt>gen()</tt>, which initiates all code gen.
27   *
28   * <p>
29   * The interaction of the code generator with the analyzer is
30   * simple: each subrule block calls deterministic() before generating
31   * code for the block.  Method deterministic() sets lookahead caches
32   * in each Alternative object.  Technically, a code generator
33   * doesn't need the grammar analyzer if all lookahead analysis
34   * is done at runtime, but this would result in a slower parser.
35   *
36   * <p>
37   * This class provides a set of support utilities to handle argument
38   * list parsing and so on.
39   *
40   * @author  Terence Parr, John Lilley
41   * @version 2.00a
42   * @see     persistence.antlr.JavaCodeGenerator
43   * @see     persistence.antlr.DiagnosticCodeGenerator
44   * @see     persistence.antlr.LLkAnalyzer
45   * @see     persistence.antlr.Grammar
46   * @see     persistence.antlr.AlternativeElement
47   * @see     persistence.antlr.Lookahead
48   */
49  public abstract class CodeGenerator {
50      protected persistence.antlr.Tool antlrTool;
51  
52      /** Current tab indentation level (number of tabs) for code output */
53      protected int tabs = 0;
54  
55      /** Current output stream; all print helpers write to this writer */
56      transient protected PrintWriter   currentOutput; // SAS: for proper text i/o
57  
58      /** The grammar for which we generate code */
59      protected Grammar grammar = null;
60  
61      /** List of all bitsets that must be dumped.  This is a Vector of BitSet. */
62      protected Vector bitsetsUsed;
63  
64      /** The grammar behavior */
65      protected DefineGrammarSymbols behavior;
66  
67      /** The LLk analyzer */
68      protected LLkGrammarAnalyzer analyzer;
69  
70      /** Object used to format characters in the target language.
71       * Subclasses must initialize this to the language-specific formatter.
72       */
73      protected CharFormatter charFormatter;
74  
75      /** Use option "codeGenDebug" to generate debugging output */
76      protected boolean DEBUG_CODE_GENERATOR = false;
77  
78      /** Default values for code-generation thresholds */
79      protected static final int DEFAULT_MAKE_SWITCH_THRESHOLD = 2;
80      protected static final int DEFAULT_BITSET_TEST_THRESHOLD = 4;
81  
82      /** If there are more than 8 long words to init in a bitset,
83       *  try to optimize it; e.g., detect runs of -1L and 0L.
84       */
85      protected static final int BITSET_OPTIMIZE_INIT_THRESHOLD = 8;
86  
87      /** This is a hint for the language-specific code generator.
88       * A switch() or language-specific equivalent will be generated instead
89       * of a series of if/else statements for blocks with number of alternates
90       * greater than or equal to this number of non-predicated LL(1) alternates.
91       * This is modified by the grammar option "codeGenMakeSwitchThreshold"
92       */
93      protected int makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD;
94  
95      /** This is a hint for the language-specific code generator.
96       * A bitset membership test will be generated instead of an
97       * ORed series of LA(k) comparisons for lookahead sets with
98       * degree greater than or equal to this value.
99       * This is modified by the grammar option "codeGenBitsetTestThreshold"
100      */
101     protected int bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD;
102 
103     private static boolean OLD_ACTION_TRANSLATOR = true; // NOTE(review): never read within this class - confirm it is still needed
104 
105     public static String   TokenTypesFileSuffix = "TokenTypes";
106     public static String   TokenTypesFileExt = ".txt";
107 
108     /** Construct code generator base class; performs no initialization of its own */
109     public CodeGenerator() {
110     }
111 
112     /** Output a String to the currentOutput stream.
113      * Ignored if string is null.
114      * @param s The string to output
115      */
116     protected void _print(String   s) {
117         if (s != null) {
118             currentOutput.print(s);
119         }
120     }
121 
122     /** Print an action without leading tabs, attempting to
123      * preserve the current indentation level for multi-line actions
124      * Ignored if string is null.
125      * @param s The action string to output
126      */
127     protected void _printAction(String   s) {
128         if (s == null) {
129             return;
130         }
131 
132         // Skip leading newlines, tabs and spaces
133         int start = 0;
134         while (start < s.length() && Character.isSpaceChar(s.charAt(start))) {
135             start++;
136         }
137 
138         // Skip leading newlines, tabs and spaces
139         int end = s.length() - 1;
140         while (end > start && Character.isSpaceChar(s.charAt(end))) {
141             end--;
142         }
143 
144         char c = 0;
145         for (int i = start; i <= end;) {
146             c = s.charAt(i);
147             i++;
148             boolean newline = false;
149             switch (c) {
150                 case '\n':
151                     newline = true;
152                     break;
153                 case '\r':
154                     if (i <= end && s.charAt(i) == '\n') {
155                         i++;
156                     }
157                     newline = true;
158                     break;
159                 default:
160                     currentOutput.print(c);
161                     break;
162             }
163             if (newline) {
164                 currentOutput.println();
165                 printTabs();
166                 // Absorb leading whitespace
167                 while (i <= end && Character.isSpaceChar(s.charAt(i))) {
168                     i++;
169                 }
170                 newline = false;
171             }
172         }
173         currentOutput.println();
174     }
175 
176     /** Output a String followed by newline, to the currentOutput stream.
177      * Ignored if string is null.
178      * @param s The string to output
179      */
180     protected void _println(String   s) {
181         if (s != null) {
182             currentOutput.println(s);
183         }
184     }
185 
186     /** Test if a set element array represents a contiguous range.
187      * @param elems The array of elements representing the set, usually from BitSet.toArray().
188      * @return true if the elements are a contiguous range (with two or more).
189      */
190     public static boolean elementsAreRange(int[] elems) {
191         if (elems.length == 0) {
192             return false;
193         }
194         int begin = elems[0];
195         int end = elems[elems.length - 1];
196         if (elems.length <= 2) {
197             // Not enough elements for a range expression
198             return false;
199         }
200         if (end - begin + 1 > elems.length) {
201             // The set does not represent a contiguous range
202             return false;
203         }
204         int v = begin + 1;
205         for (int i = 1; i < elems.length - 1; i++) {
206             if (v != elems[i]) {
207                 // The set does not represent a contiguous range
208                 return false;
209             }
210             v++;
211         }
212         return true;
213     }
214 
215     /** Get the identifier portion of an argument-action token.
216      * The ID of an action is assumed to be a trailing identifier.
217      * Specific code-generators may want to override this
218      * if the language has unusual declaration syntax.
219      * @param t The action token
220      * @return A string containing the text of the identifier
221      */
222     protected String   extractIdOfAction(Token t) {
223         return extractIdOfAction(t.getText(), t.getLine(), t.getColumn());
224     }
225 
226     /** Get the identifier portion of an argument-action.
227      * The ID of an action is assumed to be a trailing identifier.
228      * Specific code-generators may want to override this
229      * if the language has unusual declaration syntax.
230      * @param s The action text
231      * @param line Line used for error reporting.
232      * @param column Line used for error reporting.
233      * @return A string containing the text of the identifier
234      */
235     protected String   extractIdOfAction(String   s, int line, int column) {
236         s = removeAssignmentFromDeclaration(s);
237         // Search back from the end for a non alphanumeric.  That marks the
238         // beginning of the identifier
239         for (int i = s.length() - 2; i >= 0; i--) {
240             // TODO: make this work for language-independent identifiers?
241             if (!Character.isLetterOrDigit(s.charAt(i)) && s.charAt(i) != '_') {
242                 // Found end of type part
243                 return s.substring(i + 1);
244             }
245         }
246         // Something is bogus, but we cannot parse the language-specific
247         // actions any better.  The compiler will have to catch the problem.
248         antlrTool.warning("Ill-formed action", grammar.getFilename(), line, column);
249         return "";
250     }
251 
252     /** Get the type string out of an argument-action token.
253      * The type of an action is assumed to precede a trailing identifier
254      * Specific code-generators may want to override this
255      * if the language has unusual declaration syntax.
256      * @param t The action token
257      * @return A string containing the text of the type
258      */
259     protected String   extractTypeOfAction(Token t) {
260         return extractTypeOfAction(t.getText(), t.getLine(), t.getColumn());
261     }
262 
263     /** Get the type portion of an argument-action.
264      * The type of an action is assumed to precede a trailing identifier
265      * Specific code-generators may want to override this
266      * if the language has unusual declaration syntax.
267      * @param s The action text
268      * @param line Line used for error reporting.
269      * @return A string containing the text of the type
270      */
271     protected String   extractTypeOfAction(String   s, int line, int column) {
272         s = removeAssignmentFromDeclaration(s);
273         // Search back from the end for a non alphanumeric.  That marks the
274         // beginning of the identifier
275         for (int i = s.length() - 2; i >= 0; i--) {
276             // TODO: make this work for language-independent identifiers?
277             if (!Character.isLetterOrDigit(s.charAt(i)) && s.charAt(i) != '_') {
278                 // Found end of type part
279                 return s.substring(0, i + 1);
280             }
281         }
282         // Something is bogus, but we cannot parse the language-specific
283         // actions any better.  The compiler will have to catch the problem.
284         antlrTool.warning("Ill-formed action", grammar.getFilename(), line, column);
285         return "";
286     }
287 
288     /** Generate the code for all grammars
289      */
290     public abstract void gen();
291 
292     /** Generate code for the given grammar element.
293      * @param action The {...} action to generate
294      */
295     public abstract void gen(ActionElement action);
296 
297     /** Generate code for the given grammar element.
298      * @param blk The "x|y|z|..." block to generate
299      */
300     public abstract void gen(AlternativeBlock blk);
301 
302     /** Generate code for the given grammar element.
303      * @param end The block-end element to generate.  Block-end
304      * elements are synthesized by the grammar parser to represent
305      * the end of a block.
306      */
307     public abstract void gen(BlockEndElement end);
308 
309     /** Generate code for the given grammar element.
310      * @param atom The character literal reference to generate
311      */
312     public abstract void gen(CharLiteralElement atom);
313 
314     /** Generate code for the given grammar element.
315      * @param r The character-range reference to generate
316      */
317     public abstract void gen(CharRangeElement r);
318 
319     /** Generate the code for a lexer */
320     public abstract void gen(LexerGrammar g) throws IOException  ;
321 
322     /** Generate code for the given grammar element.
323      * @param blk The (...)+ block to generate
324      */
325     public abstract void gen(OneOrMoreBlock blk);
326 
327     /** Generate the code for a parser */
328     public abstract void gen(ParserGrammar g) throws IOException  ;
329 
330     /** Generate code for the given grammar element.
331      * @param rr The rule-reference to generate
332      */
333     public abstract void gen(RuleRefElement rr);
334 
335     /** Generate code for the given grammar element.
336      * @param atom The string-literal reference to generate
337      */
338     public abstract void gen(StringLiteralElement atom);
339 
340     /** Generate code for the given grammar element.
341      * @param r The token-range reference to generate
342      */
343     public abstract void gen(TokenRangeElement r);
344 
345     /** Generate code for the given grammar element.
346      * @param atom The token-reference to generate
347      */
348     public abstract void gen(TokenRefElement atom);
349 
350     /** Generate code for the given grammar element.
351      * @param t The tree to generate code for.
352      */
353     public abstract void gen(TreeElement t);
354 
355     /** Generate the code for a tree walker */
356     public abstract void gen(TreeWalkerGrammar g) throws IOException  ;
357 
358     /** Generate code for the given grammar element.
359      * @param wc The wildcard element to generate
360      */
361     public abstract void gen(WildcardElement wc);
362 
363     /** Generate code for the given grammar element.
364      * @param blk The (...)* block to generate
365      */
366     public abstract void gen(ZeroOrMoreBlock blk);
367 
368     /** Generate the token types as a text file for persistence across shared lexer/parser */
369     protected void genTokenInterchange(TokenManager tm) throws IOException   {
370         // Open the token output Java file and set the currentOutput stream
371         String   fName = tm.getName() + TokenTypesFileSuffix + TokenTypesFileExt;
372         currentOutput = antlrTool.openOutputFile(fName);
373 
374         println("// $ANTLR " + antlrTool.version + ": " +
375                 antlrTool.fileMinusPath(antlrTool.grammarFile) +
376                 " -> " +
377                 fName +
378                 "$");
379 
380         tabs = 0;
381 
382         // Header
383         println(tm.getName() + "    // output token vocab name");
384 
385         // Generate a definition for each token type
386         Vector v = tm.getVocabulary();
387         for (int i = Token.MIN_USER_TYPE; i < v.size(); i++) {
388             String   s = (String  )v.elementAt(i);
389             if (DEBUG_CODE_GENERATOR) {
390                 System.out.println("gen persistence file entry for: " + s);
391             }
392             if (s != null && !s.startsWith("<")) {
393                 // if literal, find label
394                 if (s.startsWith("\"")) {
395                     StringLiteralSymbol sl = (StringLiteralSymbol)tm.getTokenSymbol(s);
396                     if (sl != null && sl.label != null) {
397                         print(sl.label + "=");
398                     }
399                     println(s + "=" + i);
400                 }
401                 else {
402                     print(s);
403                     // check for a paraphrase
404                     TokenSymbol ts = (TokenSymbol)tm.getTokenSymbol(s);
405                     if (ts == null) {
406                         antlrTool.warning("undefined token symbol: " + s);
407                     }
408                     else {
409                         if (ts.getParaphrase() != null) {
410                             print("(" + ts.getParaphrase() + ")");
411                         }
412                     }
413                     println("=" + i);
414                 }
415             }
416         }
417 
418         // Close the tokens output file
419         currentOutput.close();
420         currentOutput = null;
421     }
422 
423     /** Process a string for a simple expression for use in xx/action.g
424      * it is used to cast simple tokens/references to the right type for
425      * the generated language.  This base implementation returns str unchanged.
426      * @param str A String.
427      */
428     public String   processStringForASTConstructor(String   str) {
429         return str;
430     }
431 
432     /** Get a string for an expression to generate creation of an AST subtree.
433      * @param v A Vector of String, where each element is an expression in the target language yielding an AST node.
434      */
435     public abstract String   getASTCreateString(Vector v);
436 
437     /** Get a string for an expression to generate creation of an AST node.
438      * @param str The text of the arguments to the AST construction
439      */
440     public abstract String   getASTCreateString(GrammarAtom atom, String   str);
441 
442     /** Given the index of a bitset in the bitset list, generate a unique name.
443      * Specific code-generators may want to override this
444      * if the language does not allow '_' or numerals in identifiers.
445      * @param index  The index of the bitset in the bitset list.
446      */
447     protected String   getBitsetName(int index) {
448         return "_tokenSet_" + index;
449     }
450 
451     public static String   encodeLexerRuleName(String   id) {
452         return "m" + id;
453     }
454 
455     public static String   decodeLexerRuleName(String   id) {
456         if ( id==null ) {
457             return null;
458         }
459         return id.substring(1,id.length());
460     }
461 
462     /** Map an identifier to its corresponding tree-node variable.
463      * This is context-sensitive, depending on the rule and alternative
464      * being generated
465      * @param id The identifier name to map
466      * @param tInfo Action translation info handed to the implementation (the signature has no forInput flag) - TODO confirm its exact role
467      * @return The mapped id (which may be the same as the input), or null if the mapping is invalid due to duplicates
468      */
469     public abstract String   mapTreeId(String   id, ActionTransInfo tInfo);
470 
471     /** Add a bitset to the list of bitsets to be generated.
472      * if the bitset is already in the list, ignore the request.
473      * Always adds the bitset to the end of the list, so the
474      * caller can rely on the position of bitsets in the list.
475      * The returned position can be used to format the bitset
476      * name, since it is invariant.
477      * @param p Bit set to mark for code generation
478      * @param forParser true if the bitset is used for the parser, false for the lexer
479      * @return The position of the bitset in the list.
480      */
481     protected int markBitsetForGen(BitSet p) {
482         // Is the bitset (or an identical one) already marked for gen?
483         for (int i = 0; i < bitsetsUsed.size(); i++) {
484             BitSet set = (BitSet)bitsetsUsed.elementAt(i);
485             if (p.equals(set)) {
486                 // Use the identical one already stored
487                 return i;
488             }
489         }
490 
491         // Add the new bitset
492         bitsetsUsed.appendElement(p.clone());
493         return bitsetsUsed.size() - 1;
494     }
495 
496     /** Output tab indent followed by a String, to the currentOutput stream.
497      * Ignored if string is null.
498      * @param s The string to output.
499      */
500     protected void print(String   s) {
501         if (s != null) {
502             printTabs();
503             currentOutput.print(s);
504         }
505     }
506 
507     /** Print an action with leading tabs, attempting to
508      * preserve the current indentation level for multi-line actions
509      * Ignored if string is null.
510      * @param s The action string to output
511      */
512     protected void printAction(String   s) {
513         if (s != null) {
514             printTabs();
515             _printAction(s);
516         }
517     }
518 
519     /** Output tab indent followed by a String followed by newline,
520      * to the currentOutput stream.  Ignored if string is null.
521      * @param s The string to output
522      */
523     protected void println(String   s) {
524         if (s != null) {
525             printTabs();
526             currentOutput.println(s);
527         }
528     }
529 
530     /** Output the current tab indentation.  This outputs the number of tabs
531      * indicated by the "tabs" variable to the currentOutput stream.
532      */
533     protected void printTabs() {
534         for (int i = 1; i <= tabs; i++) {
535             currentOutput.print("\t");
536         }
537     }
538 
539     /** Lexically process $ and # references within the action.
540      *  This will replace #id and #(...) with the appropriate
541      *  function calls and/or variables etc.  Each concrete generator supplies the implementation.
542      */
543     protected abstract String   processActionForSpecialSymbols(String   actionStr,
544                                                              int line,
545                                                              RuleBlock currentRule,
546                                                              ActionTransInfo tInfo);
547 
548     public String   getFOLLOWBitSet(String   ruleName, int k) {
549         GrammarSymbol rs = grammar.getSymbol(ruleName);
550         if ( !(rs instanceof RuleSymbol) ) {
551             return null;
552         }
553         RuleBlock blk = ((RuleSymbol)rs).getBlock();
554         Lookahead follow = grammar.theLLkAnalyzer.FOLLOW(k, blk.endNode);
555         String   followSetName = getBitsetName(markBitsetForGen(follow.fset));
556         return followSetName;
557     }
558 
559     public String   getFIRSTBitSet(String   ruleName, int k) {
560         GrammarSymbol rs = grammar.getSymbol(ruleName);
561         if ( !(rs instanceof RuleSymbol) ) {
562             return null;
563         }
564         RuleBlock blk = ((RuleSymbol)rs).getBlock();
565         Lookahead first = grammar.theLLkAnalyzer.look(k, blk);
566         String   firstSetName = getBitsetName(markBitsetForGen(first.fset));
567         return firstSetName;
568     }
569 
570     /**
571      * Remove the assignment portion of a declaration, if any.
572      * @param d the declaration
573      * @return the declaration without any assignment portion
574      */
575     protected String   removeAssignmentFromDeclaration(String   d) {
576         // If d contains an equal sign, then it's a declaration
577         // with an initialization.  Strip off the initialization part.
578         if (d.indexOf('=') >= 0) d = d.substring(0, d.indexOf('=')).trim();
579         return d;
580     }
581 
582     /** Set all fields back like one just created */
583     private void reset() {
584         tabs = 0;
585         // Allocate list of bitsets tagged for code generation
586         bitsetsUsed = new Vector();
587         currentOutput = null;
588         grammar = null;
589         DEBUG_CODE_GENERATOR = false;
590         makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD;
591         bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD;
592     }
593 
594     public static String   reverseLexerRuleName(String   id) {
595         return id.substring(1, id.length());
596     }
597 
598     public void setAnalyzer(LLkGrammarAnalyzer analyzer_) {
599         analyzer = analyzer_;
600     }
601 
602     public void setBehavior(DefineGrammarSymbols behavior_) {
603         behavior = behavior_;
604     }
605 
606     /** Set a grammar for the code generator to use */
607     protected void setGrammar(Grammar g) {
608         reset();
609         grammar = g;
610         // Lookup make-switch threshold in the grammar generic options
611         if (grammar.hasOption("codeGenMakeSwitchThreshold")) {
612             try {
613                 makeSwitchThreshold = grammar.getIntegerOption("codeGenMakeSwitchThreshold");
614                 //System.out.println("setting codeGenMakeSwitchThreshold to " + makeSwitchThreshold);
615             }
616             catch (NumberFormatException   e) {
617                 Token tok = grammar.getOption("codeGenMakeSwitchThreshold");
618                 antlrTool.error(
619                     "option 'codeGenMakeSwitchThreshold' must be an integer",
620                     grammar.getClassName(),
621                     tok.getLine(), tok.getColumn()
622                 );
623             }
624         }
625 
626         // Lookup bitset-test threshold in the grammar generic options
627         if (grammar.hasOption("codeGenBitsetTestThreshold")) {
628             try {
629                 bitsetTestThreshold = grammar.getIntegerOption("codeGenBitsetTestThreshold");
630                 //System.out.println("setting codeGenBitsetTestThreshold to " + bitsetTestThreshold);
631             }
632             catch (NumberFormatException   e) {
633                 Token tok = grammar.getOption("codeGenBitsetTestThreshold");
634                 antlrTool.error(
635                     "option 'codeGenBitsetTestThreshold' must be an integer",
636                     grammar.getClassName(),
637                     tok.getLine(), tok.getColumn()
638                 );
639             }
640         }
641 
642         // Lookup debug code-gen in the grammar generic options
643         if (grammar.hasOption("codeGenDebug")) {
644             Token t = grammar.getOption("codeGenDebug");
645             if (t.getText().equals("true")) {
646                 //System.out.println("setting code-generation debug ON");
647                 DEBUG_CODE_GENERATOR = true;
648             }
649             else if (t.getText().equals("false")) {
650                 //System.out.println("setting code-generation debug OFF");
651                 DEBUG_CODE_GENERATOR = false;
652             }
653             else {
654                 antlrTool.error("option 'codeGenDebug' must be true or false", grammar.getClassName(), t.getLine(), t.getColumn());
655             }
656         }
657     }
658 
659     public void setTool(Tool tool) {
660         antlrTool = tool;
661     }
662 }
663
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags