KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > fri > patterns > interpreter > parsergenerator > lexer > LexerBuilder


1 package fri.patterns.interpreter.parsergenerator.lexer;
2
3 import java.util.*;
4 import java.io.IOException JavaDoc;
5 import fri.patterns.interpreter.parsergenerator.Lexer;
6 import fri.patterns.interpreter.parsergenerator.Token;
7 import fri.patterns.interpreter.parsergenerator.syntax.*;
8 import fri.patterns.interpreter.parsergenerator.syntax.builder.SyntaxSeparation;
9
10 /**
11     Generates a Lexer from a Syntax. The Syntax can contain also parser rules.
12     These will be retrievable (without the removed lexer rules) after build by
13     calling <i>lexer.getParserSyntax()</i>.
14     <p>
15     The syntax rules may not contain '(', ')', '*', '+' or '?', but they may
16     contain character set symbols like ".." (set definitions) and "-" (intersections).
17     <p>
18     The syntax may contain identifiers enclosed within `backquotes`.
19     This marks predefined lexer rules defined in <i>StandardLexerRules</i>.
20     That class contains default rules for numbers, identifiers, stringdefinitions,
21     characterdefinitions and many other (e.g. for XML), which can be used to build
22     lexers.<br>
23     <b>CAUTION:</b> Lexer and parser rules have the same namespace, you can not define
24     <pre>
25         identifier ::= `identifier`; // wrong!
26     </pre>
27     Nevertheless you need not to care about the names silently imported from
28     <i>StandardLexerRules</i>, they will not reduce the parser syntax namespace,
29     only the toplevel rules will.
30     <p>
31     The syntax may contain (case-sensitive) these nonterminals:
32     <ul>
33         <li>token</li>
34         <li>ignored</li>
35     </ul>
36     These are lexer-reserved identifiers and can be used to mark top level lexer
37     rules (tokens). When <b>token</b> is used, the builder does not try to recognize any
38     rule as lexer rule, so this must be good modeled. Be careful: you can read away
39     comments only by using <b>ignored</b>. But you can define <b>ignored</b> without <b>token</b>,
40     then nevertheless the builder tries to recognize lexer rules.<br>
41     When the <b>token</b> marker is not used, the builder tries to separate lexer from
42     parser rules.
43     <p>
44     Example:
45     <pre>
46         token ::= `identifier` ; // using StandardLexerRules
47         ignored ::= `spaces` ;
48         ignored ::= `newline` ;
49         ignored ::= comment ;
50         comment ::= "//" char_minus_newline_list_opt ;
51         char_minus_newline ::= chars - newline;
52         char_minus_newline_list ::= char_minus_newline_list char_minus_newline;
53         char_minus_newline_list ::= char_minus_newline ;
54         char_minus_newline_list_opt ::= char_minus_newline_list;
55         char_minus_newline_list_opt ::= ; // nothing
56     </pre>
57     Mind that the builder input can not be a text file, it must be wrapped into <i>Syntax</i>.
58     Use syntax builder to convert a text into a <i>Syntax</i> object.
59     <p>
60     Java code fragment:
61     <pre>
62         SyntaxSeparation separation = new SyntaxSeparation(new Syntax(myRules));
63         LexerBuilder builder = new LexerBuilder(separation.getLexerSyntax(), separation.getIgnoredSymbols());
64         Lexer lexer = builder.getLexer();
65         // when using the lexer standalone (without Parser), you must put the token terminal symbols into it now:
66         lexer.setTerminals(separation.getTokenSymbols());
67     </pre>
68     
69     @see fri.patterns.interpreter.parsergenerator.syntax.builder.SyntaxSeparation
70     @see fri.patterns.interpreter.parsergenerator.lexer.StandardLexerRules
71     @author (c) 2002, Fritz Ritzberger
72 */

73
74 public class LexerBuilder
75 {
76     protected Map charConsumers;
77     protected List ignoredSymbols;
78     public static boolean DEBUG; // defaults to false
79

80     /**
81         Creates a LexerBuilder (from lexer rules) that provides a Lexer.
82         @param lexerSyntax lexer rule (without token and ignored, use SyntaxSeparation for that)
83         @param ignoredSymbols list of ignored symbols, NOT enclosed in backquotes!
84     */

85     public LexerBuilder(Syntax lexerSyntax, List ignoredSymbols)
86         throws LexerException, SyntaxException
87     {
88         this.ignoredSymbols = ignoredSymbols;
89         build(lexerSyntax);
90     }
91
92
93     /** Returns the built Lexer. */
94     public Lexer getLexer() {
95         return new LexerImpl(ignoredSymbols, charConsumers);
96     }
97         
98     /** Returns the built Lexer, loaded with passed input (file, stream, string, ...). */
99     public Lexer getLexer(Object JavaDoc input)
100         throws IOException JavaDoc
101     {
102         Lexer lexer = getLexer();
103         lexer.setInput(input);
104         return lexer;
105     }
106
107
108     private void build(Syntax lexerSyntax)
109         throws LexerException, SyntaxException
110     {
111         SyntaxSeparation.IntArray deleteIndexes = new SyntaxSeparation.IntArray(lexerSyntax.size());
112         if (DEBUG)
113             System.err.println("Processing lexer rules: \n"+lexerSyntax);
114
115         // resolve scanner rules to Consumers and put it into a hashtable
116
this.charConsumers = new Hashtable(lexerSyntax.size());
117         for (int i = 0; i < lexerSyntax.size(); i++)
118             translateLexerRule(lexerSyntax.getRule(i), i, deleteIndexes);
119         deleteIndexes.removeIndexesFrom(lexerSyntax);
120         
121         // check for unresolved repeatable and nullable rules and delete them from lexer syntax
122
for (int i = 0; i < lexerSyntax.size(); i++) {
123             Rule rule = lexerSyntax.getRule(i);
124             String JavaDoc nonterm = rule.getNonterminal();
125             if (checkNullableRule(nonterm, rule, i, deleteIndexes) == false)
126                 if (checkRepeatableRule(nonterm, rule, i, deleteIndexes) == false)
127                     throw new LexerException("Found no character consumer for nullable or repeatable rule "+rule);
128         }
129         deleteIndexes.removeIndexesFrom(lexerSyntax);
130         
131         if (lexerSyntax.size() > 0) { // not all rules have been resolved to character consumers
132
throw new LexerException("Could not process rules in lexer syntax: "+lexerSyntax);
133         }
134
135         // resolve all symbolic consumer references after all consumers have been created
136
Map done = new Hashtable(); // beware of recursion
137
for (Iterator it = charConsumers.entrySet().iterator(); it.hasNext(); ) {
138             Consumer cc = (Consumer) ((Map.Entry)it.next()).getValue();
139             cc.resolveConsumerReferences(charConsumers, done);
140         }
141     }
142
143
144     private void translateLexerRule(Rule rule, int index, SyntaxSeparation.IntArray deleteIndexes)
145         throws LexerException
146     {
147         String JavaDoc nonterm = rule.getNonterminal();
148         if (rule.rightSize() <= 0 || rule.getRightSymbol(0).equals(nonterm)) // nullable rules and left recursive rules will be resolved later
149
return;
150         
151         //System.err.println("translating lexer rule: "+rule);
152

153         // ExtendedGrammar should have resolved all parenthesis expressions and wildcards.
154
// We take away rules that are:
155
// - single character position definitions like
156
// nonterm ::= '0' .. '9'
157
// nonterm ::= 'a' .. 'z' - 'm' .. 'n'
158
// nonterm ::= something - "string" // "something" must be among scanner rules
159
// - single string terminal definitions like
160
// nonterm ::= "string"
161
// nonterm ::= 'c' 'd' 'e' "fgh" // do concatenation
162
// - scanner nonterminals concatenations like
163
// nonterm ::= something1 something2 // "somethingN" must be among scanner rules
164
// - scanner nonterminals concatenations like
165
// nonterm ::= "string"
166

167         int CONCATENATION = 0, SET = 1, SUBTRACTION = 2;
168         int state = CONCATENATION;
169         boolean intersectionHappened = false;
170         Consumer consumer = new Consumer(rule); // master consumer
171
Consumer currentConsumer = new Consumer();
172         Consumer setConsumer = currentConsumer; // will host set definitions
173
consumer.append(currentConsumer); // will be resolved when trivial
174

175         for (int i = 0; i < rule.rightSize(); i++) { // loop all symbols on right side
176
String JavaDoc sym = rule.getRightSymbol(i);
177             
178             if (sym.equals(Token.BUTNOT)) {
179                 if (i == 0 || state != CONCATENATION)
180                     throw new LexerException("Missing symbol to subtract from: "+rule);
181                 state = SUBTRACTION;
182             }
183             else
184             if (sym.equals(Token.UPTO)) {
185                 if (i == 0 || state != CONCATENATION)
186                     throw new LexerException("Missing lower limit of set: "+rule);
187                 state = SET;
188             }
189             else {
190                 String JavaDoc convertedSym = convertSymbol(sym); // remove quotes or convert number to char
191
boolean isNonterm = convertedSym.equals(sym);
192                 if (isNonterm && state == SET)
193                     throw new LexerException("Can not append nonterminal to set: "+rule);
194
195                 boolean setWillHappen = rule.rightSize() > i + 1 && rule.getRightSymbol(i + 1).equals(Token.UPTO); // next symbol will be ".."
196

197                 if (state == SET) {
198                     setConsumer.appendSet(convertedSym);
199                     setConsumer = currentConsumer; // reset if intersection happened
200
}
201                 else
202                 if (state == SUBTRACTION) {
203                     intersectionHappened = true;
204                     if (isNonterm)
205                         if (setWillHappen)
206                             throw new LexerException("Nonterminal can not open set after subtraction: "+rule);
207                         else
208                             currentConsumer.subtract(new Consumer.Reference(sym));
209                     else
210                         if (setWillHappen)
211                             currentConsumer.subtract(setConsumer = new Consumer(convertedSym));
212                         else
213                             currentConsumer.subtract(new Consumer(convertedSym));
214                 }
215                 else
216                 if (state == CONCATENATION) {
217                     if (intersectionHappened) { // start new consumer
218
intersectionHappened = false;
219                         currentConsumer = new Consumer();
220                         consumer.append(currentConsumer);
221                     }
222                     
223                     if (isNonterm)
224                         if (setWillHappen)
225                             throw new LexerException("Nonterminal can not open set in concatenation: "+rule);
226                         else
227                             currentConsumer.append(new Consumer.Reference(sym));
228                     else
229                         currentConsumer.append(convertedSym); // a following set will be recognized by consumer
230
}
231
232                 state = CONCATENATION; // reset to normal state
233

234             } // end switch current symbol
235
} // end for right side of rule
236

237         putCharConsumer(nonterm, consumer.optimize());
238         deleteIndexes.add(index);
239     }
240
241
242     private void putCharConsumer(String JavaDoc key, Consumer consumer) {
243         //System.err.println("putting character consumer for "+key);
244
Object JavaDoc o = charConsumers.get(key); // test if existing
245

246         if (o == null) { // not in list
247
charConsumers.put(key, consumer);
248         }
249         else {
250             ConsumerAlternatives ca;
251
252             if (o instanceof ConsumerAlternatives == false) {
253                 ca = new ConsumerAlternatives((Consumer)o);
254                 charConsumers.put(key, ca); // replace consumer
255
}
256             else {
257                 ca = (ConsumerAlternatives)o;
258             }
259             
260             ca.addAlternate(consumer); // add a new alternative
261
}
262     }
263
264
265
266     private boolean checkNullableRule(String JavaDoc nonterm, Rule rule, int index, SyntaxSeparation.IntArray deleteIndexes) {
267         // We take away rules that are optional nonterminals like
268
// nonterm ::= something // "nonterm" is already among scanner rules
269
// nonterm ::= /*nothing*/ // this is the rule to remove now
270

271         if (rule.rightSize() <= 0) {
272             Object JavaDoc o = charConsumers.get(nonterm);
273             ((Consumer)o).setNullable();
274             deleteIndexes.add(index);
275             return true; // do not explore empty rule, return "found nullable"
276
}
277         return false;
278     }
279
280
281     private boolean checkRepeatableRule(String JavaDoc nonterm, Rule rule, int index, SyntaxSeparation.IntArray deleteIndexes) {
282         // We take away rules that are left recursive like
283
// nonterm ::= nonterm something // this is the rule to remove now
284
// nonterm ::= something // "nonterm" must be already among scanner rules
285

286         // check for nonterm in hashtable, set it repeatable if found
287
if (rule.rightSize() >= 2 && rule.getRightSymbol(0).equals(nonterm)) { // left recursive
288
Consumer cc = (Consumer) charConsumers.get(nonterm);
289             if (cc.matchesRepeatableRule(rule)) { // check if rest on the right is same
290
cc.setRepeatable();
291                 deleteIndexes.add(index);
292                 return true;
293             }
294         }
295         return false;
296     }
297
298     
299     /**
300         Converts a character or string definition to its processable form.
301         This implementation must be according to "bnf_chardef" in <i>StandardLexerRules</i>.
302         <ul>
303             <li>0xFFFF hexadecimal, convert to character</li>
304             <li>0777 octal, convert to character</li>
305             <li>12345 decimal, convert to character</li>
306             <li>'c' character, remove quotes</li>
307             <li>'\n', "\n" escaped character, decode</li>
308             <li>"string" string, remove quotes</li>
309             <li>nonterminal - will stay unchanged</li>
310         </ul>
311     */

312     private String JavaDoc convertSymbol(String JavaDoc sym) {
313         if (sym.charAt(0) == '\'' || sym.charAt(0) == '"') {
314             String JavaDoc s = sym.substring(1, sym.length() - 1);
315             if (s.length() <= 0)
316                 throw new IllegalArgumentException JavaDoc("Empty character or string definition: "+sym);
317
318             StringBuffer JavaDoc sb = new StringBuffer JavaDoc(s.length()); // convert escape sequences to their real meaning
319
for (int i = 0; i < s.length(); i++) {
320                 char c = s.charAt(i);
321                 if (c == '\\') {
322                     char c1 = s.length() > i + 1 ? s.charAt(i + 1) : 0;
323                     switch (c1) {
324                         case 'n': sb.append('\n'); i++; break;
325                         case 'r': sb.append('\r'); i++; break;
326                         case 't': sb.append('\t'); i++; break;
327                         case 'f': sb.append('\f'); i++; break;
328                         case 'b': sb.append('\b'); i++; break;
329                         case '\'': sb.append('\''); i++; break;
330                         case '"': sb.append('"'); i++; break;
331                         case '\\': sb.append('\\'); i++; break;
332                         default: sb.append(c); break;
333                     }
334                 }
335                 else {
336                     sb.append(c);
337                 }
338             }
339             return sb.toString();
340         }
341         else { // must be starting with digit or be a nonterminal
342
char c;
343             if (sym.startsWith("0x") || sym.startsWith("0X")) // hexadecimal number
344
c = (char) Integer.valueOf(sym.substring(2), 16).intValue();
345             else
346             if (sym.startsWith("0")) // octal number
347
c = (char) Integer.valueOf(sym.substring(1), 8).intValue();
348             else
349             if (Character.isDigit(sym.charAt(0)))
350                 c = (char) Integer.valueOf(sym).intValue(); // will throw NumberFormatException when not number
351
else
352                 return sym; // is a nonterminal
353

354             return new String JavaDoc(new char [] { c });
355         }
356     }
357     
358 }
359
Popular Tags