KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > fri > patterns > interpreter > parsergenerator > syntax > builder > SyntaxSeparation


1 package fri.patterns.interpreter.parsergenerator.syntax.builder;
2
3 import java.util.*;
4 import fri.patterns.interpreter.parsergenerator.Token;
5 import fri.patterns.interpreter.parsergenerator.syntax.*;
6 import fri.patterns.interpreter.parsergenerator.lexer.StandardLexerRules;
7
8 /**
9     Separates lexer from parser rules. Provides token and ignored symbols
10     after construction, which can be used for LexerBuilder construction.
11     <p>
12     Example:
13     <pre>
14         SyntaxSeparation separation = new SyntaxSeparation(new Syntax(myRules));
15         LexerBuilder builder = new LexerBuilder(separation.getLexerSyntax(), separation.getIgnoredSymbols());
16         Lexer lexer = builder.getLexer();
17         // when using the lexer standalone, you must put the token terminal symbols into it now:
18         lexer.setTerminals(separation.getTokenSymbols());
19         // when using a Parser the Parser will do this:
20     </pre>
21     
22     @see fri.patterns.interpreter.parsergenerator.lexer.LexerBuilder
23     @author (c) 2002, Fritz Ritzberger
24 */

25
26 public class SyntaxSeparation
27 {
28     private List tokenSymbols;
29     private List ignoredSymbols;
30     private Syntax parserSyntax;
31     private Syntax lexerSyntax;
32     public static boolean DEBUG = true;
33
34     /** Splits a syntax into two syntaxes containing parser and lexer rules, tokens and ignored get collected. */
35     public SyntaxSeparation(Syntax syntax)
36         throws SyntaxException
37     {
38         separate(syntax, new IntArray(syntax.size()));
39     }
40
41     /** Returns the lexer syntax that remains after removing all parser rules and token/ignored directives. */
42     public Syntax getLexerSyntax() {
43         return lexerSyntax;
44     }
45
46     /** Returns the parser syntax that remains after removing all lexer rules. All lexer tokens are marked with `backquotes`. */
47     public Syntax getParserSyntax() {
48         return parserSyntax;
49     }
50
51     /**
52         Returns the top level token symbols for the lexer. These symbols ARE enclosed within `backquotes`.
53         This can be used at <i>setTerminals</i> call for a standalone Lexer.
54     */

55     public List getTokenSymbols() {
56         return tokenSymbols;
57     }
58
59     /**
60         Returns the top level ignored token symbols for the lexer. These symbols are NOT enclosed within `backquotes`.
61         This will be used by the LexerBuilder for feeding the LexerImpl with ignored symbols.
62     */

63     public List getIgnoredSymbols() {
64         return ignoredSymbols;
65     }
66
67
68     private void separate(Syntax syntax, IntArray deleteIndexes)
69         throws SyntaxException
70     {
71         Hashtable tokenSymbols = new Hashtable();
72         Hashtable ignoredSymbols = new Hashtable();
73         List commandsDefinedAsTOKEN = new ArrayList();
74
75         // take away all rules defined by "token" and "ignored"
76
for (int i = 0; i < syntax.size(); i++) {
77             Rule rule = syntax.getRule(i);
78             String JavaDoc nonterm = rule.getNonterminal();
79             
80             boolean token = nonterm.equals(Token.TOKEN);
81             boolean ignored = token == false && nonterm.equals(Token.IGNORED);
82             
83             if (token || ignored) { // special token definition rule
84
if (rule.rightSize() != 1) // token/ignored must not contain more than one item on right side (alternatives being resoved)
85
throw new SyntaxException("\"token\" and \"ignored\" are predefined lexer keywords and must contain exactly one nonterminal symbol after resolve: "+rule);
86
87                 deleteIndexes.add(i);
88
89                 String JavaDoc sym = rule.getRightSymbol(0); // the token identifier
90
if (sym.charAt(0) == Token.COMMAND_QUOTE) {
91                     sym = sym.substring(1, sym.length() - 1); // take off quotes
92
if (token)
93                         commandsDefinedAsTOKEN.add(sym);
94                 }
95
96                 if (token)
97                     tokenSymbols.put(sym, sym);
98                 else
99                     ignoredSymbols.put(sym, sym);
100             }
101         }
102         deleteIndexes.removeIndexesFrom(syntax);
103
104         // check if tokens are defined as ignored or vice versa
105
for (Iterator it = tokenSymbols.keySet().iterator(); it.hasNext(); ) {
106             Object JavaDoc o = it.next();
107             if (ignoredSymbols.get(o) != null)
108                 throw new SyntaxException("Can not define token as ignored: "+o);
109         }
110         for (Iterator it = ignoredSymbols.keySet().iterator(); it.hasNext(); ) {
111             Object JavaDoc o = it.next();
112             if (tokenSymbols.get(o) != null)
113                 throw new SyntaxException("Can not define ignored as token: "+o);
114         }
115         
116         boolean tokensWereDeclared = tokenSymbols.size() > 0;
117         List commandTokens = new ArrayList();
118         
119         // collect all `lexertokens`.
120
for (int i = 0; i < syntax.size(); i++) {
121             Rule rule = syntax.getRule(i);
122
123             for (int j = 0; j < rule.rightSize(); j++) { // loop right side of rule
124
String JavaDoc sym = rule.getRightSymbol(j);
125                 
126                 if (sym.charAt(0) == Token.COMMAND_QUOTE) {
127                     sym = sym.substring(1, sym.length() - 1); // remove command quotes
128
tokenSymbols.put(sym, sym);
129                     commandTokens.add(sym);
130                     rule.setRightSymbol(sym, j); // remove the quotes for being able to recognize the symbol later
131
}
132                 else
133                 // get out all rules containing ".." and "-" when no "token" was defined explicitely, as these must be scanner rules
134
if (tokensWereDeclared == false && (sym.equals(Token.BUTNOT) || sym.equals(Token.UPTO))) {
135                     String JavaDoc s = rule.getNonterminal(); // take this rule to lexer rules
136
tokenSymbols.put(s, s);
137                 }
138             }
139         }
140
141         // Get out all scanner rules from parser syntax, recursively
142
this.lexerSyntax = new Syntax(tokenSymbols.size() + ignoredSymbols.size());
143         Hashtable [] varr = new Hashtable [] { tokenSymbols, ignoredSymbols };
144         
145         for (int j = 0; j < varr.length; j++) { // loop token and ignored lists
146
Hashtable symbols = varr[j];
147             
148             for (Enumeration e = symbols.keys(); e.hasMoreElements(); ) {
149                 String JavaDoc nonterm = (String JavaDoc)e.nextElement();
150                 
151                 getRulesUnderSymbol(nonterm, syntax, lexerSyntax, deleteIndexes);
152                 
153                 if (deleteIndexes.isEmpty() && lexerSyntax.hasRule(nonterm) == false) { // if there were no rules for symbol
154
String JavaDoc [][] predefinedRules = StandardLexerRules.rulesForIdentifier(nonterm); // look for predefined rule
155
if (predefinedRules == null || predefinedRules.length <= 0)
156                         throw new SyntaxException("Found nonterminal that has no rule and is no predefined lexer nonterminal: >"+nonterm+"<");
157                     
158                     lexerSyntax.appendRules(SyntaxUtil.ruleArrayToList(predefinedRules));
159                 }
160                 
161                 deleteIndexes.removeIndexesFrom(syntax);
162             }
163         }
164         
165         // add ignoredSymbols to member variable lists
166
this.ignoredSymbols = new ArrayList(ignoredSymbols.size());
167         for (Enumeration e = ignoredSymbols.keys(); e.hasMoreElements(); )
168             this.ignoredSymbols.add(e.nextElement());
169
170         this.parserSyntax = provideParserSyntax(syntax, lexerSyntax, tokensWereDeclared, tokenSymbols, commandTokens, commandsDefinedAsTOKEN);
171
172         //System.err.println("Parser syntax:\n"+syntax);
173
//System.err.println("Lexer syntax:\n"+lexerSyntax);
174
//System.err.println("token symbols: "+tokenSymbols);
175
//System.err.println("ignored symbols: "+ignoredSymbols);
176
}
177
178
179     private Syntax provideParserSyntax(Syntax parserSyntax, Syntax lexerSyntax, boolean tokensWereDeclared, Map tokenSymbols, List commandTokens, List commandsDefinedAsTOKEN)
180         throws SyntaxException
181     {
182         boolean lexerOnlyHandling = false;
183         
184         // when this was a mixed grammar: mark all tokens and ignored symbols as lexer `terminals` in parserSyntax
185
if (parserSyntax.size() > 0) {
186             if (DEBUG) System.err.println("INFO: Mixed parser and lexer specification, "+lexerSyntax.size()+" lexer rules, "+parserSyntax.size()+" parser rules.");
187             this.tokenSymbols = new ArrayList(tokenSymbols.size()); // keep only toplevel token symbols
188

189             for (int i = 0; i < parserSyntax.size(); i++) {
190                 Rule rule = parserSyntax.getRule(i);
191                 
192                 for (int j = 0; j < rule.rightSize(); j++) {
193                     String JavaDoc sym = rule.getRightSymbol(j);
194                     
195                     if (tokenSymbols.get(sym) != null) { // this is a lexer token, mask it with `backquotes`
196
String JavaDoc parserSymbol = Token.COMMAND_QUOTE + sym + Token.COMMAND_QUOTE;
197                         if (sym.charAt(0) != Token.COMMAND_QUOTE) // mark symbol with backquotes as lexer token
198
rule.setRightSymbol(parserSymbol, j);
199                             
200                         if (this.tokenSymbols.indexOf(parserSymbol) < 0)
201                             this.tokenSymbols.add(parserSymbol);
202                             // add top level token as `symbol` (with backquotes), as Parser will pass terminals like that,
203
// and the Lexer should also be usable without Parser.
204
}
205                     else
206                     if (sym.equals(Token.UPTO) || sym.equals(Token.BUTNOT)) {
207                         throw new SyntaxException("Found lexer rule in parser syntax: "+rule+". Please define \"token\" and \"ignored\" better!");
208                     }
209                     else
210                     if (Token.isTerminal(sym) == false) { // check if there is a nonterminal without rule on right side and try to find it among lexer rules
211
boolean found = parserSyntax.hasRule(sym);
212                         if (found == false) { // try to find it in lexer rules and make it a token when found, else throw error
213
if (lexerSyntax.hasRule(sym)) {
214                                 String JavaDoc parserSymbol = Token.COMMAND_QUOTE + sym + Token.COMMAND_QUOTE;
215                                 rule.setRightSymbol(parserSymbol, j);
216                                 if (this.tokenSymbols.indexOf(parserSymbol) < 0)
217                                     this.tokenSymbols.add(parserSymbol);
218                             }
219                             else {
220                                 throw new SyntaxException("Parser nonterminal without rule: "+sym);
221                             }
222                         }
223                     }
224                 }
225             }
226         }
227         else // no declared tokens and no parser syntax remained
228
if (tokensWereDeclared == false) { // find top level lexer rules and put them into token list marked with `backquotes`
229
if (DEBUG) System.err.println("INFO: No tokens were defined, lexer specification without parser rules, "+lexerSyntax.size()+" lexer rules.");
230             List startRules = lexerSyntax.findStartRules();
231             
232             if (startRules.size() > 0) {
233                 this.tokenSymbols = new ArrayList(startRules.size()); // allocate new list only if there were start rules
234

235                 for (int i = 0; i < startRules.size(); i++) {
236                     String JavaDoc symbol = Token.COMMAND_QUOTE + ((Rule) startRules.get(i)).getNonterminal() + Token.COMMAND_QUOTE;
237                     if (this.tokenSymbols.indexOf(symbol) < 0) // add unique
238
this.tokenSymbols.add(symbol);
239                 }
240             }
241             else {
242                 lexerOnlyHandling = true;
243             }
244         }
245         else {
246             if (DEBUG) System.err.println("INFO: tokens were defined, lexer specification without parser rules, "+lexerSyntax.size()+" lexer rules.");
247             lexerOnlyHandling = true;
248         }
249         
250         if (lexerOnlyHandling) { // no parser syntax remained, and tokens were defined or no start rules were found
251
for (int i = 0; i < commandTokens.size(); i++) { // no parser syntax, remove collected `lexercommands` when they were not defined in TOKEN definition
252
String JavaDoc sym = (String JavaDoc) commandTokens.get(i);
253                 if (commandsDefinedAsTOKEN.indexOf(sym) < 0)
254                     tokenSymbols.remove(sym);
255             }
256
257             // add tokenSymbols to member variable lists
258
this.tokenSymbols = new ArrayList(tokenSymbols.size());
259             for (Iterator it = tokenSymbols.keySet().iterator(); it.hasNext(); )
260                 this.tokenSymbols.add(Token.COMMAND_QUOTE + it.next().toString() + Token.COMMAND_QUOTE);
261                 // must enclose all explicit token symbols in `backquotes`
262
}
263         
264         return parserSyntax;
265     }
266
267
268     /*
269         Appends all rules in passed syntax deriving passed symbol and all their
270         sub-rules, recursively, to passed result-syntax. Fills the deleteIndex
271         with every appended rule index, but does not delete the rule from source syntax.
272         @param symbol nonterminal to search
273         @param syntax source syntax that will be searched
274         @param resultSyntax target syntax to append the found rules to
275         @param deleteIndexes receives indexes of all appended rules for later deletion
276     */

277     private void getRulesUnderSymbol(String JavaDoc symbol, Syntax syntax, Syntax resultSyntax, IntArray deleteIndexes) {
278         for (int i = 0; i < syntax.size(); i++) {
279             Rule rule = syntax.getRule(i);
280             String JavaDoc nonterm = rule.getNonterminal();
281             
282             // check if rule derives symbol and was not already appended (recursive rules!)
283
if (deleteIndexes.contains(i) == false && nonterm.equals(symbol)) {
284                 resultSyntax.addRule(rule);
285                 deleteIndexes.add(i);
286                 
287                 // check for further nonterminals on right side
288
for (int j = 0; j < rule.rightSize(); j++) {
289                     String JavaDoc sym = rule.getRightSymbol(j);
290                     if (Token.isTerminal(sym) == false && sym.equals(Token.BUTNOT) == false && sym.equals(Token.UPTO) == false)
291                         getRulesUnderSymbol(sym, syntax, resultSyntax, deleteIndexes);
292                 }
293             }
294         }
295     }
296
297
298
299     // Array implementation used to store indexes for fast and safe deletion of rules from syntaxes
300
public static class IntArray
301     {
302         private int [] array;
303         private int pos;
304         
305         public IntArray(int size) {
306             array = new int [size];
307         }
308         
309         public void add(int i) {
310             if (pos >= array.length) {
311                 int [] newArray = new int [array.length * 2];
312                 System.arraycopy(array, 0, newArray, 0, array.length);
313                 array = newArray;
314             }
315             array[pos] = i;
316             pos++;
317         }
318         
319         public boolean isEmpty() {
320             return pos == 0;
321         }
322         
323         public boolean contains(int j) {
324             for (int i = 0; i < pos; i++)
325                 if (array[i] == j)
326                     return true;
327             return false;
328         }
329         
330         public void removeIndexesFrom(Syntax syntax) {
331             Arrays.sort(array, 0, pos); // to delete from list indexes must be sorted
332
for (int i = pos - 1; i >= 0; i--)
333                 syntax.removeRule(array[i]); // remove rule
334
pos = 0; // reset position
335
}
336         
337     } // end class IntArray
338

339 }
340
Popular Tags