LexerImpl


1   package fri.patterns.interpreter.parsergenerator.lexer;
2   
3   import java.util.*;
4   import java.io.*;
5   import fri.patterns.interpreter.parsergenerator.Lexer;
6   import fri.patterns.interpreter.parsergenerator.Token;
7   
8   /**
9       This Lexer must be created using LexerBuilder. It knows token and ignored terminals.
10      To get this Lexer working the <i>setTerminals()</i> call must be called at least once.
11      When using the Lexer standalone, the client must do this, else the Parser will
12      call that method.
13      <p>
14      This lexer can be reused, but it can not be loaded with other syntaxes after it
15      has been built for one.
16      
17      @author (c) 2002, Fritz Ritzberger
18  */
19  
20  public class LexerImpl implements
21      Lexer,
22      StrategyFactoryMethod,
23      Serializable
24  {
25      protected Strategy strategy;
26      private List ignoredSymbols;
27      private Map charConsumers;
28      private transient InputText input;
29      private List listeners;
30      private transient boolean debug;
31          
32      /**
33          Creates a Lexer from token- and ignored symbols, and a map of character consumers (built by LexerBuilder).
34          @param ignoredSymbols list of Strings containing ignored symbols to scan. These are NOT enclosed in `backquotes` like tokens.
35          @param charConsumers map with key = nonterminal and value = Consumer.
36      */
37      public LexerImpl(List ignoredSymbols, Map charConsumers)    {
38          setConsumers(ignoredSymbols, charConsumers);
39      }
40      
41      /** Do-nothing constructor for subclasses (currently unused). */
42      protected LexerImpl()   {
43      }
44  
45      /** Implements Lexer. Adds the passed token listener to listener list. */
46      public void addTokenListener(Lexer.TokenListener tokenListener) {
47          if (listeners == null)
48              listeners = new ArrayList(1);
49          listeners.add(tokenListener);
50      }
51      /** Implements Lexer. Removes the passed token listener from listener list. */
52      public void removeTokenListener(Lexer.TokenListener tokenListener)  {
53          if (listeners != null)
54              listeners.remove(tokenListener);
55      }
56      
57  
58      private void setConsumers(List ignoredSymbols, Map charConsumers)   {
59          this.charConsumers = charConsumers; // store for check at setTerminals()
60          this.ignoredSymbols = ignoredSymbols;   // need to know which token should be ignored
61                  
62          for (int i = 0; ignoredSymbols != null && i < ignoredSymbols.size(); i++)   {   // ignored symbols will not be passed by the parser
63              String   sym = (String  ) ignoredSymbols.get(i);
64              Consumer cc = (Consumer) charConsumers.get(sym);
65              ensureStrategy().addIgnoringConsumer(sym, cc);
66          }
67          
68          // propagate this LexerImpl as StrategyFactoryMethod to ConsumerAlternatives
69          for (Iterator it = charConsumers.entrySet().iterator(); it.hasNext(); ) {
70              Consumer c = (Consumer) ((Map.Entry) it.next()).getValue();
71              if (c instanceof ConsumerAlternatives)  {
72                  ((ConsumerAlternatives) c).setStrategyFactoryMethod(this);
73              }
74          }
75      }
76  
77      private Strategy ensureStrategy()   {
78          if (strategy == null)
79              strategy = newStrategy();
80          return strategy;
81      }
82      
83      /** Implements StrategyFactoryMethod. To be overridden to create a derived Strategy implementation. */
84      public Strategy newStrategy()   {
85          return new Strategy();
86      }
87      
88      /**
89          When false, the sort order (significance) of scan items without fixed start character decide what token is returned.
90          When true (default), the scan item (without fixed start character) that scnas longest wins.
91      */
92      public void setCompeteForLongestInput(boolean competeForLongestInput)   {
93          ensureStrategy().setCompeteForLongestInput(competeForLongestInput);
94      }
95      
96  
97      // implementing Lexer
98      
99      /**
100         Implements Lexer: set the input to be scanned. If text is InputStream, no Reader
101         will be used (characters will not be converted).
102         @param text text to scan, as String, StringBuffer, File, InputStream, Reader.
103     */
104     public void setInput(Object   text)
105         throws IOException
106     {
107         input = new InputText(text);
108     }
109 
110     /**
111         Implements Lexer: Parser call to pass all tokens symbols (all enclosed in `backquote`) and literals ("xyz").
112         @param terminals List of String containing "literals" and `lexertokens`.
113     */
114     public void setTerminals(List terminals)    {
115         for (int i = 0; i < terminals.size(); i++)  {
116             String   symbol = (String  ) terminals.get(i);
117             
118             // check if it is a terminal as this is a public call
119             if (symbol.length() <= 2 || Token.isTerminal(symbol) == false)
120                 throw new IllegalArgumentException  ("Terminals must be enclosed within quotes: "+symbol);
121             
122             String   text = symbol.substring(1, symbol.length() - 1); // remove quotes
123             
124             if (ensureStrategy().hasTerminal(symbol) == false)  {   // could have been called for second time
125                 if (symbol.charAt(0) == Token.COMMAND_QUOTE)    {   // is a scan terminal covered by a Consumer
126                     Consumer cc = (Consumer) charConsumers.get(text);
127                     if (cc == null)
128                         throw new IllegalArgumentException  ("Lexer token is not among character consumers: "+text);
129                     else
130                         ensureStrategy().addTokenConsumer(symbol, cc);
131                 }
132                 else    {
133                     ensureStrategy().addTokenConsumer(symbol, new Consumer(text));
134                 }
135             }
136         }   // end for
137         
138         if (debug)
139             System.err.println("StrategyList is:\n"+strategy);
140     }
141 
142 
143     /** Implements Lexer: Does nothing as no states are stored. This Lexer can not be loaded with new syntaxes. */
144     public void clear() {
145     }
146 
147 
148 
149     /**
150         This is an optional functionality of Lexer. It is <b>NOT</b> called by the Parser.
151         It can be used for heuristic reading from an input (not knowing if there is more input
152         after the token was read).
153         <p />
154         The passed LexerSemantic will receive every matched rule (top-down) together with
155         its ResultTree. See <i>lex()</i> for details.
156         
157         @param lexerSemantic the LexerSemantic to be called with every evaluated Rule and its lexing ResultTree.
158         @return a Token with a terminal symbol and its instance text, or a Token with null symbol for error.
159     */
160     public Token getNextToken(LexerSemantic lexerSemantic)
161         throws IOException
162     {
163         return getNextToken(lexerSemantic, null);
164     }
165     
166     /**
167         Implements Lexer: returns the next token from input, or EPSILON when no more input.
168         This is called by the Parser to get the next syntax token from input.
169         When returned <i>token.symbol</i> is null, no input could be recognized (ERROR).
170         @param expectedTokenSymbols contains the expected String token symbols (in keys),
171                 can be null when no Parser drives this Lexer.
172         @return a Token with a terminal symbol and its instance text, or a Token with null symbol for error.
173     */
174     public Token getNextToken(Map expectedTokenSymbols)
175         throws IOException
176     {
177         return getNextToken(null, expectedTokenSymbols);
178     }
179     
180     private Token getNextToken(LexerSemantic lexerSemantic, Map expectedTokenSymbols)
181         throws IOException
182     {
183         if (input == null)
184             throw new IllegalStateException  ("Lexer has no input, call setInput(...).");
185 
186         Token.Address start = new Token.Address(input.getScanLine(), input.getScanColumn(), input.getScanOffset());
187         int c = input.peek();   // read lookahead
188         if (c == Input.EOF)
189             return createToken(Token.EPSILON, null, new Token.Range(start, start));
190 
191         // not EOF, there must be a lexer item or error
192         Strategy.Item item = getNextLexerItem(expectedTokenSymbols, c);
193 
194         if (item != null)   {   // successful scan
195             if (ignoredSymbols != null && ignoredSymbols.indexOf(item.getSymbol()) >= 0)    {
196                 if (listeners != null && listeners.size() > 0)  // creating a token takes time, do it only when listeners are present
197                     fireTokenReceived(createToken(item.getTokenIdentifier(), item.getResultTree(), lexerSemantic), true);
198                 return getNextToken(expectedTokenSymbols);
199             }
200             else    {
201                 Token token = createToken(item.getTokenIdentifier(), item.getResultTree(), lexerSemantic);
202                 fireTokenReceived(token, false);
203                 return token;
204             }
205         }
206 
207         // error state, return an error Token with null symbol
208         Token.Address end = new Token.Address(input.getReadLine(), input.getReadColumn(), input.getScanOffset());
209         return createToken(null, input.getUnreadText(), new Token.Range(start, end));
210     }
211     
212     // strategic scan of next item
213     private Strategy.Item getNextLexerItem(Map expectedTokenSymbols, int lookahead)
214         throws IOException
215     {
216         if (strategy == null)
217             throw new IllegalStateException  ("Lexer has no terminals, call setTerminals(syntaxSeparation.getTokenSymbols()).");
218 
219         Strategy.Item item = strategy.consume(input, lookahead, expectedTokenSymbols);
220         
221         if (item != null)
222             input.resolveBuffer();  // forget old contents
223             
224         return item;
225     }
226     
227     // calls the token listeners with scanned token
228     private void fireTokenReceived(Token token, boolean ignored)    {
229         for (int i = 0; listeners != null && i < listeners.size(); i++)
230             ((Lexer.TokenListener) listeners.get(i)).tokenReceived(token, ignored);
231     }
232 
233     /** Token factory method. Can be overridden to access the lexing ResultTree. Delegates to createToken(tokenIdentifier, text, range). */
234     protected Token createToken(String   tokenIdentifier, ResultTree result, LexerSemantic lexerSemantic) {
235         if (lexerSemantic != null)
236             loopResultTree(result, lexerSemantic);
237         return createToken(tokenIdentifier, result.toString(), result.getRange());  // toString() takes time as it builds the token text
238     }
239     
240     /** Token factory method. Can be overridden to convert token.text to some Java object. */
241     protected Token createToken(String   tokenIdentifier, String   text, Token.Range range) {
242         return new Token(tokenIdentifier, text, range);
243     }
244 
245 
246     /**
247         This is an optional functionality of Lexer. It is <b>NOT</b> called by the Parser.
248         It can be used to run a standalone Lexer with a LexerSemantic, processing a ready-scanned
249         syntax tree. Other than with Parser Semantic no value stack is available for LexerSemantic,
250         and all input will have been read when LexerSemantic is called with the built syntax tree.
251         <p />
252         The passed LexerSemantic will receive every matched rule (top-down) together with
253         its results ResultTree, containing the range within input.
254         ResultTree can be converted to text by calling <i>resultTree.toString()</i>.
255         <p />
256         This method evaluates the input using end-of-input like a parser, that means it returns
257         false if the input was either syntactically incorrect or EOF was not received when all rules
258         have been evaluated.
259         <p />
260         <b>MIND:</b> This method does not call any TokenListener, as the LexerSemantic is expected to
261             dispatch results!
262         
263         @param lexerSemantic the LexerSemantic to be called with every evaluated Rule and its lexing ResultTree.
264         @return true when lexer succeeded (input was syntactically ok), else false.
265     */
266     public boolean lex(LexerSemantic lexerSemantic)
267         throws IOException
268     {
269         int c = input.peek();
270         boolean eof = (c == Input.EOF);
271         boolean error = eof;
272         
273         if (error == false) {
274             Strategy.Item item = getNextLexerItem(null, c);
275             error = (item == null || item.getTokenIdentifier() == null);
276 
277             if (error == false && lexerSemantic != null)
278                 loopResultTree(item.getResultTree(), lexerSemantic);
279 
280             c = input.peek();
281             eof = (c == Input.EOF);
282             error = (eof == false);
283         }
284         
285         if (error)  {
286             dump(System.err);
287             System.err.println("Could not process character '"+(char)c+"' (int "+c+"), at line/column "+input.getScanLine()+"/"+input.getScanColumn()+", at offset "+input.getScanOffset());
288         }
289 
290         return error == false;
291     }
292 
293     /**
294         After top-down lexing this method is called to dispatch all results. Can be overridden to change dispatch logic.
295         This method calls itself recursively with all result tree children. Nonterminals starting with "_" are ignored
296         by default, as this marks artificial rules.
297         @param result lexer result, returns text on getText().
298         @param semantic semantic that dispatches the lexer results.
299         @return a Token with the range and return of the Semantic call for this Rule/ResultTree.
300     */
301     protected void loopResultTree(ResultTree result, LexerSemantic lexerSemantic)   {
302         Set wantedNonterminals = lexerSemantic.getWantedNonterminals();
303         Set ignoredNonterminals = lexerSemantic.getIgnoredNonterminals();
304         String   nonterminal = result.getRule().getNonterminal();
305         
306         if (nonterminal.startsWith(Token.ARTIFICIAL_NONTERMINAL_START_CHARACTER) == false &&
307                 (wantedNonterminals == null || wantedNonterminals.contains(nonterminal)) &&
308                 (ignoredNonterminals == null || ignoredNonterminals.contains(nonterminal) == false))
309         {
310             lexerSemantic.ruleEvaluated(result.getRule(), result);
311         }
312         
313         for (int i = 0; i < result.getChildCount(); i++)    {
314             Object   child = result.getChild(i);
315             if (child instanceof ResultTree)
316                 loopResultTree((ResultTree) child, lexerSemantic);
317         }
318     }
319 
320 
321 
322     // debug methods
323     
324     /** Implements Lexer: Set debug on to output information about scanned tokens. */
325     public void setDebug(boolean debug) {
326         this.debug = debug;
327     }
328 
329     /** Returns the current line, as far as read. */
330     public String   getLineText() {
331         return input.getLine();
332     }
333     
334     /** Returns the number of the current line, 1-n. */
335     public int getLine()    {
336         return input.getReadLine();
337     }
338 
339     /** Returns the position within the current line, 0-n. */
340     public int getColumn()  {
341         return input.getReadColumn();
342     }
343     
344     /** Returns the offset read so far from input. This is an absolute offset, including newlines. */
345     public int getOffset()  {
346         return input.getScanOffset();
347     }
348     
349 
350     /** Outputs current and previous line, with line numbers. Call this on ERROR. */
351     public void dump(PrintStream out)   {
352         int lineNr = input.getReadLine();
353         String   line = getLineText();
354         
355         if (lineNr > 1) {
356             String   prevLine = input.getPreviousLine();
357             out.print((lineNr - 1)+":\t");
358             out.println(prevLine);
359         }
360         
361         out.print(lineNr+":\t");
362         out.println(line);
363 
364         int nrLen = Integer.toString(lineNr).length();
365         for (int i = 0; i < nrLen; i++)
366             out.print(" ");
367 
368         out.print("\t");
369 
370         int errPos = input.getReadColumn();
371 
372         for (int i = 0; i < errPos && i < line.length(); i++)
373             if (line.charAt(i) == '\t')
374                 out.print("\t");
375             else
376                 out.print(" ");
377 
378         out.println("^");
379     }
380 
381 }
382
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags